diff --git "a/old_checkpoint-10370/trainer_state.json" "b/old_checkpoint-10370/trainer_state.json" new file mode 100644--- /dev/null +++ "b/old_checkpoint-10370/trainer_state.json" @@ -0,0 +1,72624 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10370, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.643201542912247e-05, + "grad_norm": 894.2868041992188, + "learning_rate": 5e-05, + "loss": 16.2999, + "step": 1 + }, + { + "epoch": 0.00019286403085824494, + "grad_norm": 127.65240478515625, + "learning_rate": 4.9999998852765385e-05, + "loss": 14.3195, + "step": 2 + }, + { + "epoch": 0.0002892960462873674, + "grad_norm": 38.59352111816406, + "learning_rate": 4.999999541106165e-05, + "loss": 12.8998, + "step": 3 + }, + { + "epoch": 0.0003857280617164899, + "grad_norm": 90.90760040283203, + "learning_rate": 4.9999989674889105e-05, + "loss": 14.5999, + "step": 4 + }, + { + "epoch": 0.00048216007714561236, + "grad_norm": 88.2799301147461, + "learning_rate": 4.999998164424827e-05, + "loss": 12.8077, + "step": 5 + }, + { + "epoch": 0.0005785920925747348, + "grad_norm": 42.58583068847656, + "learning_rate": 4.999997131913989e-05, + "loss": 12.1422, + "step": 6 + }, + { + "epoch": 0.0006750241080038572, + "grad_norm": 8.961556434631348, + "learning_rate": 4.999995869956492e-05, + "loss": 11.961, + "step": 7 + }, + { + "epoch": 0.0007714561234329798, + "grad_norm": 4.052907466888428, + "learning_rate": 4.9999943785524504e-05, + "loss": 11.8978, + "step": 8 + }, + { + "epoch": 0.0008678881388621022, + "grad_norm": 5.132551670074463, + "learning_rate": 4.999992657702002e-05, + "loss": 11.9047, + "step": 9 + }, + { + "epoch": 0.0009643201542912247, + "grad_norm": 5.546567440032959, + "learning_rate": 4.999990707405304e-05, + "loss": 11.7951, + "step": 10 + }, + { + "epoch": 0.0010607521697203472, + "grad_norm": 11.844557762145996, + "learning_rate": 4.999988527662537e-05, + "loss": 11.7688, + "step": 11 + }, + { + "epoch": 0.0011571841851494697, + "grad_norm": 23.252017974853516, + "learning_rate": 4.999986118473899e-05, + "loss": 11.7443, + "step": 12 + }, + { + "epoch": 0.001253616200578592, + "grad_norm": 9.230341911315918, + "learning_rate": 4.999983479839612e-05, + "loss": 11.6252, + "step": 13 + }, + { + "epoch": 0.0013500482160077145, + "grad_norm": 7.996587753295898, + "learning_rate": 4.999980611759919e-05, + "loss": 11.4877, + "step": 14 + }, + { + "epoch": 0.0014464802314368371, + "grad_norm": 4.676703929901123, + "learning_rate": 4.999977514235081e-05, + "loss": 11.4671, + "step": 15 + }, + { + "epoch": 0.0015429122468659595, + "grad_norm": 8.9671630859375, + "learning_rate": 4.999974187265385e-05, + "loss": 11.3048, + "step": 16 + }, + { + "epoch": 0.001639344262295082, + "grad_norm": 5.942743301391602, + "learning_rate": 4.999970630851135e-05, + "loss": 11.1501, + "step": 17 + }, + { + "epoch": 0.0017357762777242044, + "grad_norm": 14.976433753967285, + "learning_rate": 4.9999668449926576e-05, + "loss": 11.158, + "step": 18 + }, + { + "epoch": 0.0018322082931533268, + "grad_norm": 13.087100982666016, + "learning_rate": 4.999962829690299e-05, + "loss": 11.0841, + "step": 19 + }, + { + "epoch": 0.0019286403085824494, + "grad_norm": 4.704329013824463, + "learning_rate": 4.9999585849444295e-05, + "loss": 10.8378, + "step": 20 + }, + { + "epoch": 0.0020250723240115716, + "grad_norm": 5.675564765930176, + "learning_rate": 4.999954110755438e-05, + "loss": 10.7059, + "step": 21 + }, + { + "epoch": 0.0021215043394406945, + "grad_norm": 9.32601261138916, + "learning_rate": 4.999949407123735e-05, + "loss": 10.5637, + "step": 22 + }, + { + "epoch": 0.002217936354869817, + "grad_norm": 3.070762872695923, + "learning_rate": 4.999944474049752e-05, + "loss": 10.4711, + "step": 23 + }, + { + "epoch": 0.0023143683702989393, + "grad_norm": 7.12990140914917, + "learning_rate": 4.9999393115339435e-05, + "loss": 10.3326, + "step": 24 + }, + { + "epoch": 0.0024108003857280617, + "grad_norm": 3.46315336227417, + "learning_rate": 4.999933919576781e-05, + "loss": 10.2054, + "step": 25 + }, + { + "epoch": 0.002507232401157184, + "grad_norm": 8.220792770385742, + "learning_rate": 4.99992829817876e-05, + "loss": 10.1002, + "step": 26 + }, + { + "epoch": 0.0026036644165863066, + "grad_norm": 3.073145627975464, + "learning_rate": 4.999922447340397e-05, + "loss": 10.0363, + "step": 27 + }, + { + "epoch": 0.002700096432015429, + "grad_norm": 5.055649280548096, + "learning_rate": 4.999916367062229e-05, + "loss": 10.0294, + "step": 28 + }, + { + "epoch": 0.0027965284474445514, + "grad_norm": 5.133037567138672, + "learning_rate": 4.999910057344813e-05, + "loss": 9.9086, + "step": 29 + }, + { + "epoch": 0.0028929604628736743, + "grad_norm": 3.147897481918335, + "learning_rate": 4.99990351818873e-05, + "loss": 9.8498, + "step": 30 + }, + { + "epoch": 0.0029893924783027967, + "grad_norm": 0.9895338416099548, + "learning_rate": 4.999896749594578e-05, + "loss": 9.8446, + "step": 31 + }, + { + "epoch": 0.003085824493731919, + "grad_norm": 2.051626682281494, + "learning_rate": 4.9998897515629794e-05, + "loss": 9.7151, + "step": 32 + }, + { + "epoch": 0.0031822565091610415, + "grad_norm": 1.5337445735931396, + "learning_rate": 4.999882524094576e-05, + "loss": 9.7499, + "step": 33 + }, + { + "epoch": 0.003278688524590164, + "grad_norm": 1.575860857963562, + "learning_rate": 4.9998750671900316e-05, + "loss": 9.8312, + "step": 34 + }, + { + "epoch": 0.0033751205400192863, + "grad_norm": 1.4011952877044678, + "learning_rate": 4.9998673808500306e-05, + "loss": 9.8298, + "step": 35 + }, + { + "epoch": 0.0034715525554484088, + "grad_norm": 1.4939823150634766, + "learning_rate": 4.9998594650752784e-05, + "loss": 9.6559, + "step": 36 + }, + { + "epoch": 0.003567984570877531, + "grad_norm": 1.5983542203903198, + "learning_rate": 4.9998513198665006e-05, + "loss": 9.7958, + "step": 37 + }, + { + "epoch": 0.0036644165863066536, + "grad_norm": 1.1402848958969116, + "learning_rate": 4.999842945224447e-05, + "loss": 9.8077, + "step": 38 + }, + { + "epoch": 0.0037608486017357765, + "grad_norm": 2.045302152633667, + "learning_rate": 4.999834341149883e-05, + "loss": 9.7714, + "step": 39 + }, + { + "epoch": 0.003857280617164899, + "grad_norm": 1.8450353145599365, + "learning_rate": 4.9998255076436005e-05, + "loss": 9.8312, + "step": 40 + }, + { + "epoch": 0.003953712632594021, + "grad_norm": 1.4970803260803223, + "learning_rate": 4.99981644470641e-05, + "loss": 9.7581, + "step": 41 + }, + { + "epoch": 0.004050144648023143, + "grad_norm": 1.2604284286499023, + "learning_rate": 4.999807152339142e-05, + "loss": 9.6601, + "step": 42 + }, + { + "epoch": 0.004146576663452266, + "grad_norm": 1.8617265224456787, + "learning_rate": 4.9997976305426514e-05, + "loss": 9.7966, + "step": 43 + }, + { + "epoch": 0.004243008678881389, + "grad_norm": 1.0629761219024658, + "learning_rate": 4.999787879317811e-05, + "loss": 9.7612, + "step": 44 + }, + { + "epoch": 0.004339440694310511, + "grad_norm": 1.697013020515442, + "learning_rate": 4.999777898665515e-05, + "loss": 9.8474, + "step": 45 + }, + { + "epoch": 0.004435872709739634, + "grad_norm": 2.3008434772491455, + "learning_rate": 4.9997676885866804e-05, + "loss": 9.7878, + "step": 46 + }, + { + "epoch": 0.004532304725168756, + "grad_norm": 3.1291637420654297, + "learning_rate": 4.9997572490822444e-05, + "loss": 9.7692, + "step": 47 + }, + { + "epoch": 0.004628736740597879, + "grad_norm": 0.933137834072113, + "learning_rate": 4.999746580153165e-05, + "loss": 9.8038, + "step": 48 + }, + { + "epoch": 0.004725168756027001, + "grad_norm": 0.8688145875930786, + "learning_rate": 4.99973568180042e-05, + "loss": 9.7965, + "step": 49 + }, + { + "epoch": 0.0048216007714561235, + "grad_norm": 0.8407056331634521, + "learning_rate": 4.999724554025013e-05, + "loss": 9.7792, + "step": 50 + }, + { + "epoch": 0.004918032786885246, + "grad_norm": 1.4264503717422485, + "learning_rate": 4.999713196827961e-05, + "loss": 9.7759, + "step": 51 + }, + { + "epoch": 0.005014464802314368, + "grad_norm": 0.997123658657074, + "learning_rate": 4.9997016102103086e-05, + "loss": 9.7592, + "step": 52 + }, + { + "epoch": 0.005110896817743491, + "grad_norm": 1.007464051246643, + "learning_rate": 4.9996897941731204e-05, + "loss": 9.747, + "step": 53 + }, + { + "epoch": 0.005207328833172613, + "grad_norm": 1.2212399244308472, + "learning_rate": 4.999677748717479e-05, + "loss": 9.7479, + "step": 54 + }, + { + "epoch": 0.005303760848601736, + "grad_norm": 0.744339108467102, + "learning_rate": 4.99966547384449e-05, + "loss": 9.7707, + "step": 55 + }, + { + "epoch": 0.005400192864030858, + "grad_norm": 0.9119800329208374, + "learning_rate": 4.99965296955528e-05, + "loss": 9.7452, + "step": 56 + }, + { + "epoch": 0.005496624879459981, + "grad_norm": 1.5430867671966553, + "learning_rate": 4.9996402358509975e-05, + "loss": 9.6778, + "step": 57 + }, + { + "epoch": 0.005593056894889103, + "grad_norm": 0.7969384789466858, + "learning_rate": 4.999627272732811e-05, + "loss": 9.6843, + "step": 58 + }, + { + "epoch": 0.005689488910318226, + "grad_norm": 0.7266324162483215, + "learning_rate": 4.99961408020191e-05, + "loss": 9.7555, + "step": 59 + }, + { + "epoch": 0.0057859209257473485, + "grad_norm": 0.911257803440094, + "learning_rate": 4.9996006582595045e-05, + "loss": 9.7141, + "step": 60 + }, + { + "epoch": 0.0058823529411764705, + "grad_norm": 0.7173992991447449, + "learning_rate": 4.999587006906828e-05, + "loss": 9.7434, + "step": 61 + }, + { + "epoch": 0.005978784956605593, + "grad_norm": 0.7688730359077454, + "learning_rate": 4.999573126145132e-05, + "loss": 9.7149, + "step": 62 + }, + { + "epoch": 0.006075216972034715, + "grad_norm": 0.7378392815589905, + "learning_rate": 4.999559015975691e-05, + "loss": 9.6985, + "step": 63 + }, + { + "epoch": 0.006171648987463838, + "grad_norm": 0.6439594030380249, + "learning_rate": 4.9995446763998e-05, + "loss": 9.7224, + "step": 64 + }, + { + "epoch": 0.00626808100289296, + "grad_norm": 0.7672226428985596, + "learning_rate": 4.999530107418775e-05, + "loss": 9.6868, + "step": 65 + }, + { + "epoch": 0.006364513018322083, + "grad_norm": 0.8840787410736084, + "learning_rate": 4.999515309033953e-05, + "loss": 9.671, + "step": 66 + }, + { + "epoch": 0.006460945033751205, + "grad_norm": 1.5552018880844116, + "learning_rate": 4.9995002812466926e-05, + "loss": 9.6421, + "step": 67 + }, + { + "epoch": 0.006557377049180328, + "grad_norm": 1.7690881490707397, + "learning_rate": 4.999485024058373e-05, + "loss": 9.614, + "step": 68 + }, + { + "epoch": 0.006653809064609451, + "grad_norm": 3.4752726554870605, + "learning_rate": 4.9994695374703934e-05, + "loss": 9.595, + "step": 69 + }, + { + "epoch": 0.006750241080038573, + "grad_norm": 2.496119737625122, + "learning_rate": 4.999453821484177e-05, + "loss": 9.6996, + "step": 70 + }, + { + "epoch": 0.0068466730954676955, + "grad_norm": 2.8837668895721436, + "learning_rate": 4.9994378761011646e-05, + "loss": 9.6295, + "step": 71 + }, + { + "epoch": 0.0069431051108968175, + "grad_norm": 3.217867612838745, + "learning_rate": 4.9994217013228205e-05, + "loss": 9.6311, + "step": 72 + }, + { + "epoch": 0.00703953712632594, + "grad_norm": 1.7213551998138428, + "learning_rate": 4.999405297150629e-05, + "loss": 9.6631, + "step": 73 + }, + { + "epoch": 0.007135969141755062, + "grad_norm": 10.376355171203613, + "learning_rate": 4.9993886635860956e-05, + "loss": 9.8155, + "step": 74 + }, + { + "epoch": 0.007232401157184185, + "grad_norm": 7.931129455566406, + "learning_rate": 4.9993718006307466e-05, + "loss": 9.7128, + "step": 75 + }, + { + "epoch": 0.007328833172613307, + "grad_norm": 3.787161111831665, + "learning_rate": 4.99935470828613e-05, + "loss": 9.6943, + "step": 76 + }, + { + "epoch": 0.00742526518804243, + "grad_norm": 1.8758903741836548, + "learning_rate": 4.999337386553814e-05, + "loss": 9.6981, + "step": 77 + }, + { + "epoch": 0.007521697203471553, + "grad_norm": 3.2817249298095703, + "learning_rate": 4.99931983543539e-05, + "loss": 9.6579, + "step": 78 + }, + { + "epoch": 0.007618129218900675, + "grad_norm": 1.072784185409546, + "learning_rate": 4.9993020549324667e-05, + "loss": 9.6686, + "step": 79 + }, + { + "epoch": 0.007714561234329798, + "grad_norm": 10.332382202148438, + "learning_rate": 4.9992840450466775e-05, + "loss": 9.6596, + "step": 80 + }, + { + "epoch": 0.00781099324975892, + "grad_norm": 11.69206428527832, + "learning_rate": 4.999265805779675e-05, + "loss": 9.6592, + "step": 81 + }, + { + "epoch": 0.007907425265188043, + "grad_norm": 6.7987380027771, + "learning_rate": 4.999247337133132e-05, + "loss": 9.5992, + "step": 82 + }, + { + "epoch": 0.008003857280617165, + "grad_norm": 1.3900320529937744, + "learning_rate": 4.999228639108745e-05, + "loss": 9.662, + "step": 83 + }, + { + "epoch": 0.008100289296046287, + "grad_norm": 3.1216063499450684, + "learning_rate": 4.999209711708229e-05, + "loss": 9.5992, + "step": 84 + }, + { + "epoch": 0.00819672131147541, + "grad_norm": 2.7701573371887207, + "learning_rate": 4.999190554933323e-05, + "loss": 9.516, + "step": 85 + }, + { + "epoch": 0.008293153326904532, + "grad_norm": 1.3859941959381104, + "learning_rate": 4.9991711687857826e-05, + "loss": 9.5525, + "step": 86 + }, + { + "epoch": 0.008389585342333654, + "grad_norm": 2.5584001541137695, + "learning_rate": 4.999151553267389e-05, + "loss": 9.6262, + "step": 87 + }, + { + "epoch": 0.008486017357762778, + "grad_norm": 1.0593478679656982, + "learning_rate": 4.999131708379942e-05, + "loss": 9.5445, + "step": 88 + }, + { + "epoch": 0.0085824493731919, + "grad_norm": 1.6268181800842285, + "learning_rate": 4.999111634125262e-05, + "loss": 9.5234, + "step": 89 + }, + { + "epoch": 0.008678881388621022, + "grad_norm": 1.639072060585022, + "learning_rate": 4.999091330505192e-05, + "loss": 9.574, + "step": 90 + }, + { + "epoch": 0.008775313404050144, + "grad_norm": 1.171980619430542, + "learning_rate": 4.999070797521597e-05, + "loss": 9.5357, + "step": 91 + }, + { + "epoch": 0.008871745419479268, + "grad_norm": 2.4525468349456787, + "learning_rate": 4.999050035176359e-05, + "loss": 9.6213, + "step": 92 + }, + { + "epoch": 0.00896817743490839, + "grad_norm": 2.7690675258636475, + "learning_rate": 4.999029043471384e-05, + "loss": 9.4818, + "step": 93 + }, + { + "epoch": 0.009064609450337512, + "grad_norm": 3.0441715717315674, + "learning_rate": 4.9990078224086e-05, + "loss": 9.6024, + "step": 94 + }, + { + "epoch": 0.009161041465766635, + "grad_norm": 2.1662466526031494, + "learning_rate": 4.998986371989954e-05, + "loss": 9.4422, + "step": 95 + }, + { + "epoch": 0.009257473481195757, + "grad_norm": 2.756598949432373, + "learning_rate": 4.998964692217414e-05, + "loss": 9.5789, + "step": 96 + }, + { + "epoch": 0.00935390549662488, + "grad_norm": 2.225175380706787, + "learning_rate": 4.9989427830929706e-05, + "loss": 9.4556, + "step": 97 + }, + { + "epoch": 0.009450337512054001, + "grad_norm": 4.038797378540039, + "learning_rate": 4.9989206446186344e-05, + "loss": 9.5116, + "step": 98 + }, + { + "epoch": 0.009546769527483125, + "grad_norm": 3.8894338607788086, + "learning_rate": 4.998898276796437e-05, + "loss": 9.504, + "step": 99 + }, + { + "epoch": 0.009643201542912247, + "grad_norm": 1.622916340827942, + "learning_rate": 4.9988756796284314e-05, + "loss": 9.5015, + "step": 100 + }, + { + "epoch": 0.009739633558341369, + "grad_norm": 2.651989459991455, + "learning_rate": 4.9988528531166915e-05, + "loss": 9.4669, + "step": 101 + }, + { + "epoch": 0.009836065573770493, + "grad_norm": 4.06356954574585, + "learning_rate": 4.998829797263312e-05, + "loss": 9.5367, + "step": 102 + }, + { + "epoch": 0.009932497589199615, + "grad_norm": 2.1783876419067383, + "learning_rate": 4.99880651207041e-05, + "loss": 9.6399, + "step": 103 + }, + { + "epoch": 0.010028929604628737, + "grad_norm": 2.9223508834838867, + "learning_rate": 4.998782997540121e-05, + "loss": 9.6362, + "step": 104 + }, + { + "epoch": 0.010125361620057859, + "grad_norm": 3.57692813873291, + "learning_rate": 4.998759253674604e-05, + "loss": 9.6942, + "step": 105 + }, + { + "epoch": 0.010221793635486982, + "grad_norm": 3.2532835006713867, + "learning_rate": 4.998735280476039e-05, + "loss": 9.6784, + "step": 106 + }, + { + "epoch": 0.010318225650916104, + "grad_norm": 1.4151445627212524, + "learning_rate": 4.998711077946625e-05, + "loss": 9.6431, + "step": 107 + }, + { + "epoch": 0.010414657666345226, + "grad_norm": 6.995168209075928, + "learning_rate": 4.998686646088584e-05, + "loss": 9.3371, + "step": 108 + }, + { + "epoch": 0.010511089681774348, + "grad_norm": 4.151420593261719, + "learning_rate": 4.998661984904157e-05, + "loss": 9.4527, + "step": 109 + }, + { + "epoch": 0.010607521697203472, + "grad_norm": 4.930407524108887, + "learning_rate": 4.998637094395609e-05, + "loss": 9.5545, + "step": 110 + }, + { + "epoch": 0.010703953712632594, + "grad_norm": 4.9242730140686035, + "learning_rate": 4.9986119745652236e-05, + "loss": 9.3038, + "step": 111 + }, + { + "epoch": 0.010800385728061716, + "grad_norm": 1.8655768632888794, + "learning_rate": 4.998586625415307e-05, + "loss": 9.4745, + "step": 112 + }, + { + "epoch": 0.01089681774349084, + "grad_norm": 13.999795913696289, + "learning_rate": 4.998561046948185e-05, + "loss": 9.5215, + "step": 113 + }, + { + "epoch": 0.010993249758919962, + "grad_norm": 5.649501800537109, + "learning_rate": 4.998535239166206e-05, + "loss": 9.4432, + "step": 114 + }, + { + "epoch": 0.011089681774349084, + "grad_norm": 3.4978156089782715, + "learning_rate": 4.9985092020717364e-05, + "loss": 9.5933, + "step": 115 + }, + { + "epoch": 0.011186113789778206, + "grad_norm": 2.9217262268066406, + "learning_rate": 4.998482935667168e-05, + "loss": 9.5013, + "step": 116 + }, + { + "epoch": 0.01128254580520733, + "grad_norm": 4.939250946044922, + "learning_rate": 4.998456439954911e-05, + "loss": 9.4874, + "step": 117 + }, + { + "epoch": 0.011378977820636451, + "grad_norm": 8.035819053649902, + "learning_rate": 4.998429714937397e-05, + "loss": 9.5839, + "step": 118 + }, + { + "epoch": 0.011475409836065573, + "grad_norm": 6.353049278259277, + "learning_rate": 4.998402760617079e-05, + "loss": 9.6344, + "step": 119 + }, + { + "epoch": 0.011571841851494697, + "grad_norm": 4.468026638031006, + "learning_rate": 4.998375576996431e-05, + "loss": 9.5714, + "step": 120 + }, + { + "epoch": 0.011668273866923819, + "grad_norm": 5.576091766357422, + "learning_rate": 4.998348164077947e-05, + "loss": 9.5568, + "step": 121 + }, + { + "epoch": 0.011764705882352941, + "grad_norm": 4.194013595581055, + "learning_rate": 4.9983205218641436e-05, + "loss": 9.6163, + "step": 122 + }, + { + "epoch": 0.011861137897782063, + "grad_norm": 7.051516532897949, + "learning_rate": 4.998292650357558e-05, + "loss": 9.5002, + "step": 123 + }, + { + "epoch": 0.011957569913211187, + "grad_norm": 9.800139427185059, + "learning_rate": 4.998264549560747e-05, + "loss": 9.5733, + "step": 124 + }, + { + "epoch": 0.012054001928640309, + "grad_norm": 5.142115592956543, + "learning_rate": 4.998236219476291e-05, + "loss": 9.4704, + "step": 125 + }, + { + "epoch": 0.01215043394406943, + "grad_norm": 8.690695762634277, + "learning_rate": 4.998207660106789e-05, + "loss": 9.3609, + "step": 126 + }, + { + "epoch": 0.012246865959498553, + "grad_norm": 9.813724517822266, + "learning_rate": 4.9981788714548636e-05, + "loss": 9.4253, + "step": 127 + }, + { + "epoch": 0.012343297974927676, + "grad_norm": 10.550207138061523, + "learning_rate": 4.9981498535231565e-05, + "loss": 9.4227, + "step": 128 + }, + { + "epoch": 0.012439729990356798, + "grad_norm": 3.8106424808502197, + "learning_rate": 4.9981206063143296e-05, + "loss": 9.4544, + "step": 129 + }, + { + "epoch": 0.01253616200578592, + "grad_norm": 6.568062782287598, + "learning_rate": 4.9980911298310684e-05, + "loss": 9.4677, + "step": 130 + }, + { + "epoch": 0.012632594021215044, + "grad_norm": 7.460339069366455, + "learning_rate": 4.998061424076078e-05, + "loss": 9.487, + "step": 131 + }, + { + "epoch": 0.012729026036644166, + "grad_norm": 1.5145052671432495, + "learning_rate": 4.9980314890520843e-05, + "loss": 9.3732, + "step": 132 + }, + { + "epoch": 0.012825458052073288, + "grad_norm": 9.077214241027832, + "learning_rate": 4.998001324761836e-05, + "loss": 9.3743, + "step": 133 + }, + { + "epoch": 0.01292189006750241, + "grad_norm": 11.845178604125977, + "learning_rate": 4.9979709312081005e-05, + "loss": 9.2997, + "step": 134 + }, + { + "epoch": 0.013018322082931534, + "grad_norm": 7.872161865234375, + "learning_rate": 4.997940308393667e-05, + "loss": 9.3801, + "step": 135 + }, + { + "epoch": 0.013114754098360656, + "grad_norm": 4.386551380157471, + "learning_rate": 4.9979094563213465e-05, + "loss": 9.2207, + "step": 136 + }, + { + "epoch": 0.013211186113789778, + "grad_norm": 7.700911998748779, + "learning_rate": 4.9978783749939705e-05, + "loss": 9.2799, + "step": 137 + }, + { + "epoch": 0.013307618129218901, + "grad_norm": 4.671859264373779, + "learning_rate": 4.997847064414392e-05, + "loss": 9.2714, + "step": 138 + }, + { + "epoch": 0.013404050144648023, + "grad_norm": 4.7111287117004395, + "learning_rate": 4.997815524585484e-05, + "loss": 9.334, + "step": 139 + }, + { + "epoch": 0.013500482160077145, + "grad_norm": 5.037441253662109, + "learning_rate": 4.997783755510141e-05, + "loss": 9.4061, + "step": 140 + }, + { + "epoch": 0.013596914175506267, + "grad_norm": 4.4647674560546875, + "learning_rate": 4.9977517571912804e-05, + "loss": 9.2234, + "step": 141 + }, + { + "epoch": 0.013693346190935391, + "grad_norm": 7.284264087677002, + "learning_rate": 4.997719529631837e-05, + "loss": 9.2362, + "step": 142 + }, + { + "epoch": 0.013789778206364513, + "grad_norm": 2.2538692951202393, + "learning_rate": 4.997687072834769e-05, + "loss": 9.2032, + "step": 143 + }, + { + "epoch": 0.013886210221793635, + "grad_norm": 9.086790084838867, + "learning_rate": 4.9976543868030565e-05, + "loss": 9.2593, + "step": 144 + }, + { + "epoch": 0.013982642237222759, + "grad_norm": 7.1081671714782715, + "learning_rate": 4.9976214715396976e-05, + "loss": 9.3315, + "step": 145 + }, + { + "epoch": 0.01407907425265188, + "grad_norm": 2.4200658798217773, + "learning_rate": 4.997588327047714e-05, + "loss": 9.3167, + "step": 146 + }, + { + "epoch": 0.014175506268081003, + "grad_norm": 7.789939880371094, + "learning_rate": 4.997554953330149e-05, + "loss": 9.2257, + "step": 147 + }, + { + "epoch": 0.014271938283510125, + "grad_norm": 3.6552555561065674, + "learning_rate": 4.997521350390063e-05, + "loss": 9.2504, + "step": 148 + }, + { + "epoch": 0.014368370298939248, + "grad_norm": 11.350252151489258, + "learning_rate": 4.9974875182305424e-05, + "loss": 9.4313, + "step": 149 + }, + { + "epoch": 0.01446480231436837, + "grad_norm": 14.394383430480957, + "learning_rate": 4.997453456854691e-05, + "loss": 9.1136, + "step": 150 + }, + { + "epoch": 0.014561234329797492, + "grad_norm": 11.436792373657227, + "learning_rate": 4.9974191662656354e-05, + "loss": 9.3382, + "step": 151 + }, + { + "epoch": 0.014657666345226614, + "grad_norm": 2.372204542160034, + "learning_rate": 4.997384646466522e-05, + "loss": 9.2962, + "step": 152 + }, + { + "epoch": 0.014754098360655738, + "grad_norm": 10.472686767578125, + "learning_rate": 4.99734989746052e-05, + "loss": 9.0088, + "step": 153 + }, + { + "epoch": 0.01485053037608486, + "grad_norm": 10.252636909484863, + "learning_rate": 4.9973149192508185e-05, + "loss": 9.5069, + "step": 154 + }, + { + "epoch": 0.014946962391513982, + "grad_norm": 3.860837936401367, + "learning_rate": 4.997279711840627e-05, + "loss": 9.4305, + "step": 155 + }, + { + "epoch": 0.015043394406943106, + "grad_norm": 8.577580451965332, + "learning_rate": 4.997244275233176e-05, + "loss": 9.3974, + "step": 156 + }, + { + "epoch": 0.015139826422372228, + "grad_norm": 12.571183204650879, + "learning_rate": 4.9972086094317204e-05, + "loss": 9.2186, + "step": 157 + }, + { + "epoch": 0.01523625843780135, + "grad_norm": 10.443209648132324, + "learning_rate": 4.997172714439532e-05, + "loss": 9.2229, + "step": 158 + }, + { + "epoch": 0.015332690453230472, + "grad_norm": 8.219619750976562, + "learning_rate": 4.997136590259905e-05, + "loss": 9.1583, + "step": 159 + }, + { + "epoch": 0.015429122468659595, + "grad_norm": 8.120513916015625, + "learning_rate": 4.997100236896156e-05, + "loss": 9.1694, + "step": 160 + }, + { + "epoch": 0.015525554484088717, + "grad_norm": 13.197624206542969, + "learning_rate": 4.9970636543516205e-05, + "loss": 9.0474, + "step": 161 + }, + { + "epoch": 0.01562198649951784, + "grad_norm": 26.85427474975586, + "learning_rate": 4.997026842629655e-05, + "loss": 8.9654, + "step": 162 + }, + { + "epoch": 0.01571841851494696, + "grad_norm": 23.274513244628906, + "learning_rate": 4.9969898017336405e-05, + "loss": 9.0459, + "step": 163 + }, + { + "epoch": 0.015814850530376085, + "grad_norm": 15.516063690185547, + "learning_rate": 4.996952531666975e-05, + "loss": 9.0024, + "step": 164 + }, + { + "epoch": 0.01591128254580521, + "grad_norm": 3.9129691123962402, + "learning_rate": 4.996915032433079e-05, + "loss": 9.0051, + "step": 165 + }, + { + "epoch": 0.01600771456123433, + "grad_norm": 3.2002618312835693, + "learning_rate": 4.996877304035395e-05, + "loss": 9.2411, + "step": 166 + }, + { + "epoch": 0.016104146576663453, + "grad_norm": 6.591782569885254, + "learning_rate": 4.996839346477386e-05, + "loss": 9.3997, + "step": 167 + }, + { + "epoch": 0.016200578592092573, + "grad_norm": 6.914554119110107, + "learning_rate": 4.996801159762533e-05, + "loss": 9.2592, + "step": 168 + }, + { + "epoch": 0.016297010607521697, + "grad_norm": 2.9077396392822266, + "learning_rate": 4.996762743894343e-05, + "loss": 9.3214, + "step": 169 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 4.730425834655762, + "learning_rate": 4.996724098876343e-05, + "loss": 9.1403, + "step": 170 + }, + { + "epoch": 0.01648987463837994, + "grad_norm": 6.659647464752197, + "learning_rate": 4.9966852247120764e-05, + "loss": 9.0486, + "step": 171 + }, + { + "epoch": 0.016586306653809064, + "grad_norm": 8.872209548950195, + "learning_rate": 4.996646121405114e-05, + "loss": 9.2309, + "step": 172 + }, + { + "epoch": 0.016682738669238188, + "grad_norm": 10.014256477355957, + "learning_rate": 4.996606788959043e-05, + "loss": 9.1189, + "step": 173 + }, + { + "epoch": 0.01677917068466731, + "grad_norm": 5.862621307373047, + "learning_rate": 4.9965672273774735e-05, + "loss": 9.2754, + "step": 174 + }, + { + "epoch": 0.016875602700096432, + "grad_norm": 5.396203517913818, + "learning_rate": 4.996527436664036e-05, + "loss": 9.4084, + "step": 175 + }, + { + "epoch": 0.016972034715525556, + "grad_norm": 4.160368919372559, + "learning_rate": 4.996487416822384e-05, + "loss": 9.3592, + "step": 176 + }, + { + "epoch": 0.017068466730954676, + "grad_norm": 6.849759101867676, + "learning_rate": 4.9964471678561895e-05, + "loss": 9.2475, + "step": 177 + }, + { + "epoch": 0.0171648987463838, + "grad_norm": 4.095441818237305, + "learning_rate": 4.9964066897691466e-05, + "loss": 9.2216, + "step": 178 + }, + { + "epoch": 0.017261330761812924, + "grad_norm": 6.5304059982299805, + "learning_rate": 4.99636598256497e-05, + "loss": 9.132, + "step": 179 + }, + { + "epoch": 0.017357762777242044, + "grad_norm": 5.4949951171875, + "learning_rate": 4.9963250462473965e-05, + "loss": 9.1716, + "step": 180 + }, + { + "epoch": 0.017454194792671168, + "grad_norm": 5.412527084350586, + "learning_rate": 4.9962838808201826e-05, + "loss": 9.1445, + "step": 181 + }, + { + "epoch": 0.017550626808100288, + "grad_norm": 5.967573165893555, + "learning_rate": 4.996242486287106e-05, + "loss": 9.1614, + "step": 182 + }, + { + "epoch": 0.01764705882352941, + "grad_norm": 12.069108009338379, + "learning_rate": 4.996200862651967e-05, + "loss": 8.9802, + "step": 183 + }, + { + "epoch": 0.017743490838958535, + "grad_norm": 11.306235313415527, + "learning_rate": 4.996159009918585e-05, + "loss": 9.0257, + "step": 184 + }, + { + "epoch": 0.017839922854387655, + "grad_norm": 12.983308792114258, + "learning_rate": 4.9961169280908015e-05, + "loss": 8.9716, + "step": 185 + }, + { + "epoch": 0.01793635486981678, + "grad_norm": 11.158196449279785, + "learning_rate": 4.996074617172478e-05, + "loss": 8.9546, + "step": 186 + }, + { + "epoch": 0.018032786885245903, + "grad_norm": 12.860184669494629, + "learning_rate": 4.996032077167499e-05, + "loss": 8.9343, + "step": 187 + }, + { + "epoch": 0.018129218900675023, + "grad_norm": 13.926271438598633, + "learning_rate": 4.995989308079768e-05, + "loss": 8.9105, + "step": 188 + }, + { + "epoch": 0.018225650916104147, + "grad_norm": 11.94168758392334, + "learning_rate": 4.9959463099132095e-05, + "loss": 8.9446, + "step": 189 + }, + { + "epoch": 0.01832208293153327, + "grad_norm": 5.498230934143066, + "learning_rate": 4.995903082671771e-05, + "loss": 8.916, + "step": 190 + }, + { + "epoch": 0.01841851494696239, + "grad_norm": 6.1581220626831055, + "learning_rate": 4.99585962635942e-05, + "loss": 8.886, + "step": 191 + }, + { + "epoch": 0.018514946962391515, + "grad_norm": 4.57788610458374, + "learning_rate": 4.9958159409801445e-05, + "loss": 8.8075, + "step": 192 + }, + { + "epoch": 0.018611378977820635, + "grad_norm": 6.465826511383057, + "learning_rate": 4.9957720265379535e-05, + "loss": 8.757, + "step": 193 + }, + { + "epoch": 0.01870781099324976, + "grad_norm": 10.428133010864258, + "learning_rate": 4.9957278830368784e-05, + "loss": 8.8086, + "step": 194 + }, + { + "epoch": 0.018804243008678882, + "grad_norm": 4.319121360778809, + "learning_rate": 4.995683510480969e-05, + "loss": 8.732, + "step": 195 + }, + { + "epoch": 0.018900675024108003, + "grad_norm": 23.592283248901367, + "learning_rate": 4.995638908874298e-05, + "loss": 8.6906, + "step": 196 + }, + { + "epoch": 0.018997107039537126, + "grad_norm": 30.249975204467773, + "learning_rate": 4.995594078220961e-05, + "loss": 8.6676, + "step": 197 + }, + { + "epoch": 0.01909353905496625, + "grad_norm": 15.37741756439209, + "learning_rate": 4.9955490185250706e-05, + "loss": 8.6084, + "step": 198 + }, + { + "epoch": 0.01918997107039537, + "grad_norm": 10.373183250427246, + "learning_rate": 4.995503729790763e-05, + "loss": 8.6367, + "step": 199 + }, + { + "epoch": 0.019286403085824494, + "grad_norm": 16.604633331298828, + "learning_rate": 4.9954582120221936e-05, + "loss": 8.5878, + "step": 200 + }, + { + "epoch": 0.019382835101253618, + "grad_norm": 3.4746763706207275, + "learning_rate": 4.995412465223542e-05, + "loss": 8.5162, + "step": 201 + }, + { + "epoch": 0.019479267116682738, + "grad_norm": 18.656450271606445, + "learning_rate": 4.995366489399005e-05, + "loss": 8.6247, + "step": 202 + }, + { + "epoch": 0.01957569913211186, + "grad_norm": 21.796419143676758, + "learning_rate": 4.9953202845528034e-05, + "loss": 8.6228, + "step": 203 + }, + { + "epoch": 0.019672131147540985, + "grad_norm": 11.909361839294434, + "learning_rate": 4.9952738506891774e-05, + "loss": 8.6478, + "step": 204 + }, + { + "epoch": 0.019768563162970106, + "grad_norm": 18.55316162109375, + "learning_rate": 4.995227187812389e-05, + "loss": 8.4626, + "step": 205 + }, + { + "epoch": 0.01986499517839923, + "grad_norm": 30.68003273010254, + "learning_rate": 4.995180295926719e-05, + "loss": 8.3621, + "step": 206 + }, + { + "epoch": 0.01996142719382835, + "grad_norm": 20.988414764404297, + "learning_rate": 4.995133175036474e-05, + "loss": 8.533, + "step": 207 + }, + { + "epoch": 0.020057859209257473, + "grad_norm": 3.8974483013153076, + "learning_rate": 4.995085825145976e-05, + "loss": 8.6086, + "step": 208 + }, + { + "epoch": 0.020154291224686597, + "grad_norm": 5.03350305557251, + "learning_rate": 4.995038246259573e-05, + "loss": 8.6082, + "step": 209 + }, + { + "epoch": 0.020250723240115717, + "grad_norm": 11.053686141967773, + "learning_rate": 4.9949904383816295e-05, + "loss": 8.5384, + "step": 210 + }, + { + "epoch": 0.02034715525554484, + "grad_norm": 16.593412399291992, + "learning_rate": 4.9949424015165344e-05, + "loss": 9.1174, + "step": 211 + }, + { + "epoch": 0.020443587270973965, + "grad_norm": 18.14801788330078, + "learning_rate": 4.9948941356686974e-05, + "loss": 9.4477, + "step": 212 + }, + { + "epoch": 0.020540019286403085, + "grad_norm": 7.712798595428467, + "learning_rate": 4.994845640842547e-05, + "loss": 9.3359, + "step": 213 + }, + { + "epoch": 0.02063645130183221, + "grad_norm": 12.913015365600586, + "learning_rate": 4.994796917042534e-05, + "loss": 9.3795, + "step": 214 + }, + { + "epoch": 0.020732883317261332, + "grad_norm": 23.182336807250977, + "learning_rate": 4.994747964273131e-05, + "loss": 9.0485, + "step": 215 + }, + { + "epoch": 0.020829315332690453, + "grad_norm": 5.426914691925049, + "learning_rate": 4.99469878253883e-05, + "loss": 9.2546, + "step": 216 + }, + { + "epoch": 0.020925747348119576, + "grad_norm": 16.920930862426758, + "learning_rate": 4.9946493718441455e-05, + "loss": 9.3407, + "step": 217 + }, + { + "epoch": 0.021022179363548697, + "grad_norm": 17.39436149597168, + "learning_rate": 4.994599732193612e-05, + "loss": 9.3737, + "step": 218 + }, + { + "epoch": 0.02111861137897782, + "grad_norm": 12.353123664855957, + "learning_rate": 4.994549863591785e-05, + "loss": 9.1345, + "step": 219 + }, + { + "epoch": 0.021215043394406944, + "grad_norm": 22.894454956054688, + "learning_rate": 4.9944997660432425e-05, + "loss": 9.3742, + "step": 220 + }, + { + "epoch": 0.021311475409836064, + "grad_norm": 11.741494178771973, + "learning_rate": 4.994449439552582e-05, + "loss": 9.4438, + "step": 221 + }, + { + "epoch": 0.021407907425265188, + "grad_norm": 8.260436058044434, + "learning_rate": 4.994398884124422e-05, + "loss": 9.3252, + "step": 222 + }, + { + "epoch": 0.02150433944069431, + "grad_norm": 17.854005813598633, + "learning_rate": 4.9943480997634015e-05, + "loss": 9.518, + "step": 223 + }, + { + "epoch": 0.021600771456123432, + "grad_norm": 19.695903778076172, + "learning_rate": 4.994297086474183e-05, + "loss": 9.6985, + "step": 224 + }, + { + "epoch": 0.021697203471552556, + "grad_norm": 22.78887176513672, + "learning_rate": 4.9942458442614475e-05, + "loss": 9.5985, + "step": 225 + }, + { + "epoch": 0.02179363548698168, + "grad_norm": 30.650970458984375, + "learning_rate": 4.994194373129899e-05, + "loss": 8.9518, + "step": 226 + }, + { + "epoch": 0.0218900675024108, + "grad_norm": 28.218109130859375, + "learning_rate": 4.99414267308426e-05, + "loss": 8.7617, + "step": 227 + }, + { + "epoch": 0.021986499517839923, + "grad_norm": 18.58802604675293, + "learning_rate": 4.9940907441292775e-05, + "loss": 8.7033, + "step": 228 + }, + { + "epoch": 0.022082931533269044, + "grad_norm": 10.300430297851562, + "learning_rate": 4.994038586269715e-05, + "loss": 8.6465, + "step": 229 + }, + { + "epoch": 0.022179363548698167, + "grad_norm": 3.016716241836548, + "learning_rate": 4.993986199510361e-05, + "loss": 8.6211, + "step": 230 + }, + { + "epoch": 0.02227579556412729, + "grad_norm": 7.911774158477783, + "learning_rate": 4.993933583856023e-05, + "loss": 8.595, + "step": 231 + }, + { + "epoch": 0.02237222757955641, + "grad_norm": 9.040964126586914, + "learning_rate": 4.993880739311532e-05, + "loss": 8.5485, + "step": 232 + }, + { + "epoch": 0.022468659594985535, + "grad_norm": 12.600432395935059, + "learning_rate": 4.993827665881734e-05, + "loss": 8.5756, + "step": 233 + }, + { + "epoch": 0.02256509161041466, + "grad_norm": 16.235652923583984, + "learning_rate": 4.993774363571503e-05, + "loss": 8.6447, + "step": 234 + }, + { + "epoch": 0.02266152362584378, + "grad_norm": 13.497354507446289, + "learning_rate": 4.99372083238573e-05, + "loss": 8.4438, + "step": 235 + }, + { + "epoch": 0.022757955641272903, + "grad_norm": 5.896644592285156, + "learning_rate": 4.993667072329329e-05, + "loss": 8.9414, + "step": 236 + }, + { + "epoch": 0.022854387656702026, + "grad_norm": 15.345155715942383, + "learning_rate": 4.9936130834072333e-05, + "loss": 9.3737, + "step": 237 + }, + { + "epoch": 0.022950819672131147, + "grad_norm": 23.988264083862305, + "learning_rate": 4.993558865624397e-05, + "loss": 9.6619, + "step": 238 + }, + { + "epoch": 0.02304725168756027, + "grad_norm": 14.451107025146484, + "learning_rate": 4.993504418985798e-05, + "loss": 9.5414, + "step": 239 + }, + { + "epoch": 0.023143683702989394, + "grad_norm": 8.56495475769043, + "learning_rate": 4.993449743496432e-05, + "loss": 9.2202, + "step": 240 + }, + { + "epoch": 0.023240115718418514, + "grad_norm": 5.20372200012207, + "learning_rate": 4.993394839161317e-05, + "loss": 9.0539, + "step": 241 + }, + { + "epoch": 0.023336547733847638, + "grad_norm": 7.948090076446533, + "learning_rate": 4.9933397059854937e-05, + "loss": 9.1092, + "step": 242 + }, + { + "epoch": 0.02343297974927676, + "grad_norm": 8.364212036132812, + "learning_rate": 4.99328434397402e-05, + "loss": 9.0809, + "step": 243 + }, + { + "epoch": 0.023529411764705882, + "grad_norm": 18.93521499633789, + "learning_rate": 4.993228753131978e-05, + "loss": 8.75, + "step": 244 + }, + { + "epoch": 0.023625843780135006, + "grad_norm": 12.883859634399414, + "learning_rate": 4.993172933464471e-05, + "loss": 9.0893, + "step": 245 + }, + { + "epoch": 0.023722275795564126, + "grad_norm": 10.570661544799805, + "learning_rate": 4.9931168849766194e-05, + "loss": 9.0618, + "step": 246 + }, + { + "epoch": 0.02381870781099325, + "grad_norm": 6.988922119140625, + "learning_rate": 4.9930606076735685e-05, + "loss": 9.0682, + "step": 247 + }, + { + "epoch": 0.023915139826422373, + "grad_norm": 10.418986320495605, + "learning_rate": 4.9930041015604844e-05, + "loss": 8.936, + "step": 248 + }, + { + "epoch": 0.024011571841851494, + "grad_norm": 6.718861103057861, + "learning_rate": 4.992947366642552e-05, + "loss": 9.0567, + "step": 249 + }, + { + "epoch": 0.024108003857280617, + "grad_norm": 6.089332103729248, + "learning_rate": 4.992890402924978e-05, + "loss": 9.1981, + "step": 250 + }, + { + "epoch": 0.02420443587270974, + "grad_norm": 14.033772468566895, + "learning_rate": 4.9928332104129916e-05, + "loss": 8.7899, + "step": 251 + }, + { + "epoch": 0.02430086788813886, + "grad_norm": 3.5815277099609375, + "learning_rate": 4.992775789111841e-05, + "loss": 9.0415, + "step": 252 + }, + { + "epoch": 0.024397299903567985, + "grad_norm": 6.328668594360352, + "learning_rate": 4.992718139026797e-05, + "loss": 9.1662, + "step": 253 + }, + { + "epoch": 0.024493731918997105, + "grad_norm": 11.775870323181152, + "learning_rate": 4.99266026016315e-05, + "loss": 8.6285, + "step": 254 + }, + { + "epoch": 0.02459016393442623, + "grad_norm": 11.865426063537598, + "learning_rate": 4.9926021525262126e-05, + "loss": 8.5628, + "step": 255 + }, + { + "epoch": 0.024686595949855353, + "grad_norm": 9.300846099853516, + "learning_rate": 4.9925438161213164e-05, + "loss": 8.5084, + "step": 256 + }, + { + "epoch": 0.024783027965284473, + "grad_norm": 6.031538963317871, + "learning_rate": 4.992485250953818e-05, + "loss": 8.4882, + "step": 257 + }, + { + "epoch": 0.024879459980713597, + "grad_norm": 16.60169219970703, + "learning_rate": 4.99242645702909e-05, + "loss": 8.4674, + "step": 258 + }, + { + "epoch": 0.02497589199614272, + "grad_norm": 17.81006622314453, + "learning_rate": 4.9923674343525304e-05, + "loss": 8.5366, + "step": 259 + }, + { + "epoch": 0.02507232401157184, + "grad_norm": 11.814688682556152, + "learning_rate": 4.9923081829295547e-05, + "loss": 8.5278, + "step": 260 + }, + { + "epoch": 0.025168756027000964, + "grad_norm": 14.831336975097656, + "learning_rate": 4.992248702765602e-05, + "loss": 8.5145, + "step": 261 + }, + { + "epoch": 0.025265188042430088, + "grad_norm": 6.953230857849121, + "learning_rate": 4.99218899386613e-05, + "loss": 8.8232, + "step": 262 + }, + { + "epoch": 0.02536162005785921, + "grad_norm": 19.26991081237793, + "learning_rate": 4.99212905623662e-05, + "loss": 8.5796, + "step": 263 + }, + { + "epoch": 0.025458052073288332, + "grad_norm": 17.302715301513672, + "learning_rate": 4.9920688898825724e-05, + "loss": 8.4775, + "step": 264 + }, + { + "epoch": 0.025554484088717456, + "grad_norm": 8.843317031860352, + "learning_rate": 4.992008494809509e-05, + "loss": 8.543, + "step": 265 + }, + { + "epoch": 0.025650916104146576, + "grad_norm": 6.635027885437012, + "learning_rate": 4.991947871022974e-05, + "loss": 8.3526, + "step": 266 + }, + { + "epoch": 0.0257473481195757, + "grad_norm": 8.703571319580078, + "learning_rate": 4.99188701852853e-05, + "loss": 8.3451, + "step": 267 + }, + { + "epoch": 0.02584378013500482, + "grad_norm": 8.34554386138916, + "learning_rate": 4.991825937331762e-05, + "loss": 8.3042, + "step": 268 + }, + { + "epoch": 0.025940212150433944, + "grad_norm": 6.35545539855957, + "learning_rate": 4.9917646274382775e-05, + "loss": 8.3017, + "step": 269 + }, + { + "epoch": 0.026036644165863067, + "grad_norm": 5.050647735595703, + "learning_rate": 4.991703088853701e-05, + "loss": 8.4453, + "step": 270 + }, + { + "epoch": 0.026133076181292188, + "grad_norm": 9.625547409057617, + "learning_rate": 4.9916413215836834e-05, + "loss": 8.3324, + "step": 271 + }, + { + "epoch": 0.02622950819672131, + "grad_norm": 10.472625732421875, + "learning_rate": 4.9915793256338904e-05, + "loss": 8.4918, + "step": 272 + }, + { + "epoch": 0.026325940212150435, + "grad_norm": 5.657322883605957, + "learning_rate": 4.991517101010015e-05, + "loss": 8.3709, + "step": 273 + }, + { + "epoch": 0.026422372227579555, + "grad_norm": 2.019336223602295, + "learning_rate": 4.9914546477177654e-05, + "loss": 8.173, + "step": 274 + }, + { + "epoch": 0.02651880424300868, + "grad_norm": 7.715568542480469, + "learning_rate": 4.991391965762875e-05, + "loss": 8.2618, + "step": 275 + }, + { + "epoch": 0.026615236258437803, + "grad_norm": 16.272722244262695, + "learning_rate": 4.9913290551510966e-05, + "loss": 8.1474, + "step": 276 + }, + { + "epoch": 0.026711668273866923, + "grad_norm": 12.170965194702148, + "learning_rate": 4.991265915888204e-05, + "loss": 8.1591, + "step": 277 + }, + { + "epoch": 0.026808100289296047, + "grad_norm": 28.3992862701416, + "learning_rate": 4.9912025479799917e-05, + "loss": 8.2753, + "step": 278 + }, + { + "epoch": 0.026904532304725167, + "grad_norm": 20.098548889160156, + "learning_rate": 4.991138951432276e-05, + "loss": 8.2511, + "step": 279 + }, + { + "epoch": 0.02700096432015429, + "grad_norm": 18.131864547729492, + "learning_rate": 4.991075126250892e-05, + "loss": 8.2287, + "step": 280 + }, + { + "epoch": 0.027097396335583415, + "grad_norm": 20.82680892944336, + "learning_rate": 4.991011072441701e-05, + "loss": 9.0141, + "step": 281 + }, + { + "epoch": 0.027193828351012535, + "grad_norm": 22.385047912597656, + "learning_rate": 4.9909467900105786e-05, + "loss": 9.8396, + "step": 282 + }, + { + "epoch": 0.02729026036644166, + "grad_norm": 26.391515731811523, + "learning_rate": 4.990882278963426e-05, + "loss": 9.7514, + "step": 283 + }, + { + "epoch": 0.027386692381870782, + "grad_norm": 29.28089141845703, + "learning_rate": 4.990817539306164e-05, + "loss": 9.5303, + "step": 284 + }, + { + "epoch": 0.027483124397299902, + "grad_norm": 17.823450088500977, + "learning_rate": 4.990752571044733e-05, + "loss": 8.9425, + "step": 285 + }, + { + "epoch": 0.027579556412729026, + "grad_norm": 13.826634407043457, + "learning_rate": 4.9906873741850965e-05, + "loss": 9.2626, + "step": 286 + }, + { + "epoch": 0.02767598842815815, + "grad_norm": 16.77344512939453, + "learning_rate": 4.9906219487332395e-05, + "loss": 9.6589, + "step": 287 + }, + { + "epoch": 0.02777242044358727, + "grad_norm": 17.347389221191406, + "learning_rate": 4.9905562946951645e-05, + "loss": 9.4188, + "step": 288 + }, + { + "epoch": 0.027868852459016394, + "grad_norm": 9.284642219543457, + "learning_rate": 4.990490412076899e-05, + "loss": 9.3834, + "step": 289 + }, + { + "epoch": 0.027965284474445518, + "grad_norm": 17.14708137512207, + "learning_rate": 4.990424300884488e-05, + "loss": 9.4904, + "step": 290 + }, + { + "epoch": 0.028061716489874638, + "grad_norm": 16.321523666381836, + "learning_rate": 4.9903579611240005e-05, + "loss": 9.3708, + "step": 291 + }, + { + "epoch": 0.02815814850530376, + "grad_norm": 13.560846328735352, + "learning_rate": 4.9902913928015235e-05, + "loss": 9.5175, + "step": 292 + }, + { + "epoch": 0.028254580520732882, + "grad_norm": 20.38266372680664, + "learning_rate": 4.9902245959231685e-05, + "loss": 9.4923, + "step": 293 + }, + { + "epoch": 0.028351012536162006, + "grad_norm": 17.068580627441406, + "learning_rate": 4.990157570495065e-05, + "loss": 9.5075, + "step": 294 + }, + { + "epoch": 0.02844744455159113, + "grad_norm": 9.444963455200195, + "learning_rate": 4.990090316523364e-05, + "loss": 9.3786, + "step": 295 + }, + { + "epoch": 0.02854387656702025, + "grad_norm": 23.599924087524414, + "learning_rate": 4.9900228340142394e-05, + "loss": 8.5802, + "step": 296 + }, + { + "epoch": 0.028640308582449373, + "grad_norm": 23.497085571289062, + "learning_rate": 4.989955122973883e-05, + "loss": 8.3927, + "step": 297 + }, + { + "epoch": 0.028736740597878497, + "grad_norm": 13.845474243164062, + "learning_rate": 4.9898871834085105e-05, + "loss": 8.2773, + "step": 298 + }, + { + "epoch": 0.028833172613307617, + "grad_norm": 20.137165069580078, + "learning_rate": 4.989819015324356e-05, + "loss": 8.4706, + "step": 299 + }, + { + "epoch": 0.02892960462873674, + "grad_norm": 25.318418502807617, + "learning_rate": 4.989750618727678e-05, + "loss": 8.2352, + "step": 300 + }, + { + "epoch": 0.029026036644165865, + "grad_norm": 11.734221458435059, + "learning_rate": 4.989681993624752e-05, + "loss": 8.259, + "step": 301 + }, + { + "epoch": 0.029122468659594985, + "grad_norm": 12.10399341583252, + "learning_rate": 4.989613140021877e-05, + "loss": 9.2258, + "step": 302 + }, + { + "epoch": 0.02921890067502411, + "grad_norm": 16.344738006591797, + "learning_rate": 4.989544057925372e-05, + "loss": 9.3958, + "step": 303 + }, + { + "epoch": 0.02931533269045323, + "grad_norm": 16.700851440429688, + "learning_rate": 4.9894747473415785e-05, + "loss": 9.4782, + "step": 304 + }, + { + "epoch": 0.029411764705882353, + "grad_norm": 12.228715896606445, + "learning_rate": 4.989405208276855e-05, + "loss": 9.4128, + "step": 305 + }, + { + "epoch": 0.029508196721311476, + "grad_norm": 10.761604309082031, + "learning_rate": 4.989335440737586e-05, + "loss": 9.2199, + "step": 306 + }, + { + "epoch": 0.029604628736740597, + "grad_norm": 8.977926254272461, + "learning_rate": 4.9892654447301753e-05, + "loss": 9.1741, + "step": 307 + }, + { + "epoch": 0.02970106075216972, + "grad_norm": 3.6272501945495605, + "learning_rate": 4.989195220261045e-05, + "loss": 9.2528, + "step": 308 + }, + { + "epoch": 0.029797492767598844, + "grad_norm": 7.643343925476074, + "learning_rate": 4.989124767336641e-05, + "loss": 9.0419, + "step": 309 + }, + { + "epoch": 0.029893924783027964, + "grad_norm": 5.204570293426514, + "learning_rate": 4.989054085963429e-05, + "loss": 9.1491, + "step": 310 + }, + { + "epoch": 0.029990356798457088, + "grad_norm": 6.281081676483154, + "learning_rate": 4.988983176147898e-05, + "loss": 9.0408, + "step": 311 + }, + { + "epoch": 0.03008678881388621, + "grad_norm": 9.022841453552246, + "learning_rate": 4.9889120378965534e-05, + "loss": 8.9294, + "step": 312 + }, + { + "epoch": 0.030183220829315332, + "grad_norm": 16.599653244018555, + "learning_rate": 4.988840671215925e-05, + "loss": 8.6822, + "step": 313 + }, + { + "epoch": 0.030279652844744456, + "grad_norm": 15.882601737976074, + "learning_rate": 4.988769076112564e-05, + "loss": 8.7486, + "step": 314 + }, + { + "epoch": 0.03037608486017358, + "grad_norm": 16.864871978759766, + "learning_rate": 4.9886972525930396e-05, + "loss": 8.6111, + "step": 315 + }, + { + "epoch": 0.0304725168756027, + "grad_norm": 17.258262634277344, + "learning_rate": 4.988625200663946e-05, + "loss": 8.5622, + "step": 316 + }, + { + "epoch": 0.030568948891031823, + "grad_norm": 17.277219772338867, + "learning_rate": 4.9885529203318924e-05, + "loss": 8.5078, + "step": 317 + }, + { + "epoch": 0.030665380906460944, + "grad_norm": 17.949615478515625, + "learning_rate": 4.988480411603516e-05, + "loss": 8.4533, + "step": 318 + }, + { + "epoch": 0.030761812921890067, + "grad_norm": 15.601511001586914, + "learning_rate": 4.98840767448547e-05, + "loss": 8.4514, + "step": 319 + }, + { + "epoch": 0.03085824493731919, + "grad_norm": 12.686090469360352, + "learning_rate": 4.98833470898443e-05, + "loss": 8.295, + "step": 320 + }, + { + "epoch": 0.03095467695274831, + "grad_norm": 5.200905799865723, + "learning_rate": 4.988261515107093e-05, + "loss": 8.2386, + "step": 321 + }, + { + "epoch": 0.031051108968177435, + "grad_norm": 3.4580233097076416, + "learning_rate": 4.988188092860177e-05, + "loss": 8.2085, + "step": 322 + }, + { + "epoch": 0.03114754098360656, + "grad_norm": 12.498418807983398, + "learning_rate": 4.98811444225042e-05, + "loss": 8.285, + "step": 323 + }, + { + "epoch": 0.03124397299903568, + "grad_norm": 13.671022415161133, + "learning_rate": 4.9880405632845825e-05, + "loss": 8.115, + "step": 324 + }, + { + "epoch": 0.0313404050144648, + "grad_norm": 14.613557815551758, + "learning_rate": 4.987966455969444e-05, + "loss": 8.0341, + "step": 325 + }, + { + "epoch": 0.03143683702989392, + "grad_norm": 14.077875137329102, + "learning_rate": 4.987892120311806e-05, + "loss": 7.8819, + "step": 326 + }, + { + "epoch": 0.03153326904532305, + "grad_norm": 19.819433212280273, + "learning_rate": 4.987817556318491e-05, + "loss": 7.9278, + "step": 327 + }, + { + "epoch": 0.03162970106075217, + "grad_norm": 14.7460355758667, + "learning_rate": 4.987742763996344e-05, + "loss": 8.0581, + "step": 328 + }, + { + "epoch": 0.031726133076181294, + "grad_norm": 11.290665626525879, + "learning_rate": 4.987667743352228e-05, + "loss": 8.1885, + "step": 329 + }, + { + "epoch": 0.03182256509161042, + "grad_norm": 5.798753261566162, + "learning_rate": 4.987592494393028e-05, + "loss": 8.3355, + "step": 330 + }, + { + "epoch": 0.031918997107039535, + "grad_norm": 19.859468460083008, + "learning_rate": 4.9875170171256494e-05, + "loss": 8.2547, + "step": 331 + }, + { + "epoch": 0.03201542912246866, + "grad_norm": 17.41913604736328, + "learning_rate": 4.987441311557022e-05, + "loss": 8.2316, + "step": 332 + }, + { + "epoch": 0.03211186113789778, + "grad_norm": 9.4606294631958, + "learning_rate": 4.987365377694092e-05, + "loss": 8.0372, + "step": 333 + }, + { + "epoch": 0.032208293153326906, + "grad_norm": 10.852896690368652, + "learning_rate": 4.987289215543829e-05, + "loss": 8.0862, + "step": 334 + }, + { + "epoch": 0.03230472516875603, + "grad_norm": 14.875093460083008, + "learning_rate": 4.9872128251132234e-05, + "loss": 9.3895, + "step": 335 + }, + { + "epoch": 0.032401157184185146, + "grad_norm": 15.737958908081055, + "learning_rate": 4.9871362064092856e-05, + "loss": 9.4074, + "step": 336 + }, + { + "epoch": 0.03249758919961427, + "grad_norm": 10.411087036132812, + "learning_rate": 4.987059359439048e-05, + "loss": 9.3526, + "step": 337 + }, + { + "epoch": 0.032594021215043394, + "grad_norm": 22.389772415161133, + "learning_rate": 4.9869822842095635e-05, + "loss": 9.517, + "step": 338 + }, + { + "epoch": 0.03269045323047252, + "grad_norm": 18.261947631835938, + "learning_rate": 4.986904980727907e-05, + "loss": 9.2924, + "step": 339 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 12.186614990234375, + "learning_rate": 4.986827449001171e-05, + "loss": 9.2234, + "step": 340 + }, + { + "epoch": 0.032883317261330765, + "grad_norm": 14.614298820495605, + "learning_rate": 4.9867496890364726e-05, + "loss": 9.2366, + "step": 341 + }, + { + "epoch": 0.03297974927675988, + "grad_norm": 10.4242582321167, + "learning_rate": 4.9866717008409493e-05, + "loss": 9.3733, + "step": 342 + }, + { + "epoch": 0.033076181292189005, + "grad_norm": 14.891242027282715, + "learning_rate": 4.986593484421757e-05, + "loss": 9.4776, + "step": 343 + }, + { + "epoch": 0.03317261330761813, + "grad_norm": 8.696683883666992, + "learning_rate": 4.9865150397860755e-05, + "loss": 9.3042, + "step": 344 + }, + { + "epoch": 0.03326904532304725, + "grad_norm": 7.18649959564209, + "learning_rate": 4.986436366941104e-05, + "loss": 9.2944, + "step": 345 + }, + { + "epoch": 0.033365477338476376, + "grad_norm": 2.455465316772461, + "learning_rate": 4.986357465894063e-05, + "loss": 9.1797, + "step": 346 + }, + { + "epoch": 0.03346190935390549, + "grad_norm": 17.469585418701172, + "learning_rate": 4.9862783366521944e-05, + "loss": 8.6232, + "step": 347 + }, + { + "epoch": 0.03355834136933462, + "grad_norm": 17.799530029296875, + "learning_rate": 4.98619897922276e-05, + "loss": 8.4299, + "step": 348 + }, + { + "epoch": 0.03365477338476374, + "grad_norm": 19.466184616088867, + "learning_rate": 4.986119393613043e-05, + "loss": 8.3812, + "step": 349 + }, + { + "epoch": 0.033751205400192864, + "grad_norm": 16.31876564025879, + "learning_rate": 4.986039579830348e-05, + "loss": 8.5352, + "step": 350 + }, + { + "epoch": 0.03384763741562199, + "grad_norm": 7.221741199493408, + "learning_rate": 4.985959537882e-05, + "loss": 9.0893, + "step": 351 + }, + { + "epoch": 0.03394406943105111, + "grad_norm": 6.212620735168457, + "learning_rate": 4.985879267775346e-05, + "loss": 9.0024, + "step": 352 + }, + { + "epoch": 0.03404050144648023, + "grad_norm": 7.258167266845703, + "learning_rate": 4.9857987695177525e-05, + "loss": 8.9634, + "step": 353 + }, + { + "epoch": 0.03413693346190935, + "grad_norm": 4.64061164855957, + "learning_rate": 4.985718043116607e-05, + "loss": 9.0697, + "step": 354 + }, + { + "epoch": 0.034233365477338476, + "grad_norm": 12.098272323608398, + "learning_rate": 4.9856370885793186e-05, + "loss": 8.8445, + "step": 355 + }, + { + "epoch": 0.0343297974927676, + "grad_norm": 3.6248228549957275, + "learning_rate": 4.985555905913318e-05, + "loss": 8.6739, + "step": 356 + }, + { + "epoch": 0.03442622950819672, + "grad_norm": 13.815839767456055, + "learning_rate": 4.985474495126056e-05, + "loss": 8.793, + "step": 357 + }, + { + "epoch": 0.03452266152362585, + "grad_norm": 15.244930267333984, + "learning_rate": 4.985392856225003e-05, + "loss": 8.7461, + "step": 358 + }, + { + "epoch": 0.034619093539054964, + "grad_norm": 8.587879180908203, + "learning_rate": 4.9853109892176534e-05, + "loss": 8.7626, + "step": 359 + }, + { + "epoch": 0.03471552555448409, + "grad_norm": 15.900521278381348, + "learning_rate": 4.985228894111519e-05, + "loss": 8.7398, + "step": 360 + }, + { + "epoch": 0.03481195756991321, + "grad_norm": 28.267898559570312, + "learning_rate": 4.985146570914136e-05, + "loss": 8.8181, + "step": 361 + }, + { + "epoch": 0.034908389585342335, + "grad_norm": 17.635709762573242, + "learning_rate": 4.98506401963306e-05, + "loss": 8.5978, + "step": 362 + }, + { + "epoch": 0.03500482160077146, + "grad_norm": 10.040915489196777, + "learning_rate": 4.984981240275867e-05, + "loss": 8.3774, + "step": 363 + }, + { + "epoch": 0.035101253616200576, + "grad_norm": 16.690637588500977, + "learning_rate": 4.9848982328501536e-05, + "loss": 8.3859, + "step": 364 + }, + { + "epoch": 0.0351976856316297, + "grad_norm": 14.938213348388672, + "learning_rate": 4.984814997363539e-05, + "loss": 8.3816, + "step": 365 + }, + { + "epoch": 0.03529411764705882, + "grad_norm": 3.1353566646575928, + "learning_rate": 4.9847315338236635e-05, + "loss": 8.5094, + "step": 366 + }, + { + "epoch": 0.03539054966248795, + "grad_norm": 12.144314765930176, + "learning_rate": 4.984647842238185e-05, + "loss": 8.3, + "step": 367 + }, + { + "epoch": 0.03548698167791707, + "grad_norm": 4.676533222198486, + "learning_rate": 4.984563922614785e-05, + "loss": 8.2539, + "step": 368 + }, + { + "epoch": 0.035583413693346194, + "grad_norm": 20.46993064880371, + "learning_rate": 4.984479774961167e-05, + "loss": 8.2843, + "step": 369 + }, + { + "epoch": 0.03567984570877531, + "grad_norm": 23.01390266418457, + "learning_rate": 4.984395399285052e-05, + "loss": 8.1746, + "step": 370 + }, + { + "epoch": 0.035776277724204435, + "grad_norm": 6.057257652282715, + "learning_rate": 4.984310795594187e-05, + "loss": 8.092, + "step": 371 + }, + { + "epoch": 0.03587270973963356, + "grad_norm": 24.5620174407959, + "learning_rate": 4.984225963896333e-05, + "loss": 8.1677, + "step": 372 + }, + { + "epoch": 0.03596914175506268, + "grad_norm": 32.68037414550781, + "learning_rate": 4.9841409041992795e-05, + "loss": 8.1802, + "step": 373 + }, + { + "epoch": 0.036065573770491806, + "grad_norm": 26.643495559692383, + "learning_rate": 4.98405561651083e-05, + "loss": 8.0942, + "step": 374 + }, + { + "epoch": 0.03616200578592092, + "grad_norm": 7.493255138397217, + "learning_rate": 4.983970100838813e-05, + "loss": 8.2537, + "step": 375 + }, + { + "epoch": 0.036258437801350046, + "grad_norm": 21.463525772094727, + "learning_rate": 4.983884357191079e-05, + "loss": 8.2307, + "step": 376 + }, + { + "epoch": 0.03635486981677917, + "grad_norm": 35.00458908081055, + "learning_rate": 4.983798385575495e-05, + "loss": 8.0952, + "step": 377 + }, + { + "epoch": 0.036451301832208294, + "grad_norm": 27.564451217651367, + "learning_rate": 4.983712185999951e-05, + "loss": 7.9977, + "step": 378 + }, + { + "epoch": 0.03654773384763742, + "grad_norm": 6.630080223083496, + "learning_rate": 4.983625758472361e-05, + "loss": 8.0436, + "step": 379 + }, + { + "epoch": 0.03664416586306654, + "grad_norm": 27.655641555786133, + "learning_rate": 4.983539103000656e-05, + "loss": 8.0462, + "step": 380 + }, + { + "epoch": 0.03674059787849566, + "grad_norm": 39.11167526245117, + "learning_rate": 4.983452219592788e-05, + "loss": 8.1338, + "step": 381 + }, + { + "epoch": 0.03683702989392478, + "grad_norm": 33.75078201293945, + "learning_rate": 4.9833651082567325e-05, + "loss": 8.0773, + "step": 382 + }, + { + "epoch": 0.036933461909353905, + "grad_norm": 15.65123176574707, + "learning_rate": 4.983277769000483e-05, + "loss": 7.8304, + "step": 383 + }, + { + "epoch": 0.03702989392478303, + "grad_norm": 13.708221435546875, + "learning_rate": 4.983190201832057e-05, + "loss": 7.8005, + "step": 384 + }, + { + "epoch": 0.03712632594021215, + "grad_norm": 24.250696182250977, + "learning_rate": 4.98310240675949e-05, + "loss": 7.8204, + "step": 385 + }, + { + "epoch": 0.03722275795564127, + "grad_norm": 20.420188903808594, + "learning_rate": 4.983014383790841e-05, + "loss": 7.9896, + "step": 386 + }, + { + "epoch": 0.03731918997107039, + "grad_norm": 4.9779486656188965, + "learning_rate": 4.982926132934188e-05, + "loss": 8.2542, + "step": 387 + }, + { + "epoch": 0.03741562198649952, + "grad_norm": 17.666404724121094, + "learning_rate": 4.98283765419763e-05, + "loss": 9.3106, + "step": 388 + }, + { + "epoch": 0.03751205400192864, + "grad_norm": 28.21157455444336, + "learning_rate": 4.9827489475892885e-05, + "loss": 9.3846, + "step": 389 + }, + { + "epoch": 0.037608486017357765, + "grad_norm": 29.018421173095703, + "learning_rate": 4.982660013117304e-05, + "loss": 9.3387, + "step": 390 + }, + { + "epoch": 0.03770491803278689, + "grad_norm": 24.144254684448242, + "learning_rate": 4.982570850789839e-05, + "loss": 8.8158, + "step": 391 + }, + { + "epoch": 0.037801350048216005, + "grad_norm": 10.654420852661133, + "learning_rate": 4.982481460615077e-05, + "loss": 8.1738, + "step": 392 + }, + { + "epoch": 0.03789778206364513, + "grad_norm": 30.32037925720215, + "learning_rate": 4.982391842601222e-05, + "loss": 9.3534, + "step": 393 + }, + { + "epoch": 0.03799421407907425, + "grad_norm": 36.26893997192383, + "learning_rate": 4.982301996756499e-05, + "loss": 9.2884, + "step": 394 + }, + { + "epoch": 0.038090646094503376, + "grad_norm": 33.64028549194336, + "learning_rate": 4.982211923089154e-05, + "loss": 9.2448, + "step": 395 + }, + { + "epoch": 0.0381870781099325, + "grad_norm": 15.476400375366211, + "learning_rate": 4.982121621607454e-05, + "loss": 9.0786, + "step": 396 + }, + { + "epoch": 0.03828351012536162, + "grad_norm": 12.837586402893066, + "learning_rate": 4.9820310923196864e-05, + "loss": 8.9988, + "step": 397 + }, + { + "epoch": 0.03837994214079074, + "grad_norm": 16.495031356811523, + "learning_rate": 4.98194033523416e-05, + "loss": 8.9305, + "step": 398 + }, + { + "epoch": 0.038476374156219864, + "grad_norm": 20.387012481689453, + "learning_rate": 4.9818493503592046e-05, + "loss": 8.8882, + "step": 399 + }, + { + "epoch": 0.03857280617164899, + "grad_norm": 14.066413879394531, + "learning_rate": 4.9817581377031695e-05, + "loss": 8.8402, + "step": 400 + }, + { + "epoch": 0.03866923818707811, + "grad_norm": 3.749875783920288, + "learning_rate": 4.9816666972744286e-05, + "loss": 8.8627, + "step": 401 + }, + { + "epoch": 0.038765670202507235, + "grad_norm": 15.44411849975586, + "learning_rate": 4.981575029081372e-05, + "loss": 8.9373, + "step": 402 + }, + { + "epoch": 0.03886210221793635, + "grad_norm": 17.613569259643555, + "learning_rate": 4.981483133132413e-05, + "loss": 9.0573, + "step": 403 + }, + { + "epoch": 0.038958534233365476, + "grad_norm": 9.654831886291504, + "learning_rate": 4.9813910094359865e-05, + "loss": 8.8974, + "step": 404 + }, + { + "epoch": 0.0390549662487946, + "grad_norm": 9.855647087097168, + "learning_rate": 4.981298658000548e-05, + "loss": 8.5355, + "step": 405 + }, + { + "epoch": 0.03915139826422372, + "grad_norm": 10.879222869873047, + "learning_rate": 4.9812060788345715e-05, + "loss": 8.5633, + "step": 406 + }, + { + "epoch": 0.03924783027965285, + "grad_norm": 12.105947494506836, + "learning_rate": 4.981113271946556e-05, + "loss": 8.3684, + "step": 407 + }, + { + "epoch": 0.03934426229508197, + "grad_norm": 14.971885681152344, + "learning_rate": 4.981020237345018e-05, + "loss": 8.1039, + "step": 408 + }, + { + "epoch": 0.03944069431051109, + "grad_norm": 14.77509593963623, + "learning_rate": 4.9809269750384956e-05, + "loss": 8.0028, + "step": 409 + }, + { + "epoch": 0.03953712632594021, + "grad_norm": 12.674419403076172, + "learning_rate": 4.98083348503555e-05, + "loss": 8.0626, + "step": 410 + }, + { + "epoch": 0.039633558341369335, + "grad_norm": 8.31741714477539, + "learning_rate": 4.980739767344759e-05, + "loss": 8.0086, + "step": 411 + }, + { + "epoch": 0.03972999035679846, + "grad_norm": 6.781083583831787, + "learning_rate": 4.9806458219747265e-05, + "loss": 7.9645, + "step": 412 + }, + { + "epoch": 0.03982642237222758, + "grad_norm": 5.875626087188721, + "learning_rate": 4.980551648934074e-05, + "loss": 8.287, + "step": 413 + }, + { + "epoch": 0.0399228543876567, + "grad_norm": 10.157448768615723, + "learning_rate": 4.9804572482314435e-05, + "loss": 8.2268, + "step": 414 + }, + { + "epoch": 0.04001928640308582, + "grad_norm": 8.757606506347656, + "learning_rate": 4.9803626198755e-05, + "loss": 8.314, + "step": 415 + }, + { + "epoch": 0.040115718418514947, + "grad_norm": 7.556174278259277, + "learning_rate": 4.9802677638749274e-05, + "loss": 8.6652, + "step": 416 + }, + { + "epoch": 0.04021215043394407, + "grad_norm": 8.564729690551758, + "learning_rate": 4.980172680238433e-05, + "loss": 8.5909, + "step": 417 + }, + { + "epoch": 0.040308582449373194, + "grad_norm": 3.1015684604644775, + "learning_rate": 4.9800773689747425e-05, + "loss": 8.5328, + "step": 418 + }, + { + "epoch": 0.04040501446480232, + "grad_norm": 3.094102382659912, + "learning_rate": 4.979981830092603e-05, + "loss": 8.66, + "step": 419 + }, + { + "epoch": 0.040501446480231434, + "grad_norm": 4.680108547210693, + "learning_rate": 4.979886063600784e-05, + "loss": 8.4927, + "step": 420 + }, + { + "epoch": 0.04059787849566056, + "grad_norm": 3.546982526779175, + "learning_rate": 4.9797900695080745e-05, + "loss": 8.353, + "step": 421 + }, + { + "epoch": 0.04069431051108968, + "grad_norm": 13.374004364013672, + "learning_rate": 4.979693847823284e-05, + "loss": 8.0842, + "step": 422 + }, + { + "epoch": 0.040790742526518806, + "grad_norm": 6.378659248352051, + "learning_rate": 4.979597398555245e-05, + "loss": 8.2055, + "step": 423 + }, + { + "epoch": 0.04088717454194793, + "grad_norm": 6.5266547203063965, + "learning_rate": 4.979500721712807e-05, + "loss": 8.5278, + "step": 424 + }, + { + "epoch": 0.040983606557377046, + "grad_norm": 5.290638446807861, + "learning_rate": 4.979403817304846e-05, + "loss": 8.6902, + "step": 425 + }, + { + "epoch": 0.04108003857280617, + "grad_norm": 2.945812225341797, + "learning_rate": 4.9793066853402536e-05, + "loss": 8.4366, + "step": 426 + }, + { + "epoch": 0.041176470588235294, + "grad_norm": 5.881504535675049, + "learning_rate": 4.979209325827946e-05, + "loss": 8.4219, + "step": 427 + }, + { + "epoch": 0.04127290260366442, + "grad_norm": 3.2260563373565674, + "learning_rate": 4.979111738776857e-05, + "loss": 8.4878, + "step": 428 + }, + { + "epoch": 0.04136933461909354, + "grad_norm": 3.4749674797058105, + "learning_rate": 4.9790139241959443e-05, + "loss": 8.3003, + "step": 429 + }, + { + "epoch": 0.041465766634522665, + "grad_norm": 9.230287551879883, + "learning_rate": 4.978915882094185e-05, + "loss": 8.0977, + "step": 430 + }, + { + "epoch": 0.04156219864995178, + "grad_norm": 10.452712059020996, + "learning_rate": 4.9788176124805764e-05, + "loss": 8.0717, + "step": 431 + }, + { + "epoch": 0.041658630665380905, + "grad_norm": 5.799816131591797, + "learning_rate": 4.978719115364139e-05, + "loss": 8.084, + "step": 432 + }, + { + "epoch": 0.04175506268081003, + "grad_norm": 9.777711868286133, + "learning_rate": 4.978620390753912e-05, + "loss": 8.0483, + "step": 433 + }, + { + "epoch": 0.04185149469623915, + "grad_norm": 8.47663402557373, + "learning_rate": 4.9785214386589555e-05, + "loss": 7.9209, + "step": 434 + }, + { + "epoch": 0.041947926711668276, + "grad_norm": 6.45842981338501, + "learning_rate": 4.978422259088352e-05, + "loss": 8.0297, + "step": 435 + }, + { + "epoch": 0.04204435872709739, + "grad_norm": 7.8504958152771, + "learning_rate": 4.978322852051205e-05, + "loss": 7.9405, + "step": 436 + }, + { + "epoch": 0.04214079074252652, + "grad_norm": 3.7656846046447754, + "learning_rate": 4.9782232175566355e-05, + "loss": 7.8616, + "step": 437 + }, + { + "epoch": 0.04223722275795564, + "grad_norm": 5.753844261169434, + "learning_rate": 4.978123355613791e-05, + "loss": 7.7171, + "step": 438 + }, + { + "epoch": 0.042333654773384764, + "grad_norm": 6.208416938781738, + "learning_rate": 4.978023266231834e-05, + "loss": 7.6318, + "step": 439 + }, + { + "epoch": 0.04243008678881389, + "grad_norm": 4.433635711669922, + "learning_rate": 4.977922949419951e-05, + "loss": 7.5651, + "step": 440 + }, + { + "epoch": 0.04252651880424301, + "grad_norm": 5.982161521911621, + "learning_rate": 4.97782240518735e-05, + "loss": 7.5407, + "step": 441 + }, + { + "epoch": 0.04262295081967213, + "grad_norm": 7.922628879547119, + "learning_rate": 4.977721633543259e-05, + "loss": 7.5567, + "step": 442 + }, + { + "epoch": 0.04271938283510125, + "grad_norm": 2.7739408016204834, + "learning_rate": 4.9776206344969254e-05, + "loss": 7.6278, + "step": 443 + }, + { + "epoch": 0.042815814850530376, + "grad_norm": 9.622992515563965, + "learning_rate": 4.977519408057619e-05, + "loss": 7.5791, + "step": 444 + }, + { + "epoch": 0.0429122468659595, + "grad_norm": 3.617684841156006, + "learning_rate": 4.9774179542346324e-05, + "loss": 7.6676, + "step": 445 + }, + { + "epoch": 0.04300867888138862, + "grad_norm": 6.375345706939697, + "learning_rate": 4.9773162730372735e-05, + "loss": 7.5498, + "step": 446 + }, + { + "epoch": 0.04310511089681774, + "grad_norm": 5.047219276428223, + "learning_rate": 4.977214364474877e-05, + "loss": 7.0666, + "step": 447 + }, + { + "epoch": 0.043201542912246864, + "grad_norm": 17.515024185180664, + "learning_rate": 4.9771122285567945e-05, + "loss": 7.1144, + "step": 448 + }, + { + "epoch": 0.04329797492767599, + "grad_norm": 8.265954971313477, + "learning_rate": 4.9770098652924005e-05, + "loss": 7.4888, + "step": 449 + }, + { + "epoch": 0.04339440694310511, + "grad_norm": 7.0176100730896, + "learning_rate": 4.97690727469109e-05, + "loss": 7.3927, + "step": 450 + }, + { + "epoch": 0.043490838958534235, + "grad_norm": 5.3592424392700195, + "learning_rate": 4.976804456762279e-05, + "loss": 7.4754, + "step": 451 + }, + { + "epoch": 0.04358727097396336, + "grad_norm": 4.105835914611816, + "learning_rate": 4.976701411515402e-05, + "loss": 7.4421, + "step": 452 + }, + { + "epoch": 0.043683702989392476, + "grad_norm": 9.683066368103027, + "learning_rate": 4.976598138959919e-05, + "loss": 7.4714, + "step": 453 + }, + { + "epoch": 0.0437801350048216, + "grad_norm": 11.1967191696167, + "learning_rate": 4.976494639105307e-05, + "loss": 8.1417, + "step": 454 + }, + { + "epoch": 0.04387656702025072, + "grad_norm": 12.534073829650879, + "learning_rate": 4.976390911961064e-05, + "loss": 9.2123, + "step": 455 + }, + { + "epoch": 0.04397299903567985, + "grad_norm": 20.159730911254883, + "learning_rate": 4.976286957536712e-05, + "loss": 9.338, + "step": 456 + }, + { + "epoch": 0.04406943105110897, + "grad_norm": 14.831771850585938, + "learning_rate": 4.976182775841791e-05, + "loss": 8.6852, + "step": 457 + }, + { + "epoch": 0.04416586306653809, + "grad_norm": 4.522267818450928, + "learning_rate": 4.976078366885861e-05, + "loss": 8.6997, + "step": 458 + }, + { + "epoch": 0.04426229508196721, + "grad_norm": 7.2478718757629395, + "learning_rate": 4.975973730678508e-05, + "loss": 8.8063, + "step": 459 + }, + { + "epoch": 0.044358727097396335, + "grad_norm": 10.538217544555664, + "learning_rate": 4.9758688672293326e-05, + "loss": 8.4388, + "step": 460 + }, + { + "epoch": 0.04445515911282546, + "grad_norm": 5.905882835388184, + "learning_rate": 4.975763776547959e-05, + "loss": 8.7669, + "step": 461 + }, + { + "epoch": 0.04455159112825458, + "grad_norm": 3.6632111072540283, + "learning_rate": 4.975658458644035e-05, + "loss": 8.6341, + "step": 462 + }, + { + "epoch": 0.044648023143683706, + "grad_norm": 4.22193717956543, + "learning_rate": 4.975552913527223e-05, + "loss": 8.6215, + "step": 463 + }, + { + "epoch": 0.04474445515911282, + "grad_norm": 5.258401393890381, + "learning_rate": 4.9754471412072123e-05, + "loss": 8.7067, + "step": 464 + }, + { + "epoch": 0.044840887174541946, + "grad_norm": 7.036890983581543, + "learning_rate": 4.97534114169371e-05, + "loss": 8.3587, + "step": 465 + }, + { + "epoch": 0.04493731918997107, + "grad_norm": 3.958103895187378, + "learning_rate": 4.975234914996444e-05, + "loss": 8.2653, + "step": 466 + }, + { + "epoch": 0.045033751205400194, + "grad_norm": 8.557723999023438, + "learning_rate": 4.975128461125164e-05, + "loss": 8.0401, + "step": 467 + }, + { + "epoch": 0.04513018322082932, + "grad_norm": 13.735795021057129, + "learning_rate": 4.97502178008964e-05, + "loss": 7.7004, + "step": 468 + }, + { + "epoch": 0.04522661523625844, + "grad_norm": 10.207188606262207, + "learning_rate": 4.974914871899664e-05, + "loss": 7.5564, + "step": 469 + }, + { + "epoch": 0.04532304725168756, + "grad_norm": 4.756181240081787, + "learning_rate": 4.9748077365650465e-05, + "loss": 7.5589, + "step": 470 + }, + { + "epoch": 0.04541947926711668, + "grad_norm": 5.937626838684082, + "learning_rate": 4.974700374095621e-05, + "loss": 7.5113, + "step": 471 + }, + { + "epoch": 0.045515911282545805, + "grad_norm": 5.806030750274658, + "learning_rate": 4.9745927845012416e-05, + "loss": 7.4635, + "step": 472 + }, + { + "epoch": 0.04561234329797493, + "grad_norm": 7.047977447509766, + "learning_rate": 4.974484967791782e-05, + "loss": 7.4563, + "step": 473 + }, + { + "epoch": 0.04570877531340405, + "grad_norm": 7.049996852874756, + "learning_rate": 4.974376923977137e-05, + "loss": 7.3323, + "step": 474 + }, + { + "epoch": 0.04580520732883317, + "grad_norm": 5.156527042388916, + "learning_rate": 4.9742686530672236e-05, + "loss": 7.2335, + "step": 475 + }, + { + "epoch": 0.04590163934426229, + "grad_norm": 3.8284270763397217, + "learning_rate": 4.974160155071978e-05, + "loss": 7.2719, + "step": 476 + }, + { + "epoch": 0.04599807135969142, + "grad_norm": 4.97770357131958, + "learning_rate": 4.97405143000136e-05, + "loss": 8.2816, + "step": 477 + }, + { + "epoch": 0.04609450337512054, + "grad_norm": 6.555431365966797, + "learning_rate": 4.973942477865347e-05, + "loss": 8.052, + "step": 478 + }, + { + "epoch": 0.046190935390549664, + "grad_norm": 6.889314651489258, + "learning_rate": 4.973833298673937e-05, + "loss": 8.2758, + "step": 479 + }, + { + "epoch": 0.04628736740597879, + "grad_norm": 6.040927410125732, + "learning_rate": 4.973723892437152e-05, + "loss": 8.4292, + "step": 480 + }, + { + "epoch": 0.046383799421407905, + "grad_norm": 4.749800682067871, + "learning_rate": 4.9736142591650326e-05, + "loss": 8.2235, + "step": 481 + }, + { + "epoch": 0.04648023143683703, + "grad_norm": 3.13912034034729, + "learning_rate": 4.9735043988676414e-05, + "loss": 8.1707, + "step": 482 + }, + { + "epoch": 0.04657666345226615, + "grad_norm": 3.4812865257263184, + "learning_rate": 4.973394311555061e-05, + "loss": 8.1604, + "step": 483 + }, + { + "epoch": 0.046673095467695276, + "grad_norm": 4.260141849517822, + "learning_rate": 4.973283997237395e-05, + "loss": 8.0652, + "step": 484 + }, + { + "epoch": 0.0467695274831244, + "grad_norm": 7.319946765899658, + "learning_rate": 4.9731734559247666e-05, + "loss": 7.9059, + "step": 485 + }, + { + "epoch": 0.04686595949855352, + "grad_norm": 15.447428703308105, + "learning_rate": 4.9730626876273235e-05, + "loss": 7.6443, + "step": 486 + }, + { + "epoch": 0.04696239151398264, + "grad_norm": 13.401851654052734, + "learning_rate": 4.9729516923552314e-05, + "loss": 7.6536, + "step": 487 + }, + { + "epoch": 0.047058823529411764, + "grad_norm": 8.634393692016602, + "learning_rate": 4.9728404701186756e-05, + "loss": 7.9827, + "step": 488 + }, + { + "epoch": 0.04715525554484089, + "grad_norm": 8.375943183898926, + "learning_rate": 4.972729020927865e-05, + "loss": 7.566, + "step": 489 + }, + { + "epoch": 0.04725168756027001, + "grad_norm": 9.288909912109375, + "learning_rate": 4.972617344793029e-05, + "loss": 7.5003, + "step": 490 + }, + { + "epoch": 0.047348119575699135, + "grad_norm": 5.584190845489502, + "learning_rate": 4.972505441724416e-05, + "loss": 7.498, + "step": 491 + }, + { + "epoch": 0.04744455159112825, + "grad_norm": 9.611954689025879, + "learning_rate": 4.9723933117322964e-05, + "loss": 7.4574, + "step": 492 + }, + { + "epoch": 0.047540983606557376, + "grad_norm": 10.461143493652344, + "learning_rate": 4.972280954826962e-05, + "loss": 7.8264, + "step": 493 + }, + { + "epoch": 0.0476374156219865, + "grad_norm": 13.174617767333984, + "learning_rate": 4.972168371018725e-05, + "loss": 7.6276, + "step": 494 + }, + { + "epoch": 0.04773384763741562, + "grad_norm": 10.340668678283691, + "learning_rate": 4.9720555603179165e-05, + "loss": 7.4374, + "step": 495 + }, + { + "epoch": 0.04783027965284475, + "grad_norm": 15.505912780761719, + "learning_rate": 4.971942522734892e-05, + "loss": 7.3695, + "step": 496 + }, + { + "epoch": 0.047926711668273864, + "grad_norm": 8.257423400878906, + "learning_rate": 4.971829258280025e-05, + "loss": 7.4705, + "step": 497 + }, + { + "epoch": 0.04802314368370299, + "grad_norm": 10.61296558380127, + "learning_rate": 4.971715766963712e-05, + "loss": 7.3333, + "step": 498 + }, + { + "epoch": 0.04811957569913211, + "grad_norm": 10.981717109680176, + "learning_rate": 4.9716020487963664e-05, + "loss": 7.3161, + "step": 499 + }, + { + "epoch": 0.048216007714561235, + "grad_norm": 6.807445526123047, + "learning_rate": 4.9714881037884276e-05, + "loss": 7.282, + "step": 500 + }, + { + "epoch": 0.04831243972999036, + "grad_norm": 5.189956188201904, + "learning_rate": 4.971373931950353e-05, + "loss": 7.2921, + "step": 501 + }, + { + "epoch": 0.04840887174541948, + "grad_norm": 7.254018306732178, + "learning_rate": 4.971259533292621e-05, + "loss": 7.1694, + "step": 502 + }, + { + "epoch": 0.0485053037608486, + "grad_norm": 7.539865493774414, + "learning_rate": 4.9711449078257286e-05, + "loss": 7.1151, + "step": 503 + }, + { + "epoch": 0.04860173577627772, + "grad_norm": 3.5605714321136475, + "learning_rate": 4.9710300555601996e-05, + "loss": 7.1754, + "step": 504 + }, + { + "epoch": 0.048698167791706846, + "grad_norm": 11.793344497680664, + "learning_rate": 4.9709149765065734e-05, + "loss": 7.2315, + "step": 505 + }, + { + "epoch": 0.04879459980713597, + "grad_norm": 11.269302368164062, + "learning_rate": 4.9707996706754115e-05, + "loss": 7.1275, + "step": 506 + }, + { + "epoch": 0.048891031822565094, + "grad_norm": 3.4554874897003174, + "learning_rate": 4.970684138077297e-05, + "loss": 7.1237, + "step": 507 + }, + { + "epoch": 0.04898746383799421, + "grad_norm": 4.579529762268066, + "learning_rate": 4.970568378722833e-05, + "loss": 7.1067, + "step": 508 + }, + { + "epoch": 0.049083895853423334, + "grad_norm": 6.728043556213379, + "learning_rate": 4.970452392622644e-05, + "loss": 7.2072, + "step": 509 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 4.915396690368652, + "learning_rate": 4.970336179787376e-05, + "loss": 7.857, + "step": 510 + }, + { + "epoch": 0.04927675988428158, + "grad_norm": 6.908259391784668, + "learning_rate": 4.970219740227693e-05, + "loss": 7.2865, + "step": 511 + }, + { + "epoch": 0.049373191899710706, + "grad_norm": 5.111330032348633, + "learning_rate": 4.970103073954283e-05, + "loss": 8.1316, + "step": 512 + }, + { + "epoch": 0.04946962391513983, + "grad_norm": 11.992586135864258, + "learning_rate": 4.969986180977852e-05, + "loss": 8.2214, + "step": 513 + }, + { + "epoch": 0.049566055930568946, + "grad_norm": 11.707314491271973, + "learning_rate": 4.969869061309131e-05, + "loss": 8.2546, + "step": 514 + }, + { + "epoch": 0.04966248794599807, + "grad_norm": 8.509270668029785, + "learning_rate": 4.969751714958867e-05, + "loss": 7.6191, + "step": 515 + }, + { + "epoch": 0.049758919961427193, + "grad_norm": 5.565041542053223, + "learning_rate": 4.9696341419378304e-05, + "loss": 8.1237, + "step": 516 + }, + { + "epoch": 0.04985535197685632, + "grad_norm": 4.544252395629883, + "learning_rate": 4.9695163422568105e-05, + "loss": 7.2572, + "step": 517 + }, + { + "epoch": 0.04995178399228544, + "grad_norm": 5.090080738067627, + "learning_rate": 4.969398315926622e-05, + "loss": 7.897, + "step": 518 + }, + { + "epoch": 0.050048216007714565, + "grad_norm": 8.469888687133789, + "learning_rate": 4.969280062958094e-05, + "loss": 8.0085, + "step": 519 + }, + { + "epoch": 0.05014464802314368, + "grad_norm": 9.7757568359375, + "learning_rate": 4.969161583362082e-05, + "loss": 8.0503, + "step": 520 + }, + { + "epoch": 0.050241080038572805, + "grad_norm": 5.25663423538208, + "learning_rate": 4.969042877149459e-05, + "loss": 7.9158, + "step": 521 + }, + { + "epoch": 0.05033751205400193, + "grad_norm": 4.700706958770752, + "learning_rate": 4.9689239443311185e-05, + "loss": 7.8144, + "step": 522 + }, + { + "epoch": 0.05043394406943105, + "grad_norm": 4.744993686676025, + "learning_rate": 4.968804784917978e-05, + "loss": 8.0455, + "step": 523 + }, + { + "epoch": 0.050530376084860176, + "grad_norm": 5.439626693725586, + "learning_rate": 4.968685398920972e-05, + "loss": 8.1198, + "step": 524 + }, + { + "epoch": 0.05062680810028929, + "grad_norm": 9.264444351196289, + "learning_rate": 4.9685657863510604e-05, + "loss": 8.1163, + "step": 525 + }, + { + "epoch": 0.05072324011571842, + "grad_norm": 15.910957336425781, + "learning_rate": 4.968445947219218e-05, + "loss": 7.7114, + "step": 526 + }, + { + "epoch": 0.05081967213114754, + "grad_norm": 14.35216236114502, + "learning_rate": 4.968325881536445e-05, + "loss": 7.5906, + "step": 527 + }, + { + "epoch": 0.050916104146576664, + "grad_norm": 7.743104934692383, + "learning_rate": 4.96820558931376e-05, + "loss": 7.4319, + "step": 528 + }, + { + "epoch": 0.05101253616200579, + "grad_norm": 8.18915843963623, + "learning_rate": 4.968085070562205e-05, + "loss": 7.2651, + "step": 529 + }, + { + "epoch": 0.05110896817743491, + "grad_norm": 8.686506271362305, + "learning_rate": 4.967964325292839e-05, + "loss": 7.479, + "step": 530 + }, + { + "epoch": 0.05120540019286403, + "grad_norm": 10.163628578186035, + "learning_rate": 4.967843353516745e-05, + "loss": 7.3857, + "step": 531 + }, + { + "epoch": 0.05130183220829315, + "grad_norm": 6.713008880615234, + "learning_rate": 4.967722155245026e-05, + "loss": 7.3257, + "step": 532 + }, + { + "epoch": 0.051398264223722276, + "grad_norm": 6.692584991455078, + "learning_rate": 4.967600730488804e-05, + "loss": 7.3731, + "step": 533 + }, + { + "epoch": 0.0514946962391514, + "grad_norm": 9.046116828918457, + "learning_rate": 4.967479079259225e-05, + "loss": 7.3018, + "step": 534 + }, + { + "epoch": 0.05159112825458052, + "grad_norm": 8.60361099243164, + "learning_rate": 4.9673572015674516e-05, + "loss": 7.2738, + "step": 535 + }, + { + "epoch": 0.05168756027000964, + "grad_norm": 7.396322250366211, + "learning_rate": 4.967235097424673e-05, + "loss": 7.8169, + "step": 536 + }, + { + "epoch": 0.051783992285438764, + "grad_norm": 6.51984977722168, + "learning_rate": 4.9671127668420926e-05, + "loss": 8.1969, + "step": 537 + }, + { + "epoch": 0.05188042430086789, + "grad_norm": 4.719422340393066, + "learning_rate": 4.966990209830939e-05, + "loss": 8.1003, + "step": 538 + }, + { + "epoch": 0.05197685631629701, + "grad_norm": 4.787863731384277, + "learning_rate": 4.9668674264024605e-05, + "loss": 7.9571, + "step": 539 + }, + { + "epoch": 0.052073288331726135, + "grad_norm": 7.87312126159668, + "learning_rate": 4.9667444165679264e-05, + "loss": 7.9893, + "step": 540 + }, + { + "epoch": 0.05216972034715526, + "grad_norm": 9.554696083068848, + "learning_rate": 4.9666211803386247e-05, + "loss": 7.5922, + "step": 541 + }, + { + "epoch": 0.052266152362584375, + "grad_norm": 3.1116487979888916, + "learning_rate": 4.9664977177258676e-05, + "loss": 7.8805, + "step": 542 + }, + { + "epoch": 0.0523625843780135, + "grad_norm": 2.7788820266723633, + "learning_rate": 4.966374028740986e-05, + "loss": 7.6435, + "step": 543 + }, + { + "epoch": 0.05245901639344262, + "grad_norm": 2.8334763050079346, + "learning_rate": 4.966250113395331e-05, + "loss": 7.7556, + "step": 544 + }, + { + "epoch": 0.05255544840887175, + "grad_norm": 3.8553876876831055, + "learning_rate": 4.9661259717002764e-05, + "loss": 7.6016, + "step": 545 + }, + { + "epoch": 0.05265188042430087, + "grad_norm": 3.0400679111480713, + "learning_rate": 4.9660016036672155e-05, + "loss": 7.7075, + "step": 546 + }, + { + "epoch": 0.05274831243972999, + "grad_norm": 3.6536972522735596, + "learning_rate": 4.9658770093075623e-05, + "loss": 7.5832, + "step": 547 + }, + { + "epoch": 0.05284474445515911, + "grad_norm": 3.811890125274658, + "learning_rate": 4.965752188632753e-05, + "loss": 7.6065, + "step": 548 + }, + { + "epoch": 0.052941176470588235, + "grad_norm": 3.8460748195648193, + "learning_rate": 4.965627141654242e-05, + "loss": 7.6661, + "step": 549 + }, + { + "epoch": 0.05303760848601736, + "grad_norm": 3.728832721710205, + "learning_rate": 4.965501868383506e-05, + "loss": 7.695, + "step": 550 + }, + { + "epoch": 0.05313404050144648, + "grad_norm": 6.055170059204102, + "learning_rate": 4.965376368832044e-05, + "loss": 7.4689, + "step": 551 + }, + { + "epoch": 0.053230472516875606, + "grad_norm": 4.474534511566162, + "learning_rate": 4.965250643011373e-05, + "loss": 7.5523, + "step": 552 + }, + { + "epoch": 0.05332690453230472, + "grad_norm": 2.6279327869415283, + "learning_rate": 4.9651246909330326e-05, + "loss": 7.4746, + "step": 553 + }, + { + "epoch": 0.053423336547733846, + "grad_norm": 9.667316436767578, + "learning_rate": 4.964998512608582e-05, + "loss": 7.3738, + "step": 554 + }, + { + "epoch": 0.05351976856316297, + "grad_norm": 14.089911460876465, + "learning_rate": 4.9648721080496016e-05, + "loss": 7.348, + "step": 555 + }, + { + "epoch": 0.053616200578592094, + "grad_norm": 4.606900215148926, + "learning_rate": 4.964745477267693e-05, + "loss": 7.3053, + "step": 556 + }, + { + "epoch": 0.05371263259402122, + "grad_norm": 13.718865394592285, + "learning_rate": 4.9646186202744784e-05, + "loss": 7.3975, + "step": 557 + }, + { + "epoch": 0.053809064609450334, + "grad_norm": 14.108428955078125, + "learning_rate": 4.964491537081599e-05, + "loss": 7.3931, + "step": 558 + }, + { + "epoch": 0.05390549662487946, + "grad_norm": 6.542649269104004, + "learning_rate": 4.9643642277007206e-05, + "loss": 7.2654, + "step": 559 + }, + { + "epoch": 0.05400192864030858, + "grad_norm": 9.551427841186523, + "learning_rate": 4.964236692143527e-05, + "loss": 7.1946, + "step": 560 + }, + { + "epoch": 0.054098360655737705, + "grad_norm": 8.340819358825684, + "learning_rate": 4.964108930421722e-05, + "loss": 7.2313, + "step": 561 + }, + { + "epoch": 0.05419479267116683, + "grad_norm": 3.9380762577056885, + "learning_rate": 4.9639809425470324e-05, + "loss": 7.1843, + "step": 562 + }, + { + "epoch": 0.05429122468659595, + "grad_norm": 9.403596878051758, + "learning_rate": 4.963852728531204e-05, + "loss": 7.0556, + "step": 563 + }, + { + "epoch": 0.05438765670202507, + "grad_norm": 8.31064224243164, + "learning_rate": 4.963724288386006e-05, + "loss": 7.0136, + "step": 564 + }, + { + "epoch": 0.05448408871745419, + "grad_norm": 3.1984965801239014, + "learning_rate": 4.9635956221232236e-05, + "loss": 7.0773, + "step": 565 + }, + { + "epoch": 0.05458052073288332, + "grad_norm": 6.4525556564331055, + "learning_rate": 4.9634667297546686e-05, + "loss": 7.0149, + "step": 566 + }, + { + "epoch": 0.05467695274831244, + "grad_norm": 4.115429878234863, + "learning_rate": 4.963337611292168e-05, + "loss": 6.9452, + "step": 567 + }, + { + "epoch": 0.054773384763741564, + "grad_norm": 7.680262088775635, + "learning_rate": 4.9632082667475745e-05, + "loss": 7.1667, + "step": 568 + }, + { + "epoch": 0.05486981677917069, + "grad_norm": 7.964494705200195, + "learning_rate": 4.963078696132757e-05, + "loss": 7.1432, + "step": 569 + }, + { + "epoch": 0.054966248794599805, + "grad_norm": 5.106312274932861, + "learning_rate": 4.962948899459609e-05, + "loss": 6.9629, + "step": 570 + }, + { + "epoch": 0.05506268081002893, + "grad_norm": 9.640681266784668, + "learning_rate": 4.962818876740042e-05, + "loss": 6.968, + "step": 571 + }, + { + "epoch": 0.05515911282545805, + "grad_norm": 4.756351947784424, + "learning_rate": 4.9626886279859895e-05, + "loss": 6.9055, + "step": 572 + }, + { + "epoch": 0.055255544840887176, + "grad_norm": 5.47014856338501, + "learning_rate": 4.9625581532094066e-05, + "loss": 7.2314, + "step": 573 + }, + { + "epoch": 0.0553519768563163, + "grad_norm": 7.749273777008057, + "learning_rate": 4.962427452422267e-05, + "loss": 7.9905, + "step": 574 + }, + { + "epoch": 0.05544840887174542, + "grad_norm": 5.651285171508789, + "learning_rate": 4.962296525636566e-05, + "loss": 8.0455, + "step": 575 + }, + { + "epoch": 0.05554484088717454, + "grad_norm": 4.945115566253662, + "learning_rate": 4.962165372864321e-05, + "loss": 7.9434, + "step": 576 + }, + { + "epoch": 0.055641272902603664, + "grad_norm": 7.483427047729492, + "learning_rate": 4.962033994117569e-05, + "loss": 8.0137, + "step": 577 + }, + { + "epoch": 0.05573770491803279, + "grad_norm": 6.014307022094727, + "learning_rate": 4.961902389408366e-05, + "loss": 7.9277, + "step": 578 + }, + { + "epoch": 0.05583413693346191, + "grad_norm": 6.442894458770752, + "learning_rate": 4.961770558748793e-05, + "loss": 7.8747, + "step": 579 + }, + { + "epoch": 0.055930568948891035, + "grad_norm": 7.744041442871094, + "learning_rate": 4.9616385021509474e-05, + "loss": 7.5574, + "step": 580 + }, + { + "epoch": 0.05602700096432015, + "grad_norm": 4.747359275817871, + "learning_rate": 4.96150621962695e-05, + "loss": 7.7677, + "step": 581 + }, + { + "epoch": 0.056123432979749276, + "grad_norm": 4.683193206787109, + "learning_rate": 4.961373711188941e-05, + "loss": 7.8114, + "step": 582 + }, + { + "epoch": 0.0562198649951784, + "grad_norm": 3.312889337539673, + "learning_rate": 4.961240976849083e-05, + "loss": 7.739, + "step": 583 + }, + { + "epoch": 0.05631629701060752, + "grad_norm": 4.396903038024902, + "learning_rate": 4.961108016619557e-05, + "loss": 7.605, + "step": 584 + }, + { + "epoch": 0.05641272902603665, + "grad_norm": 3.7541677951812744, + "learning_rate": 4.960974830512567e-05, + "loss": 7.91, + "step": 585 + }, + { + "epoch": 0.056509161041465764, + "grad_norm": 4.286538600921631, + "learning_rate": 4.960841418540335e-05, + "loss": 7.8734, + "step": 586 + }, + { + "epoch": 0.05660559305689489, + "grad_norm": 4.763824462890625, + "learning_rate": 4.960707780715107e-05, + "loss": 7.5341, + "step": 587 + }, + { + "epoch": 0.05670202507232401, + "grad_norm": 7.59773063659668, + "learning_rate": 4.9605739170491474e-05, + "loss": 7.468, + "step": 588 + }, + { + "epoch": 0.056798457087753135, + "grad_norm": 8.066536903381348, + "learning_rate": 4.9604398275547424e-05, + "loss": 7.4397, + "step": 589 + }, + { + "epoch": 0.05689488910318226, + "grad_norm": 8.380285263061523, + "learning_rate": 4.960305512244198e-05, + "loss": 7.4001, + "step": 590 + }, + { + "epoch": 0.05699132111861138, + "grad_norm": 8.124722480773926, + "learning_rate": 4.960170971129842e-05, + "loss": 7.3055, + "step": 591 + }, + { + "epoch": 0.0570877531340405, + "grad_norm": 4.141718864440918, + "learning_rate": 4.960036204224022e-05, + "loss": 7.2844, + "step": 592 + }, + { + "epoch": 0.05718418514946962, + "grad_norm": 4.7875471115112305, + "learning_rate": 4.959901211539107e-05, + "loss": 7.3513, + "step": 593 + }, + { + "epoch": 0.057280617164898746, + "grad_norm": 4.810647964477539, + "learning_rate": 4.959765993087487e-05, + "loss": 7.1783, + "step": 594 + }, + { + "epoch": 0.05737704918032787, + "grad_norm": 5.035844326019287, + "learning_rate": 4.959630548881571e-05, + "loss": 7.1043, + "step": 595 + }, + { + "epoch": 0.057473481195756994, + "grad_norm": 7.219296455383301, + "learning_rate": 4.9594948789337914e-05, + "loss": 7.1012, + "step": 596 + }, + { + "epoch": 0.05756991321118611, + "grad_norm": 4.246260166168213, + "learning_rate": 4.959358983256598e-05, + "loss": 7.1781, + "step": 597 + }, + { + "epoch": 0.057666345226615234, + "grad_norm": 4.010378360748291, + "learning_rate": 4.959222861862465e-05, + "loss": 7.4363, + "step": 598 + }, + { + "epoch": 0.05776277724204436, + "grad_norm": 4.492542266845703, + "learning_rate": 4.959086514763883e-05, + "loss": 7.8122, + "step": 599 + }, + { + "epoch": 0.05785920925747348, + "grad_norm": 6.490872859954834, + "learning_rate": 4.9589499419733686e-05, + "loss": 8.1298, + "step": 600 + }, + { + "epoch": 0.057955641272902605, + "grad_norm": 4.638951778411865, + "learning_rate": 4.9588131435034555e-05, + "loss": 7.827, + "step": 601 + }, + { + "epoch": 0.05805207328833173, + "grad_norm": 4.953810214996338, + "learning_rate": 4.958676119366697e-05, + "loss": 7.683, + "step": 602 + }, + { + "epoch": 0.058148505303760846, + "grad_norm": 3.357394218444824, + "learning_rate": 4.9585388695756714e-05, + "loss": 7.6735, + "step": 603 + }, + { + "epoch": 0.05824493731918997, + "grad_norm": 7.088304042816162, + "learning_rate": 4.958401394142974e-05, + "loss": 7.5655, + "step": 604 + }, + { + "epoch": 0.05834136933461909, + "grad_norm": 7.273013591766357, + "learning_rate": 4.958263693081222e-05, + "loss": 7.208, + "step": 605 + }, + { + "epoch": 0.05843780135004822, + "grad_norm": 3.173706531524658, + "learning_rate": 4.9581257664030545e-05, + "loss": 7.7075, + "step": 606 + }, + { + "epoch": 0.05853423336547734, + "grad_norm": 4.955965518951416, + "learning_rate": 4.957987614121129e-05, + "loss": 7.8025, + "step": 607 + }, + { + "epoch": 0.05863066538090646, + "grad_norm": 8.765563011169434, + "learning_rate": 4.957849236248126e-05, + "loss": 7.8591, + "step": 608 + }, + { + "epoch": 0.05872709739633558, + "grad_norm": 5.552623271942139, + "learning_rate": 4.957710632796746e-05, + "loss": 7.7811, + "step": 609 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 6.09744119644165, + "learning_rate": 4.957571803779708e-05, + "loss": 7.8284, + "step": 610 + }, + { + "epoch": 0.05891996142719383, + "grad_norm": 3.4246718883514404, + "learning_rate": 4.957432749209755e-05, + "loss": 7.6634, + "step": 611 + }, + { + "epoch": 0.05901639344262295, + "grad_norm": 9.05555248260498, + "learning_rate": 4.957293469099649e-05, + "loss": 7.6755, + "step": 612 + }, + { + "epoch": 0.059112825458052076, + "grad_norm": 9.119853019714355, + "learning_rate": 4.957153963462172e-05, + "loss": 7.6821, + "step": 613 + }, + { + "epoch": 0.05920925747348119, + "grad_norm": 3.419140338897705, + "learning_rate": 4.957014232310129e-05, + "loss": 7.6161, + "step": 614 + }, + { + "epoch": 0.05930568948891032, + "grad_norm": 4.614326477050781, + "learning_rate": 4.956874275656344e-05, + "loss": 7.5078, + "step": 615 + }, + { + "epoch": 0.05940212150433944, + "grad_norm": 8.800910949707031, + "learning_rate": 4.956734093513662e-05, + "loss": 7.5157, + "step": 616 + }, + { + "epoch": 0.059498553519768564, + "grad_norm": 11.937853813171387, + "learning_rate": 4.9565936858949486e-05, + "loss": 7.4675, + "step": 617 + }, + { + "epoch": 0.05959498553519769, + "grad_norm": 8.386017799377441, + "learning_rate": 4.95645305281309e-05, + "loss": 7.4078, + "step": 618 + }, + { + "epoch": 0.059691417550626805, + "grad_norm": 9.55029010772705, + "learning_rate": 4.956312194280994e-05, + "loss": 7.2986, + "step": 619 + }, + { + "epoch": 0.05978784956605593, + "grad_norm": 6.401444435119629, + "learning_rate": 4.9561711103115874e-05, + "loss": 7.307, + "step": 620 + }, + { + "epoch": 0.05988428158148505, + "grad_norm": 5.62228536605835, + "learning_rate": 4.95602980091782e-05, + "loss": 7.292, + "step": 621 + }, + { + "epoch": 0.059980713596914176, + "grad_norm": 6.920928001403809, + "learning_rate": 4.9558882661126605e-05, + "loss": 7.203, + "step": 622 + }, + { + "epoch": 0.0600771456123433, + "grad_norm": 8.574027061462402, + "learning_rate": 4.955746505909098e-05, + "loss": 7.2176, + "step": 623 + }, + { + "epoch": 0.06017357762777242, + "grad_norm": 8.712440490722656, + "learning_rate": 4.955604520320144e-05, + "loss": 7.1516, + "step": 624 + }, + { + "epoch": 0.06027000964320154, + "grad_norm": 9.328496932983398, + "learning_rate": 4.9554623093588295e-05, + "loss": 7.2576, + "step": 625 + }, + { + "epoch": 0.060366441658630664, + "grad_norm": 5.8497819900512695, + "learning_rate": 4.955319873038207e-05, + "loss": 7.2777, + "step": 626 + }, + { + "epoch": 0.06046287367405979, + "grad_norm": 7.268163681030273, + "learning_rate": 4.9551772113713474e-05, + "loss": 7.043, + "step": 627 + }, + { + "epoch": 0.06055930568948891, + "grad_norm": 6.818979740142822, + "learning_rate": 4.955034324371346e-05, + "loss": 7.2063, + "step": 628 + }, + { + "epoch": 0.060655737704918035, + "grad_norm": 8.562586784362793, + "learning_rate": 4.954891212051316e-05, + "loss": 7.1278, + "step": 629 + }, + { + "epoch": 0.06075216972034716, + "grad_norm": 6.470109939575195, + "learning_rate": 4.954747874424391e-05, + "loss": 7.0391, + "step": 630 + }, + { + "epoch": 0.060848601735776275, + "grad_norm": 3.921257734298706, + "learning_rate": 4.954604311503729e-05, + "loss": 7.0171, + "step": 631 + }, + { + "epoch": 0.0609450337512054, + "grad_norm": 2.852525472640991, + "learning_rate": 4.954460523302503e-05, + "loss": 6.9988, + "step": 632 + }, + { + "epoch": 0.06104146576663452, + "grad_norm": 1.8591301441192627, + "learning_rate": 4.954316509833912e-05, + "loss": 6.819, + "step": 633 + }, + { + "epoch": 0.06113789778206365, + "grad_norm": 5.201539516448975, + "learning_rate": 4.954172271111172e-05, + "loss": 6.9008, + "step": 634 + }, + { + "epoch": 0.06123432979749277, + "grad_norm": 2.408597469329834, + "learning_rate": 4.954027807147522e-05, + "loss": 6.7679, + "step": 635 + }, + { + "epoch": 0.06133076181292189, + "grad_norm": 6.819536209106445, + "learning_rate": 4.9538831179562195e-05, + "loss": 6.8591, + "step": 636 + }, + { + "epoch": 0.06142719382835101, + "grad_norm": 3.6933035850524902, + "learning_rate": 4.9537382035505455e-05, + "loss": 7.0385, + "step": 637 + }, + { + "epoch": 0.061523625843780134, + "grad_norm": 9.37769889831543, + "learning_rate": 4.953593063943799e-05, + "loss": 7.0596, + "step": 638 + }, + { + "epoch": 0.06162005785920926, + "grad_norm": 15.13674259185791, + "learning_rate": 4.9534476991493006e-05, + "loss": 7.1766, + "step": 639 + }, + { + "epoch": 0.06171648987463838, + "grad_norm": 5.392880916595459, + "learning_rate": 4.9533021091803924e-05, + "loss": 7.4442, + "step": 640 + }, + { + "epoch": 0.061812921890067506, + "grad_norm": 5.972459316253662, + "learning_rate": 4.953156294050436e-05, + "loss": 7.692, + "step": 641 + }, + { + "epoch": 0.06190935390549662, + "grad_norm": 4.576001167297363, + "learning_rate": 4.953010253772815e-05, + "loss": 7.9991, + "step": 642 + }, + { + "epoch": 0.062005785920925746, + "grad_norm": 6.793012619018555, + "learning_rate": 4.9528639883609306e-05, + "loss": 7.7519, + "step": 643 + }, + { + "epoch": 0.06210221793635487, + "grad_norm": 8.72681999206543, + "learning_rate": 4.9527174978282096e-05, + "loss": 7.7039, + "step": 644 + }, + { + "epoch": 0.062198649951783994, + "grad_norm": 4.923428535461426, + "learning_rate": 4.952570782188095e-05, + "loss": 7.8698, + "step": 645 + }, + { + "epoch": 0.06229508196721312, + "grad_norm": 13.370941162109375, + "learning_rate": 4.9524238414540526e-05, + "loss": 7.8739, + "step": 646 + }, + { + "epoch": 0.062391513982642234, + "grad_norm": 15.22468376159668, + "learning_rate": 4.952276675639569e-05, + "loss": 7.2947, + "step": 647 + }, + { + "epoch": 0.06248794599807136, + "grad_norm": 7.376005172729492, + "learning_rate": 4.9521292847581494e-05, + "loss": 7.6741, + "step": 648 + }, + { + "epoch": 0.06258437801350049, + "grad_norm": 6.242543697357178, + "learning_rate": 4.9519816688233226e-05, + "loss": 7.7753, + "step": 649 + }, + { + "epoch": 0.0626808100289296, + "grad_norm": 7.737668991088867, + "learning_rate": 4.951833827848637e-05, + "loss": 7.6523, + "step": 650 + }, + { + "epoch": 0.06277724204435872, + "grad_norm": 5.996747016906738, + "learning_rate": 4.951685761847659e-05, + "loss": 7.9101, + "step": 651 + }, + { + "epoch": 0.06287367405978785, + "grad_norm": 7.370509624481201, + "learning_rate": 4.95153747083398e-05, + "loss": 7.9236, + "step": 652 + }, + { + "epoch": 0.06297010607521697, + "grad_norm": 5.095292091369629, + "learning_rate": 4.951388954821209e-05, + "loss": 7.8655, + "step": 653 + }, + { + "epoch": 0.0630665380906461, + "grad_norm": 4.2308454513549805, + "learning_rate": 4.9512402138229776e-05, + "loss": 7.5282, + "step": 654 + }, + { + "epoch": 0.06316297010607522, + "grad_norm": 5.634434223175049, + "learning_rate": 4.951091247852936e-05, + "loss": 7.7355, + "step": 655 + }, + { + "epoch": 0.06325940212150434, + "grad_norm": 2.0305840969085693, + "learning_rate": 4.9509420569247556e-05, + "loss": 7.7829, + "step": 656 + }, + { + "epoch": 0.06335583413693346, + "grad_norm": 12.305181503295898, + "learning_rate": 4.950792641052131e-05, + "loss": 7.4244, + "step": 657 + }, + { + "epoch": 0.06345226615236259, + "grad_norm": 9.34610366821289, + "learning_rate": 4.9506430002487735e-05, + "loss": 7.6046, + "step": 658 + }, + { + "epoch": 0.06354869816779171, + "grad_norm": 2.7315409183502197, + "learning_rate": 4.950493134528418e-05, + "loss": 7.4737, + "step": 659 + }, + { + "epoch": 0.06364513018322084, + "grad_norm": 3.652231454849243, + "learning_rate": 4.9503430439048185e-05, + "loss": 7.1097, + "step": 660 + }, + { + "epoch": 0.06374156219864995, + "grad_norm": 5.455106258392334, + "learning_rate": 4.9501927283917504e-05, + "loss": 7.5432, + "step": 661 + }, + { + "epoch": 0.06383799421407907, + "grad_norm": 4.415393352508545, + "learning_rate": 4.9500421880030094e-05, + "loss": 7.6175, + "step": 662 + }, + { + "epoch": 0.06393442622950819, + "grad_norm": 3.0669233798980713, + "learning_rate": 4.9498914227524115e-05, + "loss": 7.5698, + "step": 663 + }, + { + "epoch": 0.06403085824493732, + "grad_norm": 5.130939483642578, + "learning_rate": 4.949740432653795e-05, + "loss": 7.4699, + "step": 664 + }, + { + "epoch": 0.06412729026036644, + "grad_norm": 5.638159275054932, + "learning_rate": 4.949589217721017e-05, + "loss": 7.372, + "step": 665 + }, + { + "epoch": 0.06422372227579556, + "grad_norm": 7.268194675445557, + "learning_rate": 4.9494377779679546e-05, + "loss": 7.3365, + "step": 666 + }, + { + "epoch": 0.06432015429122469, + "grad_norm": 6.2909393310546875, + "learning_rate": 4.949286113408508e-05, + "loss": 7.3314, + "step": 667 + }, + { + "epoch": 0.06441658630665381, + "grad_norm": 5.956280708312988, + "learning_rate": 4.949134224056596e-05, + "loss": 7.2436, + "step": 668 + }, + { + "epoch": 0.06451301832208294, + "grad_norm": 3.826382875442505, + "learning_rate": 4.94898210992616e-05, + "loss": 7.138, + "step": 669 + }, + { + "epoch": 0.06460945033751206, + "grad_norm": 2.5136072635650635, + "learning_rate": 4.94882977103116e-05, + "loss": 7.1262, + "step": 670 + }, + { + "epoch": 0.06470588235294118, + "grad_norm": 3.0819315910339355, + "learning_rate": 4.9486772073855776e-05, + "loss": 6.9332, + "step": 671 + }, + { + "epoch": 0.06480231436837029, + "grad_norm": 7.746872425079346, + "learning_rate": 4.948524419003415e-05, + "loss": 6.9914, + "step": 672 + }, + { + "epoch": 0.06489874638379942, + "grad_norm": 8.48938274383545, + "learning_rate": 4.948371405898695e-05, + "loss": 7.1672, + "step": 673 + }, + { + "epoch": 0.06499517839922854, + "grad_norm": 8.81027603149414, + "learning_rate": 4.94821816808546e-05, + "loss": 6.8957, + "step": 674 + }, + { + "epoch": 0.06509161041465766, + "grad_norm": 4.032145977020264, + "learning_rate": 4.9480647055777754e-05, + "loss": 7.0796, + "step": 675 + }, + { + "epoch": 0.06518804243008679, + "grad_norm": 3.8762242794036865, + "learning_rate": 4.9479110183897256e-05, + "loss": 7.0536, + "step": 676 + }, + { + "epoch": 0.06528447444551591, + "grad_norm": 3.1201794147491455, + "learning_rate": 4.947757106535415e-05, + "loss": 6.9527, + "step": 677 + }, + { + "epoch": 0.06538090646094503, + "grad_norm": 2.2303707599639893, + "learning_rate": 4.947602970028969e-05, + "loss": 6.9233, + "step": 678 + }, + { + "epoch": 0.06547733847637416, + "grad_norm": 5.997209072113037, + "learning_rate": 4.9474486088845364e-05, + "loss": 6.9487, + "step": 679 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 7.488331317901611, + "learning_rate": 4.947294023116282e-05, + "loss": 6.6928, + "step": 680 + }, + { + "epoch": 0.0656702025072324, + "grad_norm": 6.515425205230713, + "learning_rate": 4.947139212738395e-05, + "loss": 6.9016, + "step": 681 + }, + { + "epoch": 0.06576663452266153, + "grad_norm": 6.713529586791992, + "learning_rate": 4.946984177765083e-05, + "loss": 6.8589, + "step": 682 + }, + { + "epoch": 0.06586306653809064, + "grad_norm": 2.903639316558838, + "learning_rate": 4.946828918210574e-05, + "loss": 6.7776, + "step": 683 + }, + { + "epoch": 0.06595949855351976, + "grad_norm": 4.646830081939697, + "learning_rate": 4.9466734340891186e-05, + "loss": 6.774, + "step": 684 + }, + { + "epoch": 0.06605593056894889, + "grad_norm": 6.10171365737915, + "learning_rate": 4.946517725414987e-05, + "loss": 7.5231, + "step": 685 + }, + { + "epoch": 0.06615236258437801, + "grad_norm": 9.157377243041992, + "learning_rate": 4.9463617922024696e-05, + "loss": 7.8741, + "step": 686 + }, + { + "epoch": 0.06624879459980713, + "grad_norm": 7.118037223815918, + "learning_rate": 4.946205634465878e-05, + "loss": 7.8944, + "step": 687 + }, + { + "epoch": 0.06634522661523626, + "grad_norm": 5.764063358306885, + "learning_rate": 4.946049252219544e-05, + "loss": 7.8259, + "step": 688 + }, + { + "epoch": 0.06644165863066538, + "grad_norm": 5.329281330108643, + "learning_rate": 4.945892645477821e-05, + "loss": 7.7733, + "step": 689 + }, + { + "epoch": 0.0665380906460945, + "grad_norm": 6.3071393966674805, + "learning_rate": 4.9457358142550804e-05, + "loss": 7.6548, + "step": 690 + }, + { + "epoch": 0.06663452266152363, + "grad_norm": 6.205166816711426, + "learning_rate": 4.945578758565717e-05, + "loss": 7.2286, + "step": 691 + }, + { + "epoch": 0.06673095467695275, + "grad_norm": 11.707999229431152, + "learning_rate": 4.9454214784241456e-05, + "loss": 6.9581, + "step": 692 + }, + { + "epoch": 0.06682738669238188, + "grad_norm": 12.224519729614258, + "learning_rate": 4.9452639738448e-05, + "loss": 6.9502, + "step": 693 + }, + { + "epoch": 0.06692381870781099, + "grad_norm": 8.41761302947998, + "learning_rate": 4.9451062448421385e-05, + "loss": 6.9606, + "step": 694 + }, + { + "epoch": 0.06702025072324011, + "grad_norm": 3.518836736679077, + "learning_rate": 4.9449482914306336e-05, + "loss": 7.4708, + "step": 695 + }, + { + "epoch": 0.06711668273866923, + "grad_norm": 4.373415470123291, + "learning_rate": 4.944790113624784e-05, + "loss": 7.28, + "step": 696 + }, + { + "epoch": 0.06721311475409836, + "grad_norm": 5.282960891723633, + "learning_rate": 4.944631711439107e-05, + "loss": 7.0579, + "step": 697 + }, + { + "epoch": 0.06730954676952748, + "grad_norm": 4.11353063583374, + "learning_rate": 4.9444730848881397e-05, + "loss": 7.0565, + "step": 698 + }, + { + "epoch": 0.0674059787849566, + "grad_norm": 4.759942531585693, + "learning_rate": 4.9443142339864424e-05, + "loss": 7.0248, + "step": 699 + }, + { + "epoch": 0.06750241080038573, + "grad_norm": 4.746040344238281, + "learning_rate": 4.944155158748592e-05, + "loss": 7.4362, + "step": 700 + }, + { + "epoch": 0.06759884281581485, + "grad_norm": 5.311487674713135, + "learning_rate": 4.94399585918919e-05, + "loss": 7.6496, + "step": 701 + }, + { + "epoch": 0.06769527483124398, + "grad_norm": 5.704343318939209, + "learning_rate": 4.9438363353228565e-05, + "loss": 7.8198, + "step": 702 + }, + { + "epoch": 0.0677917068466731, + "grad_norm": 4.995862007141113, + "learning_rate": 4.9436765871642316e-05, + "loss": 7.5208, + "step": 703 + }, + { + "epoch": 0.06788813886210222, + "grad_norm": 6.804040431976318, + "learning_rate": 4.943516614727977e-05, + "loss": 7.8924, + "step": 704 + }, + { + "epoch": 0.06798457087753135, + "grad_norm": 4.307209014892578, + "learning_rate": 4.943356418028775e-05, + "loss": 7.512, + "step": 705 + }, + { + "epoch": 0.06808100289296046, + "grad_norm": 3.604828357696533, + "learning_rate": 4.943195997081329e-05, + "loss": 7.4761, + "step": 706 + }, + { + "epoch": 0.06817743490838958, + "grad_norm": 4.0435051918029785, + "learning_rate": 4.943035351900361e-05, + "loss": 7.47, + "step": 707 + }, + { + "epoch": 0.0682738669238187, + "grad_norm": 3.849553108215332, + "learning_rate": 4.942874482500615e-05, + "loss": 7.631, + "step": 708 + }, + { + "epoch": 0.06837029893924783, + "grad_norm": 3.954850912094116, + "learning_rate": 4.9427133888968556e-05, + "loss": 7.4623, + "step": 709 + }, + { + "epoch": 0.06846673095467695, + "grad_norm": 4.310900688171387, + "learning_rate": 4.942552071103869e-05, + "loss": 7.6989, + "step": 710 + }, + { + "epoch": 0.06856316297010608, + "grad_norm": 4.594058036804199, + "learning_rate": 4.942390529136459e-05, + "loss": 7.4932, + "step": 711 + }, + { + "epoch": 0.0686595949855352, + "grad_norm": 2.6726670265197754, + "learning_rate": 4.9422287630094515e-05, + "loss": 7.7742, + "step": 712 + }, + { + "epoch": 0.06875602700096432, + "grad_norm": 3.2594447135925293, + "learning_rate": 4.942066772737695e-05, + "loss": 7.7354, + "step": 713 + }, + { + "epoch": 0.06885245901639345, + "grad_norm": 13.27195930480957, + "learning_rate": 4.9419045583360555e-05, + "loss": 7.29, + "step": 714 + }, + { + "epoch": 0.06894889103182257, + "grad_norm": 9.563909530639648, + "learning_rate": 4.9417421198194207e-05, + "loss": 7.3602, + "step": 715 + }, + { + "epoch": 0.0690453230472517, + "grad_norm": 8.662384033203125, + "learning_rate": 4.9415794572026996e-05, + "loss": 7.2424, + "step": 716 + }, + { + "epoch": 0.0691417550626808, + "grad_norm": 7.994836807250977, + "learning_rate": 4.941416570500822e-05, + "loss": 7.5722, + "step": 717 + }, + { + "epoch": 0.06923818707810993, + "grad_norm": 4.041254043579102, + "learning_rate": 4.941253459728735e-05, + "loss": 7.4971, + "step": 718 + }, + { + "epoch": 0.06933461909353905, + "grad_norm": 7.113839626312256, + "learning_rate": 4.941090124901411e-05, + "loss": 7.5927, + "step": 719 + }, + { + "epoch": 0.06943105110896818, + "grad_norm": 9.627026557922363, + "learning_rate": 4.94092656603384e-05, + "loss": 7.4699, + "step": 720 + }, + { + "epoch": 0.0695274831243973, + "grad_norm": 3.572275400161743, + "learning_rate": 4.9407627831410324e-05, + "loss": 7.3805, + "step": 721 + }, + { + "epoch": 0.06962391513982642, + "grad_norm": 5.690505027770996, + "learning_rate": 4.9405987762380216e-05, + "loss": 7.5144, + "step": 722 + }, + { + "epoch": 0.06972034715525555, + "grad_norm": 7.6571550369262695, + "learning_rate": 4.9404345453398583e-05, + "loss": 7.4327, + "step": 723 + }, + { + "epoch": 0.06981677917068467, + "grad_norm": 5.6483683586120605, + "learning_rate": 4.940270090461616e-05, + "loss": 7.1595, + "step": 724 + }, + { + "epoch": 0.0699132111861138, + "grad_norm": 4.0333757400512695, + "learning_rate": 4.940105411618389e-05, + "loss": 6.5545, + "step": 725 + }, + { + "epoch": 0.07000964320154292, + "grad_norm": 6.417595386505127, + "learning_rate": 4.939940508825289e-05, + "loss": 6.5206, + "step": 726 + }, + { + "epoch": 0.07010607521697204, + "grad_norm": 6.790557861328125, + "learning_rate": 4.9397753820974536e-05, + "loss": 7.2866, + "step": 727 + }, + { + "epoch": 0.07020250723240115, + "grad_norm": 4.876961708068848, + "learning_rate": 4.939610031450036e-05, + "loss": 7.5335, + "step": 728 + }, + { + "epoch": 0.07029893924783027, + "grad_norm": 3.7643628120422363, + "learning_rate": 4.939444456898213e-05, + "loss": 7.3779, + "step": 729 + }, + { + "epoch": 0.0703953712632594, + "grad_norm": 8.744268417358398, + "learning_rate": 4.93927865845718e-05, + "loss": 7.4213, + "step": 730 + }, + { + "epoch": 0.07049180327868852, + "grad_norm": 7.886297702789307, + "learning_rate": 4.939112636142154e-05, + "loss": 7.3119, + "step": 731 + }, + { + "epoch": 0.07058823529411765, + "grad_norm": 6.225279331207275, + "learning_rate": 4.938946389968372e-05, + "loss": 7.6053, + "step": 732 + }, + { + "epoch": 0.07068466730954677, + "grad_norm": 5.177829265594482, + "learning_rate": 4.938779919951092e-05, + "loss": 7.312, + "step": 733 + }, + { + "epoch": 0.0707810993249759, + "grad_norm": 8.553696632385254, + "learning_rate": 4.9386132261055926e-05, + "loss": 7.3674, + "step": 734 + }, + { + "epoch": 0.07087753134040502, + "grad_norm": 6.802550792694092, + "learning_rate": 4.938446308447173e-05, + "loss": 7.326, + "step": 735 + }, + { + "epoch": 0.07097396335583414, + "grad_norm": 3.1959054470062256, + "learning_rate": 4.938279166991153e-05, + "loss": 7.3161, + "step": 736 + }, + { + "epoch": 0.07107039537126326, + "grad_norm": 3.1314220428466797, + "learning_rate": 4.938111801752871e-05, + "loss": 7.1784, + "step": 737 + }, + { + "epoch": 0.07116682738669239, + "grad_norm": 5.167366981506348, + "learning_rate": 4.937944212747689e-05, + "loss": 6.3418, + "step": 738 + }, + { + "epoch": 0.0712632594021215, + "grad_norm": 5.306033611297607, + "learning_rate": 4.9377763999909884e-05, + "loss": 7.1379, + "step": 739 + }, + { + "epoch": 0.07135969141755062, + "grad_norm": 2.1339869499206543, + "learning_rate": 4.9376083634981695e-05, + "loss": 7.4625, + "step": 740 + }, + { + "epoch": 0.07145612343297975, + "grad_norm": 2.2282066345214844, + "learning_rate": 4.9374401032846564e-05, + "loss": 7.6524, + "step": 741 + }, + { + "epoch": 0.07155255544840887, + "grad_norm": 4.495329856872559, + "learning_rate": 4.937271619365889e-05, + "loss": 7.6314, + "step": 742 + }, + { + "epoch": 0.071648987463838, + "grad_norm": 6.422760009765625, + "learning_rate": 4.937102911757333e-05, + "loss": 7.5935, + "step": 743 + }, + { + "epoch": 0.07174541947926712, + "grad_norm": 4.970848083496094, + "learning_rate": 4.9369339804744714e-05, + "loss": 7.5594, + "step": 744 + }, + { + "epoch": 0.07184185149469624, + "grad_norm": 8.992645263671875, + "learning_rate": 4.936764825532808e-05, + "loss": 6.9309, + "step": 745 + }, + { + "epoch": 0.07193828351012536, + "grad_norm": 6.361808776855469, + "learning_rate": 4.9365954469478686e-05, + "loss": 6.7198, + "step": 746 + }, + { + "epoch": 0.07203471552555449, + "grad_norm": 6.659085273742676, + "learning_rate": 4.936425844735198e-05, + "loss": 7.2225, + "step": 747 + }, + { + "epoch": 0.07213114754098361, + "grad_norm": 3.734452486038208, + "learning_rate": 4.936256018910362e-05, + "loss": 7.1276, + "step": 748 + }, + { + "epoch": 0.07222757955641274, + "grad_norm": 9.904897689819336, + "learning_rate": 4.936085969488947e-05, + "loss": 6.7826, + "step": 749 + }, + { + "epoch": 0.07232401157184185, + "grad_norm": 9.800460815429688, + "learning_rate": 4.93591569648656e-05, + "loss": 7.6478, + "step": 750 + }, + { + "epoch": 0.07242044358727097, + "grad_norm": 9.262618064880371, + "learning_rate": 4.935745199918829e-05, + "loss": 7.8173, + "step": 751 + }, + { + "epoch": 0.07251687560270009, + "grad_norm": 8.490457534790039, + "learning_rate": 4.935574479801401e-05, + "loss": 7.2758, + "step": 752 + }, + { + "epoch": 0.07261330761812922, + "grad_norm": 8.740652084350586, + "learning_rate": 4.935403536149945e-05, + "loss": 7.8856, + "step": 753 + }, + { + "epoch": 0.07270973963355834, + "grad_norm": 7.606390953063965, + "learning_rate": 4.93523236898015e-05, + "loss": 7.6106, + "step": 754 + }, + { + "epoch": 0.07280617164898746, + "grad_norm": 5.9307541847229, + "learning_rate": 4.935060978307725e-05, + "loss": 7.5106, + "step": 755 + }, + { + "epoch": 0.07290260366441659, + "grad_norm": 4.487514019012451, + "learning_rate": 4.934889364148401e-05, + "loss": 7.2993, + "step": 756 + }, + { + "epoch": 0.07299903567984571, + "grad_norm": 4.00663423538208, + "learning_rate": 4.934717526517928e-05, + "loss": 6.3625, + "step": 757 + }, + { + "epoch": 0.07309546769527484, + "grad_norm": 7.976274013519287, + "learning_rate": 4.934545465432077e-05, + "loss": 7.2839, + "step": 758 + }, + { + "epoch": 0.07319189971070396, + "grad_norm": 6.423211574554443, + "learning_rate": 4.9343731809066394e-05, + "loss": 7.4139, + "step": 759 + }, + { + "epoch": 0.07328833172613308, + "grad_norm": 4.153281211853027, + "learning_rate": 4.9342006729574285e-05, + "loss": 7.2737, + "step": 760 + }, + { + "epoch": 0.07338476374156219, + "grad_norm": 3.340407609939575, + "learning_rate": 4.9340279416002744e-05, + "loss": 7.5646, + "step": 761 + }, + { + "epoch": 0.07348119575699132, + "grad_norm": 5.25300407409668, + "learning_rate": 4.933854986851033e-05, + "loss": 7.4139, + "step": 762 + }, + { + "epoch": 0.07357762777242044, + "grad_norm": 5.223256587982178, + "learning_rate": 4.933681808725575e-05, + "loss": 7.5866, + "step": 763 + }, + { + "epoch": 0.07367405978784956, + "grad_norm": 5.307270526885986, + "learning_rate": 4.933508407239798e-05, + "loss": 7.4149, + "step": 764 + }, + { + "epoch": 0.07377049180327869, + "grad_norm": 3.3860023021698, + "learning_rate": 4.933334782409613e-05, + "loss": 7.319, + "step": 765 + }, + { + "epoch": 0.07386692381870781, + "grad_norm": 2.8617844581604004, + "learning_rate": 4.933160934250957e-05, + "loss": 7.228, + "step": 766 + }, + { + "epoch": 0.07396335583413693, + "grad_norm": 11.431314468383789, + "learning_rate": 4.932986862779785e-05, + "loss": 7.3637, + "step": 767 + }, + { + "epoch": 0.07405978784956606, + "grad_norm": 13.689541816711426, + "learning_rate": 4.932812568012073e-05, + "loss": 7.3989, + "step": 768 + }, + { + "epoch": 0.07415621986499518, + "grad_norm": 8.50946044921875, + "learning_rate": 4.932638049963818e-05, + "loss": 7.3343, + "step": 769 + }, + { + "epoch": 0.0742526518804243, + "grad_norm": 7.138329029083252, + "learning_rate": 4.932463308651038e-05, + "loss": 7.2973, + "step": 770 + }, + { + "epoch": 0.07434908389585343, + "grad_norm": 7.085503578186035, + "learning_rate": 4.932288344089768e-05, + "loss": 7.1633, + "step": 771 + }, + { + "epoch": 0.07444551591128254, + "grad_norm": 4.326226234436035, + "learning_rate": 4.932113156296068e-05, + "loss": 7.2099, + "step": 772 + }, + { + "epoch": 0.07454194792671166, + "grad_norm": 2.972689151763916, + "learning_rate": 4.931937745286015e-05, + "loss": 7.0859, + "step": 773 + }, + { + "epoch": 0.07463837994214079, + "grad_norm": 5.896681308746338, + "learning_rate": 4.931762111075711e-05, + "loss": 7.0634, + "step": 774 + }, + { + "epoch": 0.07473481195756991, + "grad_norm": 8.559419631958008, + "learning_rate": 4.931586253681272e-05, + "loss": 6.9563, + "step": 775 + }, + { + "epoch": 0.07483124397299903, + "grad_norm": 9.048019409179688, + "learning_rate": 4.931410173118839e-05, + "loss": 7.0495, + "step": 776 + }, + { + "epoch": 0.07492767598842816, + "grad_norm": 6.729249000549316, + "learning_rate": 4.9312338694045735e-05, + "loss": 7.0324, + "step": 777 + }, + { + "epoch": 0.07502410800385728, + "grad_norm": 5.936110973358154, + "learning_rate": 4.931057342554656e-05, + "loss": 6.9621, + "step": 778 + }, + { + "epoch": 0.0751205400192864, + "grad_norm": 5.614899158477783, + "learning_rate": 4.930880592585288e-05, + "loss": 7.153, + "step": 779 + }, + { + "epoch": 0.07521697203471553, + "grad_norm": 5.048844814300537, + "learning_rate": 4.930703619512691e-05, + "loss": 7.0864, + "step": 780 + }, + { + "epoch": 0.07531340405014465, + "grad_norm": 3.550630569458008, + "learning_rate": 4.930526423353107e-05, + "loss": 7.0124, + "step": 781 + }, + { + "epoch": 0.07540983606557378, + "grad_norm": 3.6218643188476562, + "learning_rate": 4.930349004122799e-05, + "loss": 7.042, + "step": 782 + }, + { + "epoch": 0.07550626808100289, + "grad_norm": 5.186278820037842, + "learning_rate": 4.930171361838052e-05, + "loss": 6.8914, + "step": 783 + }, + { + "epoch": 0.07560270009643201, + "grad_norm": 6.573472499847412, + "learning_rate": 4.9299934965151677e-05, + "loss": 6.755, + "step": 784 + }, + { + "epoch": 0.07569913211186113, + "grad_norm": 6.584750175476074, + "learning_rate": 4.929815408170472e-05, + "loss": 6.6986, + "step": 785 + }, + { + "epoch": 0.07579556412729026, + "grad_norm": 3.863327980041504, + "learning_rate": 4.929637096820307e-05, + "loss": 6.996, + "step": 786 + }, + { + "epoch": 0.07589199614271938, + "grad_norm": 4.859745979309082, + "learning_rate": 4.929458562481041e-05, + "loss": 6.9083, + "step": 787 + }, + { + "epoch": 0.0759884281581485, + "grad_norm": 4.570915699005127, + "learning_rate": 4.929279805169058e-05, + "loss": 6.7644, + "step": 788 + }, + { + "epoch": 0.07608486017357763, + "grad_norm": 2.733233690261841, + "learning_rate": 4.9291008249007644e-05, + "loss": 6.7303, + "step": 789 + }, + { + "epoch": 0.07618129218900675, + "grad_norm": 4.3721394538879395, + "learning_rate": 4.928921621692587e-05, + "loss": 6.7149, + "step": 790 + }, + { + "epoch": 0.07627772420443588, + "grad_norm": 3.061414957046509, + "learning_rate": 4.9287421955609726e-05, + "loss": 6.7077, + "step": 791 + }, + { + "epoch": 0.076374156219865, + "grad_norm": 2.720810651779175, + "learning_rate": 4.9285625465223884e-05, + "loss": 6.8566, + "step": 792 + }, + { + "epoch": 0.07647058823529412, + "grad_norm": 3.264906167984009, + "learning_rate": 4.928382674593323e-05, + "loss": 6.8335, + "step": 793 + }, + { + "epoch": 0.07656702025072323, + "grad_norm": 3.0483381748199463, + "learning_rate": 4.928202579790285e-05, + "loss": 6.716, + "step": 794 + }, + { + "epoch": 0.07666345226615236, + "grad_norm": 1.6710330247879028, + "learning_rate": 4.928022262129802e-05, + "loss": 6.7374, + "step": 795 + }, + { + "epoch": 0.07675988428158148, + "grad_norm": 8.377826690673828, + "learning_rate": 4.927841721628425e-05, + "loss": 7.571, + "step": 796 + }, + { + "epoch": 0.0768563162970106, + "grad_norm": 6.745691776275635, + "learning_rate": 4.927660958302723e-05, + "loss": 7.6146, + "step": 797 + }, + { + "epoch": 0.07695274831243973, + "grad_norm": 3.619133710861206, + "learning_rate": 4.9274799721692856e-05, + "loss": 7.5534, + "step": 798 + }, + { + "epoch": 0.07704918032786885, + "grad_norm": 9.479769706726074, + "learning_rate": 4.9272987632447245e-05, + "loss": 7.9913, + "step": 799 + }, + { + "epoch": 0.07714561234329798, + "grad_norm": 7.7876482009887695, + "learning_rate": 4.9271173315456696e-05, + "loss": 7.7043, + "step": 800 + }, + { + "epoch": 0.0772420443587271, + "grad_norm": 4.390965461730957, + "learning_rate": 4.926935677088774e-05, + "loss": 7.8624, + "step": 801 + }, + { + "epoch": 0.07733847637415622, + "grad_norm": 3.9007182121276855, + "learning_rate": 4.9267537998907097e-05, + "loss": 7.8985, + "step": 802 + }, + { + "epoch": 0.07743490838958535, + "grad_norm": 4.066274642944336, + "learning_rate": 4.9265716999681676e-05, + "loss": 7.7711, + "step": 803 + }, + { + "epoch": 0.07753134040501447, + "grad_norm": 4.391547203063965, + "learning_rate": 4.9263893773378617e-05, + "loss": 7.6212, + "step": 804 + }, + { + "epoch": 0.07762777242044358, + "grad_norm": 4.200672626495361, + "learning_rate": 4.926206832016526e-05, + "loss": 7.5882, + "step": 805 + }, + { + "epoch": 0.0777242044358727, + "grad_norm": 8.617514610290527, + "learning_rate": 4.926024064020912e-05, + "loss": 7.23, + "step": 806 + }, + { + "epoch": 0.07782063645130183, + "grad_norm": 5.9472975730896, + "learning_rate": 4.925841073367795e-05, + "loss": 7.4581, + "step": 807 + }, + { + "epoch": 0.07791706846673095, + "grad_norm": 3.531665325164795, + "learning_rate": 4.925657860073972e-05, + "loss": 7.5092, + "step": 808 + }, + { + "epoch": 0.07801350048216008, + "grad_norm": 3.080536127090454, + "learning_rate": 4.925474424156254e-05, + "loss": 7.4522, + "step": 809 + }, + { + "epoch": 0.0781099324975892, + "grad_norm": 3.2550666332244873, + "learning_rate": 4.925290765631481e-05, + "loss": 7.6126, + "step": 810 + }, + { + "epoch": 0.07820636451301832, + "grad_norm": 3.8597874641418457, + "learning_rate": 4.925106884516505e-05, + "loss": 7.5544, + "step": 811 + }, + { + "epoch": 0.07830279652844745, + "grad_norm": 2.795840263366699, + "learning_rate": 4.924922780828204e-05, + "loss": 7.6933, + "step": 812 + }, + { + "epoch": 0.07839922854387657, + "grad_norm": 9.162662506103516, + "learning_rate": 4.924738454583475e-05, + "loss": 7.3606, + "step": 813 + }, + { + "epoch": 0.0784956605593057, + "grad_norm": 2.5525949001312256, + "learning_rate": 4.924553905799235e-05, + "loss": 7.6157, + "step": 814 + }, + { + "epoch": 0.07859209257473482, + "grad_norm": 3.000890016555786, + "learning_rate": 4.9243691344924216e-05, + "loss": 7.6251, + "step": 815 + }, + { + "epoch": 0.07868852459016394, + "grad_norm": 2.8554863929748535, + "learning_rate": 4.924184140679993e-05, + "loss": 7.6573, + "step": 816 + }, + { + "epoch": 0.07878495660559305, + "grad_norm": 3.3716282844543457, + "learning_rate": 4.9239989243789275e-05, + "loss": 7.3889, + "step": 817 + }, + { + "epoch": 0.07888138862102217, + "grad_norm": 2.8017170429229736, + "learning_rate": 4.923813485606225e-05, + "loss": 7.1841, + "step": 818 + }, + { + "epoch": 0.0789778206364513, + "grad_norm": 3.7379071712493896, + "learning_rate": 4.923627824378903e-05, + "loss": 7.4635, + "step": 819 + }, + { + "epoch": 0.07907425265188042, + "grad_norm": 2.1000125408172607, + "learning_rate": 4.923441940714002e-05, + "loss": 7.2709, + "step": 820 + }, + { + "epoch": 0.07917068466730955, + "grad_norm": 2.4101638793945312, + "learning_rate": 4.923255834628584e-05, + "loss": 7.0076, + "step": 821 + }, + { + "epoch": 0.07926711668273867, + "grad_norm": 2.5575690269470215, + "learning_rate": 4.923069506139727e-05, + "loss": 7.4049, + "step": 822 + }, + { + "epoch": 0.0793635486981678, + "grad_norm": 3.7485814094543457, + "learning_rate": 4.922882955264533e-05, + "loss": 7.3946, + "step": 823 + }, + { + "epoch": 0.07945998071359692, + "grad_norm": 6.954967498779297, + "learning_rate": 4.9226961820201235e-05, + "loss": 7.4142, + "step": 824 + }, + { + "epoch": 0.07955641272902604, + "grad_norm": 5.652334213256836, + "learning_rate": 4.92250918642364e-05, + "loss": 7.2843, + "step": 825 + }, + { + "epoch": 0.07965284474445516, + "grad_norm": 4.026338577270508, + "learning_rate": 4.922321968492246e-05, + "loss": 7.4827, + "step": 826 + }, + { + "epoch": 0.07974927675988429, + "grad_norm": 2.654324769973755, + "learning_rate": 4.922134528243122e-05, + "loss": 7.2771, + "step": 827 + }, + { + "epoch": 0.0798457087753134, + "grad_norm": 7.249755382537842, + "learning_rate": 4.921946865693473e-05, + "loss": 7.2863, + "step": 828 + }, + { + "epoch": 0.07994214079074252, + "grad_norm": 8.757387161254883, + "learning_rate": 4.9217589808605215e-05, + "loss": 7.2481, + "step": 829 + }, + { + "epoch": 0.08003857280617165, + "grad_norm": 11.87730598449707, + "learning_rate": 4.9215708737615114e-05, + "loss": 7.1842, + "step": 830 + }, + { + "epoch": 0.08013500482160077, + "grad_norm": 9.620343208312988, + "learning_rate": 4.9213825444137065e-05, + "loss": 7.0855, + "step": 831 + }, + { + "epoch": 0.08023143683702989, + "grad_norm": 5.55994176864624, + "learning_rate": 4.921193992834392e-05, + "loss": 6.9839, + "step": 832 + }, + { + "epoch": 0.08032786885245902, + "grad_norm": 6.439215183258057, + "learning_rate": 4.921005219040873e-05, + "loss": 6.9527, + "step": 833 + }, + { + "epoch": 0.08042430086788814, + "grad_norm": 10.257261276245117, + "learning_rate": 4.920816223050475e-05, + "loss": 6.796, + "step": 834 + }, + { + "epoch": 0.08052073288331726, + "grad_norm": 13.278034210205078, + "learning_rate": 4.920627004880544e-05, + "loss": 6.7764, + "step": 835 + }, + { + "epoch": 0.08061716489874639, + "grad_norm": 15.246657371520996, + "learning_rate": 4.9204375645484443e-05, + "loss": 6.7034, + "step": 836 + }, + { + "epoch": 0.08071359691417551, + "grad_norm": 14.736125946044922, + "learning_rate": 4.920247902071565e-05, + "loss": 6.9557, + "step": 837 + }, + { + "epoch": 0.08081002892960464, + "grad_norm": 9.808652877807617, + "learning_rate": 4.9200580174673125e-05, + "loss": 6.9092, + "step": 838 + }, + { + "epoch": 0.08090646094503375, + "grad_norm": 5.891807556152344, + "learning_rate": 4.9198679107531134e-05, + "loss": 7.0261, + "step": 839 + }, + { + "epoch": 0.08100289296046287, + "grad_norm": 5.659506797790527, + "learning_rate": 4.919677581946416e-05, + "loss": 7.0361, + "step": 840 + }, + { + "epoch": 0.08109932497589199, + "grad_norm": 2.5914769172668457, + "learning_rate": 4.919487031064687e-05, + "loss": 6.7119, + "step": 841 + }, + { + "epoch": 0.08119575699132112, + "grad_norm": 3.230952024459839, + "learning_rate": 4.919296258125418e-05, + "loss": 6.9669, + "step": 842 + }, + { + "epoch": 0.08129218900675024, + "grad_norm": 2.871159553527832, + "learning_rate": 4.919105263146115e-05, + "loss": 6.9227, + "step": 843 + }, + { + "epoch": 0.08138862102217936, + "grad_norm": 3.229419231414795, + "learning_rate": 4.918914046144308e-05, + "loss": 6.8877, + "step": 844 + }, + { + "epoch": 0.08148505303760849, + "grad_norm": 4.117875576019287, + "learning_rate": 4.918722607137548e-05, + "loss": 6.7793, + "step": 845 + }, + { + "epoch": 0.08158148505303761, + "grad_norm": 3.3827381134033203, + "learning_rate": 4.9185309461434035e-05, + "loss": 6.7452, + "step": 846 + }, + { + "epoch": 0.08167791706846673, + "grad_norm": 2.6461453437805176, + "learning_rate": 4.918339063179466e-05, + "loss": 6.97, + "step": 847 + }, + { + "epoch": 0.08177434908389586, + "grad_norm": 2.3088161945343018, + "learning_rate": 4.9181469582633454e-05, + "loss": 6.7641, + "step": 848 + }, + { + "epoch": 0.08187078109932498, + "grad_norm": 2.081615924835205, + "learning_rate": 4.9179546314126734e-05, + "loss": 6.876, + "step": 849 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 3.579237699508667, + "learning_rate": 4.917762082645101e-05, + "loss": 6.8127, + "step": 850 + }, + { + "epoch": 0.08206364513018322, + "grad_norm": 1.8214225769042969, + "learning_rate": 4.9175693119783013e-05, + "loss": 6.8722, + "step": 851 + }, + { + "epoch": 0.08216007714561234, + "grad_norm": 4.838691711425781, + "learning_rate": 4.9173763194299647e-05, + "loss": 6.8131, + "step": 852 + }, + { + "epoch": 0.08225650916104146, + "grad_norm": 3.9453041553497314, + "learning_rate": 4.917183105017805e-05, + "loss": 6.7385, + "step": 853 + }, + { + "epoch": 0.08235294117647059, + "grad_norm": 12.947820663452148, + "learning_rate": 4.916989668759556e-05, + "loss": 8.1421, + "step": 854 + }, + { + "epoch": 0.08244937319189971, + "grad_norm": 8.122206687927246, + "learning_rate": 4.916796010672969e-05, + "loss": 6.9068, + "step": 855 + }, + { + "epoch": 0.08254580520732883, + "grad_norm": 10.510831832885742, + "learning_rate": 4.9166021307758194e-05, + "loss": 7.7585, + "step": 856 + }, + { + "epoch": 0.08264223722275796, + "grad_norm": 8.267707824707031, + "learning_rate": 4.9164080290858996e-05, + "loss": 7.7029, + "step": 857 + }, + { + "epoch": 0.08273866923818708, + "grad_norm": 6.0293426513671875, + "learning_rate": 4.9162137056210254e-05, + "loss": 7.8644, + "step": 858 + }, + { + "epoch": 0.0828351012536162, + "grad_norm": 3.474806785583496, + "learning_rate": 4.9160191603990314e-05, + "loss": 7.5333, + "step": 859 + }, + { + "epoch": 0.08293153326904533, + "grad_norm": 4.274496078491211, + "learning_rate": 4.915824393437772e-05, + "loss": 7.3149, + "step": 860 + }, + { + "epoch": 0.08302796528447444, + "grad_norm": 4.818944931030273, + "learning_rate": 4.915629404755124e-05, + "loss": 7.5138, + "step": 861 + }, + { + "epoch": 0.08312439729990356, + "grad_norm": 3.358581781387329, + "learning_rate": 4.915434194368982e-05, + "loss": 6.8901, + "step": 862 + }, + { + "epoch": 0.08322082931533269, + "grad_norm": 3.9279487133026123, + "learning_rate": 4.9152387622972626e-05, + "loss": 7.5669, + "step": 863 + }, + { + "epoch": 0.08331726133076181, + "grad_norm": 4.42994499206543, + "learning_rate": 4.9150431085579026e-05, + "loss": 7.464, + "step": 864 + }, + { + "epoch": 0.08341369334619093, + "grad_norm": 2.439588785171509, + "learning_rate": 4.914847233168858e-05, + "loss": 7.2092, + "step": 865 + }, + { + "epoch": 0.08351012536162006, + "grad_norm": 2.5686087608337402, + "learning_rate": 4.914651136148106e-05, + "loss": 7.4372, + "step": 866 + }, + { + "epoch": 0.08360655737704918, + "grad_norm": 2.572404146194458, + "learning_rate": 4.9144548175136464e-05, + "loss": 7.3901, + "step": 867 + }, + { + "epoch": 0.0837029893924783, + "grad_norm": 5.010617256164551, + "learning_rate": 4.914258277283494e-05, + "loss": 7.5298, + "step": 868 + }, + { + "epoch": 0.08379942140790743, + "grad_norm": 5.425722599029541, + "learning_rate": 4.9140615154756886e-05, + "loss": 7.431, + "step": 869 + }, + { + "epoch": 0.08389585342333655, + "grad_norm": 2.3538410663604736, + "learning_rate": 4.913864532108289e-05, + "loss": 7.6036, + "step": 870 + }, + { + "epoch": 0.08399228543876568, + "grad_norm": 5.305138111114502, + "learning_rate": 4.9136673271993724e-05, + "loss": 7.2639, + "step": 871 + }, + { + "epoch": 0.08408871745419479, + "grad_norm": 5.993468761444092, + "learning_rate": 4.9134699007670405e-05, + "loss": 7.3027, + "step": 872 + }, + { + "epoch": 0.08418514946962391, + "grad_norm": 8.992399215698242, + "learning_rate": 4.913272252829411e-05, + "loss": 7.177, + "step": 873 + }, + { + "epoch": 0.08428158148505303, + "grad_norm": 7.828742504119873, + "learning_rate": 4.913074383404625e-05, + "loss": 7.0636, + "step": 874 + }, + { + "epoch": 0.08437801350048216, + "grad_norm": 7.442550182342529, + "learning_rate": 4.912876292510842e-05, + "loss": 6.9675, + "step": 875 + }, + { + "epoch": 0.08447444551591128, + "grad_norm": 6.914331436157227, + "learning_rate": 4.912677980166242e-05, + "loss": 7.0092, + "step": 876 + }, + { + "epoch": 0.0845708775313404, + "grad_norm": 8.898406028747559, + "learning_rate": 4.912479446389027e-05, + "loss": 6.8714, + "step": 877 + }, + { + "epoch": 0.08466730954676953, + "grad_norm": 5.307892799377441, + "learning_rate": 4.912280691197417e-05, + "loss": 6.953, + "step": 878 + }, + { + "epoch": 0.08476374156219865, + "grad_norm": 4.582098484039307, + "learning_rate": 4.912081714609655e-05, + "loss": 6.8384, + "step": 879 + }, + { + "epoch": 0.08486017357762778, + "grad_norm": 5.5378007888793945, + "learning_rate": 4.911882516644002e-05, + "loss": 6.8021, + "step": 880 + }, + { + "epoch": 0.0849566055930569, + "grad_norm": 5.379184722900391, + "learning_rate": 4.91168309731874e-05, + "loss": 6.9304, + "step": 881 + }, + { + "epoch": 0.08505303760848602, + "grad_norm": 7.602871894836426, + "learning_rate": 4.9114834566521716e-05, + "loss": 7.3961, + "step": 882 + }, + { + "epoch": 0.08514946962391513, + "grad_norm": 8.531844139099121, + "learning_rate": 4.911283594662619e-05, + "loss": 7.8337, + "step": 883 + }, + { + "epoch": 0.08524590163934426, + "grad_norm": 7.923476696014404, + "learning_rate": 4.911083511368427e-05, + "loss": 7.7383, + "step": 884 + }, + { + "epoch": 0.08534233365477338, + "grad_norm": 6.480445861816406, + "learning_rate": 4.9108832067879574e-05, + "loss": 7.5976, + "step": 885 + }, + { + "epoch": 0.0854387656702025, + "grad_norm": 5.817816734313965, + "learning_rate": 4.910682680939594e-05, + "loss": 7.3891, + "step": 886 + }, + { + "epoch": 0.08553519768563163, + "grad_norm": 3.5624780654907227, + "learning_rate": 4.910481933841742e-05, + "loss": 7.4713, + "step": 887 + }, + { + "epoch": 0.08563162970106075, + "grad_norm": 3.157945156097412, + "learning_rate": 4.910280965512824e-05, + "loss": 7.3864, + "step": 888 + }, + { + "epoch": 0.08572806171648988, + "grad_norm": 2.9716379642486572, + "learning_rate": 4.910079775971286e-05, + "loss": 7.2761, + "step": 889 + }, + { + "epoch": 0.085824493731919, + "grad_norm": 2.8703598976135254, + "learning_rate": 4.909878365235593e-05, + "loss": 6.9676, + "step": 890 + }, + { + "epoch": 0.08592092574734812, + "grad_norm": 3.6541965007781982, + "learning_rate": 4.909676733324229e-05, + "loss": 7.3393, + "step": 891 + }, + { + "epoch": 0.08601735776277725, + "grad_norm": 3.089522361755371, + "learning_rate": 4.9094748802557e-05, + "loss": 7.3113, + "step": 892 + }, + { + "epoch": 0.08611378977820637, + "grad_norm": 3.165750026702881, + "learning_rate": 4.9092728060485323e-05, + "loss": 7.3835, + "step": 893 + }, + { + "epoch": 0.08621022179363548, + "grad_norm": 3.276035785675049, + "learning_rate": 4.909070510721273e-05, + "loss": 7.2858, + "step": 894 + }, + { + "epoch": 0.0863066538090646, + "grad_norm": 3.9920103549957275, + "learning_rate": 4.908867994292486e-05, + "loss": 7.2561, + "step": 895 + }, + { + "epoch": 0.08640308582449373, + "grad_norm": 3.539341449737549, + "learning_rate": 4.90866525678076e-05, + "loss": 7.25, + "step": 896 + }, + { + "epoch": 0.08649951783992285, + "grad_norm": 7.246125221252441, + "learning_rate": 4.9084622982047005e-05, + "loss": 7.2321, + "step": 897 + }, + { + "epoch": 0.08659594985535198, + "grad_norm": 8.686210632324219, + "learning_rate": 4.9082591185829364e-05, + "loss": 7.1409, + "step": 898 + }, + { + "epoch": 0.0866923818707811, + "grad_norm": 7.6181960105896, + "learning_rate": 4.908055717934114e-05, + "loss": 7.0542, + "step": 899 + }, + { + "epoch": 0.08678881388621022, + "grad_norm": 9.734560012817383, + "learning_rate": 4.907852096276902e-05, + "loss": 7.0718, + "step": 900 + }, + { + "epoch": 0.08688524590163935, + "grad_norm": 8.127421379089355, + "learning_rate": 4.907648253629988e-05, + "loss": 7.0522, + "step": 901 + }, + { + "epoch": 0.08698167791706847, + "grad_norm": 4.576957702636719, + "learning_rate": 4.90744419001208e-05, + "loss": 6.9775, + "step": 902 + }, + { + "epoch": 0.0870781099324976, + "grad_norm": 3.869992733001709, + "learning_rate": 4.907239905441908e-05, + "loss": 6.9229, + "step": 903 + }, + { + "epoch": 0.08717454194792672, + "grad_norm": 3.4373090267181396, + "learning_rate": 4.9070353999382206e-05, + "loss": 6.9158, + "step": 904 + }, + { + "epoch": 0.08727097396335583, + "grad_norm": 3.4985053539276123, + "learning_rate": 4.906830673519787e-05, + "loss": 6.8421, + "step": 905 + }, + { + "epoch": 0.08736740597878495, + "grad_norm": 5.103436470031738, + "learning_rate": 4.906625726205395e-05, + "loss": 6.9213, + "step": 906 + }, + { + "epoch": 0.08746383799421407, + "grad_norm": 3.8501391410827637, + "learning_rate": 4.9064205580138575e-05, + "loss": 6.7671, + "step": 907 + }, + { + "epoch": 0.0875602700096432, + "grad_norm": 5.4397993087768555, + "learning_rate": 4.906215168964002e-05, + "loss": 6.6495, + "step": 908 + }, + { + "epoch": 0.08765670202507232, + "grad_norm": 3.750786304473877, + "learning_rate": 4.90600955907468e-05, + "loss": 6.5915, + "step": 909 + }, + { + "epoch": 0.08775313404050145, + "grad_norm": 2.5191028118133545, + "learning_rate": 4.905803728364763e-05, + "loss": 6.7076, + "step": 910 + }, + { + "epoch": 0.08784956605593057, + "grad_norm": 2.4035069942474365, + "learning_rate": 4.90559767685314e-05, + "loss": 6.6057, + "step": 911 + }, + { + "epoch": 0.0879459980713597, + "grad_norm": 5.749815464019775, + "learning_rate": 4.905391404558723e-05, + "loss": 6.503, + "step": 912 + }, + { + "epoch": 0.08804243008678882, + "grad_norm": 3.421557664871216, + "learning_rate": 4.905184911500443e-05, + "loss": 6.4438, + "step": 913 + }, + { + "epoch": 0.08813886210221794, + "grad_norm": 5.333877086639404, + "learning_rate": 4.904978197697253e-05, + "loss": 6.7879, + "step": 914 + }, + { + "epoch": 0.08823529411764706, + "grad_norm": 8.022939682006836, + "learning_rate": 4.9047712631681245e-05, + "loss": 6.848, + "step": 915 + }, + { + "epoch": 0.08833172613307617, + "grad_norm": 4.414430141448975, + "learning_rate": 4.9045641079320484e-05, + "loss": 6.7551, + "step": 916 + }, + { + "epoch": 0.0884281581485053, + "grad_norm": 5.227283000946045, + "learning_rate": 4.904356732008038e-05, + "loss": 6.7243, + "step": 917 + }, + { + "epoch": 0.08852459016393442, + "grad_norm": 6.4570417404174805, + "learning_rate": 4.904149135415126e-05, + "loss": 6.9567, + "step": 918 + }, + { + "epoch": 0.08862102217936355, + "grad_norm": 1.8320252895355225, + "learning_rate": 4.9039413181723654e-05, + "loss": 6.7157, + "step": 919 + }, + { + "epoch": 0.08871745419479267, + "grad_norm": 2.6197421550750732, + "learning_rate": 4.90373328029883e-05, + "loss": 6.7625, + "step": 920 + }, + { + "epoch": 0.08881388621022179, + "grad_norm": 1.3263190984725952, + "learning_rate": 4.903525021813612e-05, + "loss": 6.7273, + "step": 921 + }, + { + "epoch": 0.08891031822565092, + "grad_norm": 3.2155401706695557, + "learning_rate": 4.903316542735825e-05, + "loss": 6.6581, + "step": 922 + }, + { + "epoch": 0.08900675024108004, + "grad_norm": 1.5424723625183105, + "learning_rate": 4.903107843084605e-05, + "loss": 6.863, + "step": 923 + }, + { + "epoch": 0.08910318225650916, + "grad_norm": 4.260273456573486, + "learning_rate": 4.9028989228791044e-05, + "loss": 6.6537, + "step": 924 + }, + { + "epoch": 0.08919961427193829, + "grad_norm": 2.2216720581054688, + "learning_rate": 4.902689782138499e-05, + "loss": 6.7304, + "step": 925 + }, + { + "epoch": 0.08929604628736741, + "grad_norm": 3.9663515090942383, + "learning_rate": 4.9024804208819815e-05, + "loss": 6.925, + "step": 926 + }, + { + "epoch": 0.08939247830279654, + "grad_norm": 4.751559734344482, + "learning_rate": 4.902270839128768e-05, + "loss": 6.7646, + "step": 927 + }, + { + "epoch": 0.08948891031822565, + "grad_norm": 7.368974208831787, + "learning_rate": 4.902061036898094e-05, + "loss": 7.6453, + "step": 928 + }, + { + "epoch": 0.08958534233365477, + "grad_norm": 5.922390937805176, + "learning_rate": 4.901851014209215e-05, + "loss": 7.5751, + "step": 929 + }, + { + "epoch": 0.08968177434908389, + "grad_norm": 5.281038284301758, + "learning_rate": 4.901640771081405e-05, + "loss": 7.5031, + "step": 930 + }, + { + "epoch": 0.08977820636451302, + "grad_norm": 8.530941009521484, + "learning_rate": 4.901430307533962e-05, + "loss": 6.7726, + "step": 931 + }, + { + "epoch": 0.08987463837994214, + "grad_norm": 6.0860676765441895, + "learning_rate": 4.9012196235862e-05, + "loss": 7.3711, + "step": 932 + }, + { + "epoch": 0.08997107039537126, + "grad_norm": 3.3353068828582764, + "learning_rate": 4.901008719257457e-05, + "loss": 7.3084, + "step": 933 + }, + { + "epoch": 0.09006750241080039, + "grad_norm": 4.636171340942383, + "learning_rate": 4.900797594567089e-05, + "loss": 7.4116, + "step": 934 + }, + { + "epoch": 0.09016393442622951, + "grad_norm": 5.08152961730957, + "learning_rate": 4.9005862495344726e-05, + "loss": 7.3055, + "step": 935 + }, + { + "epoch": 0.09026036644165863, + "grad_norm": 4.410223484039307, + "learning_rate": 4.900374684179004e-05, + "loss": 7.5202, + "step": 936 + }, + { + "epoch": 0.09035679845708776, + "grad_norm": 3.2805428504943848, + "learning_rate": 4.900162898520103e-05, + "loss": 7.3838, + "step": 937 + }, + { + "epoch": 0.09045323047251688, + "grad_norm": 3.128434419631958, + "learning_rate": 4.8999508925772045e-05, + "loss": 7.3103, + "step": 938 + }, + { + "epoch": 0.09054966248794599, + "grad_norm": 2.5947282314300537, + "learning_rate": 4.899738666369766e-05, + "loss": 7.43, + "step": 939 + }, + { + "epoch": 0.09064609450337512, + "grad_norm": 6.40971040725708, + "learning_rate": 4.899526219917267e-05, + "loss": 7.2655, + "step": 940 + }, + { + "epoch": 0.09074252651880424, + "grad_norm": 6.421778202056885, + "learning_rate": 4.8993135532392055e-05, + "loss": 7.4082, + "step": 941 + }, + { + "epoch": 0.09083895853423336, + "grad_norm": 3.2538418769836426, + "learning_rate": 4.899100666355099e-05, + "loss": 7.1695, + "step": 942 + }, + { + "epoch": 0.09093539054966249, + "grad_norm": 5.804473400115967, + "learning_rate": 4.898887559284485e-05, + "loss": 7.4757, + "step": 943 + }, + { + "epoch": 0.09103182256509161, + "grad_norm": 2.611293077468872, + "learning_rate": 4.898674232046924e-05, + "loss": 7.2819, + "step": 944 + }, + { + "epoch": 0.09112825458052073, + "grad_norm": 13.278595924377441, + "learning_rate": 4.898460684661995e-05, + "loss": 7.1618, + "step": 945 + }, + { + "epoch": 0.09122468659594986, + "grad_norm": 15.751297950744629, + "learning_rate": 4.898246917149295e-05, + "loss": 7.0577, + "step": 946 + }, + { + "epoch": 0.09132111861137898, + "grad_norm": 12.126399040222168, + "learning_rate": 4.8980329295284456e-05, + "loss": 7.1282, + "step": 947 + }, + { + "epoch": 0.0914175506268081, + "grad_norm": 9.761650085449219, + "learning_rate": 4.897818721819086e-05, + "loss": 7.0218, + "step": 948 + }, + { + "epoch": 0.09151398264223723, + "grad_norm": 4.634538650512695, + "learning_rate": 4.897604294040874e-05, + "loss": 7.02, + "step": 949 + }, + { + "epoch": 0.09161041465766634, + "grad_norm": 4.674229621887207, + "learning_rate": 4.897389646213492e-05, + "loss": 6.9693, + "step": 950 + }, + { + "epoch": 0.09170684667309546, + "grad_norm": 4.116694450378418, + "learning_rate": 4.897174778356638e-05, + "loss": 6.92, + "step": 951 + }, + { + "epoch": 0.09180327868852459, + "grad_norm": 5.379880428314209, + "learning_rate": 4.8969596904900336e-05, + "loss": 6.9123, + "step": 952 + }, + { + "epoch": 0.09189971070395371, + "grad_norm": 5.709438323974609, + "learning_rate": 4.896744382633419e-05, + "loss": 6.7938, + "step": 953 + }, + { + "epoch": 0.09199614271938283, + "grad_norm": 4.640354633331299, + "learning_rate": 4.896528854806556e-05, + "loss": 7.0171, + "step": 954 + }, + { + "epoch": 0.09209257473481196, + "grad_norm": 10.97243881225586, + "learning_rate": 4.896313107029223e-05, + "loss": 7.9671, + "step": 955 + }, + { + "epoch": 0.09218900675024108, + "grad_norm": 10.724348068237305, + "learning_rate": 4.896097139321223e-05, + "loss": 7.8078, + "step": 956 + }, + { + "epoch": 0.0922854387656702, + "grad_norm": 11.635737419128418, + "learning_rate": 4.895880951702376e-05, + "loss": 8.1518, + "step": 957 + }, + { + "epoch": 0.09238187078109933, + "grad_norm": 11.51595401763916, + "learning_rate": 4.895664544192525e-05, + "loss": 7.5525, + "step": 958 + }, + { + "epoch": 0.09247830279652845, + "grad_norm": 7.375553131103516, + "learning_rate": 4.89544791681153e-05, + "loss": 7.3908, + "step": 959 + }, + { + "epoch": 0.09257473481195758, + "grad_norm": 3.262079954147339, + "learning_rate": 4.895231069579274e-05, + "loss": 7.4304, + "step": 960 + }, + { + "epoch": 0.09267116682738669, + "grad_norm": 3.930424690246582, + "learning_rate": 4.895014002515658e-05, + "loss": 7.2696, + "step": 961 + }, + { + "epoch": 0.09276759884281581, + "grad_norm": 3.3061673641204834, + "learning_rate": 4.894796715640605e-05, + "loss": 7.3616, + "step": 962 + }, + { + "epoch": 0.09286403085824493, + "grad_norm": 2.6337170600891113, + "learning_rate": 4.894579208974057e-05, + "loss": 7.3685, + "step": 963 + }, + { + "epoch": 0.09296046287367406, + "grad_norm": 3.875525712966919, + "learning_rate": 4.894361482535976e-05, + "loss": 7.2635, + "step": 964 + }, + { + "epoch": 0.09305689488910318, + "grad_norm": 3.5343809127807617, + "learning_rate": 4.894143536346346e-05, + "loss": 7.1196, + "step": 965 + }, + { + "epoch": 0.0931533269045323, + "grad_norm": 3.026662826538086, + "learning_rate": 4.893925370425168e-05, + "loss": 7.235, + "step": 966 + }, + { + "epoch": 0.09324975891996143, + "grad_norm": 2.4557604789733887, + "learning_rate": 4.893706984792467e-05, + "loss": 7.3322, + "step": 967 + }, + { + "epoch": 0.09334619093539055, + "grad_norm": 3.4649252891540527, + "learning_rate": 4.893488379468284e-05, + "loss": 7.2855, + "step": 968 + }, + { + "epoch": 0.09344262295081968, + "grad_norm": 4.8894171714782715, + "learning_rate": 4.8932695544726834e-05, + "loss": 7.2391, + "step": 969 + }, + { + "epoch": 0.0935390549662488, + "grad_norm": 10.106011390686035, + "learning_rate": 4.89305050982575e-05, + "loss": 7.1379, + "step": 970 + }, + { + "epoch": 0.09363548698167792, + "grad_norm": 11.489299774169922, + "learning_rate": 4.892831245547585e-05, + "loss": 7.1194, + "step": 971 + }, + { + "epoch": 0.09373191899710703, + "grad_norm": 11.574524879455566, + "learning_rate": 4.892611761658313e-05, + "loss": 7.0489, + "step": 972 + }, + { + "epoch": 0.09382835101253616, + "grad_norm": 7.859764575958252, + "learning_rate": 4.89239205817808e-05, + "loss": 6.9817, + "step": 973 + }, + { + "epoch": 0.09392478302796528, + "grad_norm": 5.581941604614258, + "learning_rate": 4.892172135127048e-05, + "loss": 6.8593, + "step": 974 + }, + { + "epoch": 0.0940212150433944, + "grad_norm": 7.146266937255859, + "learning_rate": 4.891951992525401e-05, + "loss": 7.013, + "step": 975 + }, + { + "epoch": 0.09411764705882353, + "grad_norm": 8.746365547180176, + "learning_rate": 4.8917316303933443e-05, + "loss": 7.549, + "step": 976 + }, + { + "epoch": 0.09421407907425265, + "grad_norm": 5.2318196296691895, + "learning_rate": 4.891511048751102e-05, + "loss": 6.987, + "step": 977 + }, + { + "epoch": 0.09431051108968178, + "grad_norm": 4.396707534790039, + "learning_rate": 4.89129024761892e-05, + "loss": 7.1157, + "step": 978 + }, + { + "epoch": 0.0944069431051109, + "grad_norm": 7.755749702453613, + "learning_rate": 4.8910692270170615e-05, + "loss": 6.8046, + "step": 979 + }, + { + "epoch": 0.09450337512054002, + "grad_norm": 6.554897308349609, + "learning_rate": 4.890847986965813e-05, + "loss": 6.8316, + "step": 980 + }, + { + "epoch": 0.09459980713596915, + "grad_norm": 5.376605033874512, + "learning_rate": 4.890626527485478e-05, + "loss": 6.7655, + "step": 981 + }, + { + "epoch": 0.09469623915139827, + "grad_norm": 3.022986888885498, + "learning_rate": 4.890404848596383e-05, + "loss": 6.7357, + "step": 982 + }, + { + "epoch": 0.09479267116682738, + "grad_norm": 3.4674289226531982, + "learning_rate": 4.890182950318874e-05, + "loss": 6.6793, + "step": 983 + }, + { + "epoch": 0.0948891031822565, + "grad_norm": 4.14244270324707, + "learning_rate": 4.889960832673315e-05, + "loss": 6.7004, + "step": 984 + }, + { + "epoch": 0.09498553519768563, + "grad_norm": 2.957387685775757, + "learning_rate": 4.8897384956800916e-05, + "loss": 6.6714, + "step": 985 + }, + { + "epoch": 0.09508196721311475, + "grad_norm": 2.6519951820373535, + "learning_rate": 4.889515939359611e-05, + "loss": 6.5618, + "step": 986 + }, + { + "epoch": 0.09517839922854388, + "grad_norm": 1.6209803819656372, + "learning_rate": 4.889293163732299e-05, + "loss": 6.6497, + "step": 987 + }, + { + "epoch": 0.095274831243973, + "grad_norm": 2.139385223388672, + "learning_rate": 4.8890701688186005e-05, + "loss": 6.6079, + "step": 988 + }, + { + "epoch": 0.09537126325940212, + "grad_norm": 1.7418864965438843, + "learning_rate": 4.8888469546389824e-05, + "loss": 6.6297, + "step": 989 + }, + { + "epoch": 0.09546769527483125, + "grad_norm": 1.6686296463012695, + "learning_rate": 4.8886235212139316e-05, + "loss": 6.679, + "step": 990 + }, + { + "epoch": 0.09556412729026037, + "grad_norm": 2.689122438430786, + "learning_rate": 4.888399868563953e-05, + "loss": 6.7101, + "step": 991 + }, + { + "epoch": 0.0956605593056895, + "grad_norm": 1.8720098733901978, + "learning_rate": 4.888175996709575e-05, + "loss": 6.8153, + "step": 992 + }, + { + "epoch": 0.09575699132111862, + "grad_norm": 2.889772653579712, + "learning_rate": 4.887951905671343e-05, + "loss": 6.7139, + "step": 993 + }, + { + "epoch": 0.09585342333654773, + "grad_norm": 3.9933905601501465, + "learning_rate": 4.887727595469824e-05, + "loss": 6.6933, + "step": 994 + }, + { + "epoch": 0.09594985535197685, + "grad_norm": 2.306525230407715, + "learning_rate": 4.8875030661256054e-05, + "loss": 6.6891, + "step": 995 + }, + { + "epoch": 0.09604628736740597, + "grad_norm": 2.4756250381469727, + "learning_rate": 4.887278317659294e-05, + "loss": 6.6358, + "step": 996 + }, + { + "epoch": 0.0961427193828351, + "grad_norm": 2.5154261589050293, + "learning_rate": 4.887053350091517e-05, + "loss": 6.7182, + "step": 997 + }, + { + "epoch": 0.09623915139826422, + "grad_norm": 3.052358627319336, + "learning_rate": 4.886828163442921e-05, + "loss": 6.6661, + "step": 998 + }, + { + "epoch": 0.09633558341369335, + "grad_norm": 4.052638053894043, + "learning_rate": 4.8866027577341746e-05, + "loss": 6.605, + "step": 999 + }, + { + "epoch": 0.09643201542912247, + "grad_norm": 11.181468963623047, + "learning_rate": 4.886377132985963e-05, + "loss": 7.8308, + "step": 1000 + }, + { + "epoch": 0.0965284474445516, + "grad_norm": 9.163455963134766, + "learning_rate": 4.886151289218997e-05, + "loss": 7.7308, + "step": 1001 + }, + { + "epoch": 0.09662487945998072, + "grad_norm": 6.24665641784668, + "learning_rate": 4.8859252264540014e-05, + "loss": 7.9668, + "step": 1002 + }, + { + "epoch": 0.09672131147540984, + "grad_norm": 3.5593485832214355, + "learning_rate": 4.885698944711726e-05, + "loss": 7.5112, + "step": 1003 + }, + { + "epoch": 0.09681774349083896, + "grad_norm": 3.9343745708465576, + "learning_rate": 4.8854724440129376e-05, + "loss": 7.1927, + "step": 1004 + }, + { + "epoch": 0.09691417550626807, + "grad_norm": 4.012549877166748, + "learning_rate": 4.885245724378423e-05, + "loss": 7.5495, + "step": 1005 + }, + { + "epoch": 0.0970106075216972, + "grad_norm": 4.871736526489258, + "learning_rate": 4.8850187858289934e-05, + "loss": 7.2404, + "step": 1006 + }, + { + "epoch": 0.09710703953712632, + "grad_norm": 4.982372283935547, + "learning_rate": 4.884791628385474e-05, + "loss": 7.2065, + "step": 1007 + }, + { + "epoch": 0.09720347155255545, + "grad_norm": 6.436575412750244, + "learning_rate": 4.8845642520687145e-05, + "loss": 7.3014, + "step": 1008 + }, + { + "epoch": 0.09729990356798457, + "grad_norm": 5.5630717277526855, + "learning_rate": 4.884336656899583e-05, + "loss": 7.4551, + "step": 1009 + }, + { + "epoch": 0.09739633558341369, + "grad_norm": 3.7654545307159424, + "learning_rate": 4.884108842898968e-05, + "loss": 7.3886, + "step": 1010 + }, + { + "epoch": 0.09749276759884282, + "grad_norm": 4.591469764709473, + "learning_rate": 4.883880810087777e-05, + "loss": 7.5125, + "step": 1011 + }, + { + "epoch": 0.09758919961427194, + "grad_norm": 4.305830955505371, + "learning_rate": 4.88365255848694e-05, + "loss": 7.4074, + "step": 1012 + }, + { + "epoch": 0.09768563162970106, + "grad_norm": 3.4926254749298096, + "learning_rate": 4.8834240881174044e-05, + "loss": 7.4663, + "step": 1013 + }, + { + "epoch": 0.09778206364513019, + "grad_norm": 6.08200740814209, + "learning_rate": 4.88319539900014e-05, + "loss": 7.138, + "step": 1014 + }, + { + "epoch": 0.09787849566055931, + "grad_norm": 8.299041748046875, + "learning_rate": 4.8829664911561346e-05, + "loss": 6.8026, + "step": 1015 + }, + { + "epoch": 0.09797492767598842, + "grad_norm": 7.0349626541137695, + "learning_rate": 4.882737364606398e-05, + "loss": 6.8618, + "step": 1016 + }, + { + "epoch": 0.09807135969141755, + "grad_norm": 7.68587064743042, + "learning_rate": 4.8825080193719586e-05, + "loss": 6.6224, + "step": 1017 + }, + { + "epoch": 0.09816779170684667, + "grad_norm": 4.744743347167969, + "learning_rate": 4.882278455473866e-05, + "loss": 6.6513, + "step": 1018 + }, + { + "epoch": 0.09826422372227579, + "grad_norm": 4.803104877471924, + "learning_rate": 4.8820486729331884e-05, + "loss": 6.7674, + "step": 1019 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 5.158519268035889, + "learning_rate": 4.8818186717710154e-05, + "loss": 6.7618, + "step": 1020 + }, + { + "epoch": 0.09845708775313404, + "grad_norm": 5.109723091125488, + "learning_rate": 4.881588452008456e-05, + "loss": 6.872, + "step": 1021 + }, + { + "epoch": 0.09855351976856316, + "grad_norm": 3.8843555450439453, + "learning_rate": 4.88135801366664e-05, + "loss": 6.6342, + "step": 1022 + }, + { + "epoch": 0.09864995178399229, + "grad_norm": 4.4330830574035645, + "learning_rate": 4.881127356766717e-05, + "loss": 6.5234, + "step": 1023 + }, + { + "epoch": 0.09874638379942141, + "grad_norm": 2.04451584815979, + "learning_rate": 4.880896481329856e-05, + "loss": 6.7783, + "step": 1024 + }, + { + "epoch": 0.09884281581485053, + "grad_norm": 9.279680252075195, + "learning_rate": 4.8806653873772456e-05, + "loss": 7.6065, + "step": 1025 + }, + { + "epoch": 0.09893924783027966, + "grad_norm": 8.433257102966309, + "learning_rate": 4.8804340749300963e-05, + "loss": 7.5409, + "step": 1026 + }, + { + "epoch": 0.09903567984570878, + "grad_norm": 7.722862243652344, + "learning_rate": 4.880202544009638e-05, + "loss": 7.609, + "step": 1027 + }, + { + "epoch": 0.09913211186113789, + "grad_norm": 6.478600025177002, + "learning_rate": 4.879970794637119e-05, + "loss": 7.8509, + "step": 1028 + }, + { + "epoch": 0.09922854387656702, + "grad_norm": 4.548638343811035, + "learning_rate": 4.8797388268338095e-05, + "loss": 7.7816, + "step": 1029 + }, + { + "epoch": 0.09932497589199614, + "grad_norm": 3.3162877559661865, + "learning_rate": 4.879506640621e-05, + "loss": 7.5863, + "step": 1030 + }, + { + "epoch": 0.09942140790742526, + "grad_norm": 4.124875545501709, + "learning_rate": 4.8792742360199995e-05, + "loss": 7.3831, + "step": 1031 + }, + { + "epoch": 0.09951783992285439, + "grad_norm": 3.0710110664367676, + "learning_rate": 4.879041613052139e-05, + "loss": 7.5442, + "step": 1032 + }, + { + "epoch": 0.09961427193828351, + "grad_norm": 5.3492045402526855, + "learning_rate": 4.878808771738766e-05, + "loss": 7.1562, + "step": 1033 + }, + { + "epoch": 0.09971070395371263, + "grad_norm": 6.507651329040527, + "learning_rate": 4.878575712101252e-05, + "loss": 7.3709, + "step": 1034 + }, + { + "epoch": 0.09980713596914176, + "grad_norm": 6.624672889709473, + "learning_rate": 4.8783424341609875e-05, + "loss": 7.4315, + "step": 1035 + }, + { + "epoch": 0.09990356798457088, + "grad_norm": 3.9179508686065674, + "learning_rate": 4.8781089379393805e-05, + "loss": 7.3401, + "step": 1036 + }, + { + "epoch": 0.1, + "grad_norm": 4.242870330810547, + "learning_rate": 4.877875223457863e-05, + "loss": 7.5108, + "step": 1037 + }, + { + "epoch": 0.10009643201542913, + "grad_norm": 3.723322629928589, + "learning_rate": 4.877641290737884e-05, + "loss": 7.4212, + "step": 1038 + }, + { + "epoch": 0.10019286403085824, + "grad_norm": 3.8881897926330566, + "learning_rate": 4.877407139800914e-05, + "loss": 7.3337, + "step": 1039 + }, + { + "epoch": 0.10028929604628736, + "grad_norm": 12.850090980529785, + "learning_rate": 4.877172770668442e-05, + "loss": 7.4105, + "step": 1040 + }, + { + "epoch": 0.10038572806171649, + "grad_norm": 11.54455852508545, + "learning_rate": 4.87693818336198e-05, + "loss": 7.2402, + "step": 1041 + }, + { + "epoch": 0.10048216007714561, + "grad_norm": 10.047114372253418, + "learning_rate": 4.876703377903057e-05, + "loss": 7.174, + "step": 1042 + }, + { + "epoch": 0.10057859209257473, + "grad_norm": 7.868381500244141, + "learning_rate": 4.876468354313222e-05, + "loss": 7.0857, + "step": 1043 + }, + { + "epoch": 0.10067502410800386, + "grad_norm": 7.04380464553833, + "learning_rate": 4.876233112614047e-05, + "loss": 6.8817, + "step": 1044 + }, + { + "epoch": 0.10077145612343298, + "grad_norm": 4.806086540222168, + "learning_rate": 4.875997652827122e-05, + "loss": 6.8482, + "step": 1045 + }, + { + "epoch": 0.1008678881388621, + "grad_norm": 4.823073863983154, + "learning_rate": 4.875761974974057e-05, + "loss": 6.9158, + "step": 1046 + }, + { + "epoch": 0.10096432015429123, + "grad_norm": 4.681227684020996, + "learning_rate": 4.8755260790764814e-05, + "loss": 6.7951, + "step": 1047 + }, + { + "epoch": 0.10106075216972035, + "grad_norm": 5.599585056304932, + "learning_rate": 4.875289965156046e-05, + "loss": 6.9414, + "step": 1048 + }, + { + "epoch": 0.10115718418514948, + "grad_norm": 5.73380708694458, + "learning_rate": 4.875053633234422e-05, + "loss": 6.9082, + "step": 1049 + }, + { + "epoch": 0.10125361620057859, + "grad_norm": 5.886882781982422, + "learning_rate": 4.8748170833332975e-05, + "loss": 7.0509, + "step": 1050 + }, + { + "epoch": 0.10135004821600771, + "grad_norm": 4.818755626678467, + "learning_rate": 4.8745803154743844e-05, + "loss": 6.8103, + "step": 1051 + }, + { + "epoch": 0.10144648023143683, + "grad_norm": 3.485372543334961, + "learning_rate": 4.8743433296794126e-05, + "loss": 6.874, + "step": 1052 + }, + { + "epoch": 0.10154291224686596, + "grad_norm": 4.226315498352051, + "learning_rate": 4.8741061259701325e-05, + "loss": 6.8835, + "step": 1053 + }, + { + "epoch": 0.10163934426229508, + "grad_norm": 3.22265362739563, + "learning_rate": 4.873868704368315e-05, + "loss": 6.8173, + "step": 1054 + }, + { + "epoch": 0.1017357762777242, + "grad_norm": 4.55885124206543, + "learning_rate": 4.873631064895748e-05, + "loss": 6.7008, + "step": 1055 + }, + { + "epoch": 0.10183220829315333, + "grad_norm": 6.2485880851745605, + "learning_rate": 4.873393207574245e-05, + "loss": 6.6659, + "step": 1056 + }, + { + "epoch": 0.10192864030858245, + "grad_norm": 3.23079776763916, + "learning_rate": 4.8731551324256334e-05, + "loss": 6.634, + "step": 1057 + }, + { + "epoch": 0.10202507232401158, + "grad_norm": 2.183356761932373, + "learning_rate": 4.872916839471765e-05, + "loss": 6.7557, + "step": 1058 + }, + { + "epoch": 0.1021215043394407, + "grad_norm": 6.161545276641846, + "learning_rate": 4.8726783287345095e-05, + "loss": 6.6416, + "step": 1059 + }, + { + "epoch": 0.10221793635486982, + "grad_norm": 5.716010570526123, + "learning_rate": 4.872439600235758e-05, + "loss": 6.7202, + "step": 1060 + }, + { + "epoch": 0.10231436837029893, + "grad_norm": 2.2733092308044434, + "learning_rate": 4.872200653997419e-05, + "loss": 6.7825, + "step": 1061 + }, + { + "epoch": 0.10241080038572806, + "grad_norm": 2.2111732959747314, + "learning_rate": 4.8719614900414236e-05, + "loss": 6.6936, + "step": 1062 + }, + { + "epoch": 0.10250723240115718, + "grad_norm": 2.4184231758117676, + "learning_rate": 4.8717221083897223e-05, + "loss": 6.8313, + "step": 1063 + }, + { + "epoch": 0.1026036644165863, + "grad_norm": 8.221638679504395, + "learning_rate": 4.871482509064286e-05, + "loss": 7.6381, + "step": 1064 + }, + { + "epoch": 0.10270009643201543, + "grad_norm": 8.57941722869873, + "learning_rate": 4.871242692087102e-05, + "loss": 7.5859, + "step": 1065 + }, + { + "epoch": 0.10279652844744455, + "grad_norm": 6.806934356689453, + "learning_rate": 4.8710026574801834e-05, + "loss": 7.5703, + "step": 1066 + }, + { + "epoch": 0.10289296046287368, + "grad_norm": 4.832522392272949, + "learning_rate": 4.87076240526556e-05, + "loss": 7.5996, + "step": 1067 + }, + { + "epoch": 0.1029893924783028, + "grad_norm": 4.189358234405518, + "learning_rate": 4.87052193546528e-05, + "loss": 7.5289, + "step": 1068 + }, + { + "epoch": 0.10308582449373192, + "grad_norm": 3.2825424671173096, + "learning_rate": 4.870281248101414e-05, + "loss": 7.479, + "step": 1069 + }, + { + "epoch": 0.10318225650916105, + "grad_norm": 4.290635108947754, + "learning_rate": 4.8700403431960525e-05, + "loss": 7.3083, + "step": 1070 + }, + { + "epoch": 0.10327868852459017, + "grad_norm": 4.57724142074585, + "learning_rate": 4.869799220771306e-05, + "loss": 7.2917, + "step": 1071 + }, + { + "epoch": 0.10337512054001928, + "grad_norm": 6.5010576248168945, + "learning_rate": 4.869557880849304e-05, + "loss": 7.3467, + "step": 1072 + }, + { + "epoch": 0.1034715525554484, + "grad_norm": 3.174355983734131, + "learning_rate": 4.869316323452195e-05, + "loss": 7.362, + "step": 1073 + }, + { + "epoch": 0.10356798457087753, + "grad_norm": 3.211115598678589, + "learning_rate": 4.86907454860215e-05, + "loss": 7.1303, + "step": 1074 + }, + { + "epoch": 0.10366441658630665, + "grad_norm": 2.942356824874878, + "learning_rate": 4.86883255632136e-05, + "loss": 7.2647, + "step": 1075 + }, + { + "epoch": 0.10376084860173578, + "grad_norm": 2.923945426940918, + "learning_rate": 4.8685903466320324e-05, + "loss": 7.5214, + "step": 1076 + }, + { + "epoch": 0.1038572806171649, + "grad_norm": 6.638969421386719, + "learning_rate": 4.868347919556399e-05, + "loss": 6.9508, + "step": 1077 + }, + { + "epoch": 0.10395371263259402, + "grad_norm": 7.593791961669922, + "learning_rate": 4.868105275116708e-05, + "loss": 6.6219, + "step": 1078 + }, + { + "epoch": 0.10405014464802315, + "grad_norm": 2.6671266555786133, + "learning_rate": 4.867862413335229e-05, + "loss": 7.3024, + "step": 1079 + }, + { + "epoch": 0.10414657666345227, + "grad_norm": 3.3000409603118896, + "learning_rate": 4.867619334234253e-05, + "loss": 7.2561, + "step": 1080 + }, + { + "epoch": 0.1042430086788814, + "grad_norm": 2.805269956588745, + "learning_rate": 4.8673760378360875e-05, + "loss": 7.1555, + "step": 1081 + }, + { + "epoch": 0.10433944069431052, + "grad_norm": 2.3547937870025635, + "learning_rate": 4.867132524163065e-05, + "loss": 7.3378, + "step": 1082 + }, + { + "epoch": 0.10443587270973963, + "grad_norm": 3.3533437252044678, + "learning_rate": 4.8668887932375307e-05, + "loss": 7.3693, + "step": 1083 + }, + { + "epoch": 0.10453230472516875, + "grad_norm": 6.4251251220703125, + "learning_rate": 4.866644845081857e-05, + "loss": 7.0342, + "step": 1084 + }, + { + "epoch": 0.10462873674059787, + "grad_norm": 4.625685214996338, + "learning_rate": 4.866400679718432e-05, + "loss": 7.5851, + "step": 1085 + }, + { + "epoch": 0.104725168756027, + "grad_norm": 4.252157211303711, + "learning_rate": 4.866156297169666e-05, + "loss": 7.6289, + "step": 1086 + }, + { + "epoch": 0.10482160077145612, + "grad_norm": 5.532313823699951, + "learning_rate": 4.8659116974579867e-05, + "loss": 7.6088, + "step": 1087 + }, + { + "epoch": 0.10491803278688525, + "grad_norm": 7.577369213104248, + "learning_rate": 4.865666880605844e-05, + "loss": 7.0981, + "step": 1088 + }, + { + "epoch": 0.10501446480231437, + "grad_norm": 7.355609893798828, + "learning_rate": 4.8654218466357064e-05, + "loss": 7.0865, + "step": 1089 + }, + { + "epoch": 0.1051108968177435, + "grad_norm": 4.0606842041015625, + "learning_rate": 4.865176595570063e-05, + "loss": 7.0413, + "step": 1090 + }, + { + "epoch": 0.10520732883317262, + "grad_norm": 7.392199516296387, + "learning_rate": 4.8649311274314234e-05, + "loss": 6.8021, + "step": 1091 + }, + { + "epoch": 0.10530376084860174, + "grad_norm": 7.15516996383667, + "learning_rate": 4.864685442242315e-05, + "loss": 6.8655, + "step": 1092 + }, + { + "epoch": 0.10540019286403086, + "grad_norm": 7.907309055328369, + "learning_rate": 4.8644395400252874e-05, + "loss": 6.8537, + "step": 1093 + }, + { + "epoch": 0.10549662487945997, + "grad_norm": 6.274310111999512, + "learning_rate": 4.864193420802909e-05, + "loss": 6.6821, + "step": 1094 + }, + { + "epoch": 0.1055930568948891, + "grad_norm": 4.259432792663574, + "learning_rate": 4.863947084597768e-05, + "loss": 6.746, + "step": 1095 + }, + { + "epoch": 0.10568948891031822, + "grad_norm": 5.991074562072754, + "learning_rate": 4.8637005314324734e-05, + "loss": 6.6084, + "step": 1096 + }, + { + "epoch": 0.10578592092574735, + "grad_norm": 2.8629696369171143, + "learning_rate": 4.8634537613296534e-05, + "loss": 6.7998, + "step": 1097 + }, + { + "epoch": 0.10588235294117647, + "grad_norm": 5.819774150848389, + "learning_rate": 4.8632067743119556e-05, + "loss": 7.3842, + "step": 1098 + }, + { + "epoch": 0.10597878495660559, + "grad_norm": 7.880488395690918, + "learning_rate": 4.862959570402049e-05, + "loss": 7.0829, + "step": 1099 + }, + { + "epoch": 0.10607521697203472, + "grad_norm": 6.260347843170166, + "learning_rate": 4.8627121496226214e-05, + "loss": 7.0528, + "step": 1100 + }, + { + "epoch": 0.10617164898746384, + "grad_norm": 5.550198554992676, + "learning_rate": 4.8624645119963804e-05, + "loss": 7.2982, + "step": 1101 + }, + { + "epoch": 0.10626808100289296, + "grad_norm": 6.010748863220215, + "learning_rate": 4.862216657546055e-05, + "loss": 7.6334, + "step": 1102 + }, + { + "epoch": 0.10636451301832209, + "grad_norm": 6.822992324829102, + "learning_rate": 4.8619685862943916e-05, + "loss": 7.4979, + "step": 1103 + }, + { + "epoch": 0.10646094503375121, + "grad_norm": 6.1509504318237305, + "learning_rate": 4.8617202982641585e-05, + "loss": 7.3896, + "step": 1104 + }, + { + "epoch": 0.10655737704918032, + "grad_norm": 2.9886887073516846, + "learning_rate": 4.8614717934781434e-05, + "loss": 7.3152, + "step": 1105 + }, + { + "epoch": 0.10665380906460944, + "grad_norm": 4.20281982421875, + "learning_rate": 4.861223071959153e-05, + "loss": 7.146, + "step": 1106 + }, + { + "epoch": 0.10675024108003857, + "grad_norm": 4.281688690185547, + "learning_rate": 4.860974133730016e-05, + "loss": 7.2898, + "step": 1107 + }, + { + "epoch": 0.10684667309546769, + "grad_norm": 3.170978307723999, + "learning_rate": 4.860724978813579e-05, + "loss": 6.9779, + "step": 1108 + }, + { + "epoch": 0.10694310511089682, + "grad_norm": 2.7457401752471924, + "learning_rate": 4.8604756072327085e-05, + "loss": 7.1749, + "step": 1109 + }, + { + "epoch": 0.10703953712632594, + "grad_norm": 3.705017566680908, + "learning_rate": 4.860226019010292e-05, + "loss": 7.3763, + "step": 1110 + }, + { + "epoch": 0.10713596914175506, + "grad_norm": 3.7340617179870605, + "learning_rate": 4.859976214169237e-05, + "loss": 7.3147, + "step": 1111 + }, + { + "epoch": 0.10723240115718419, + "grad_norm": 2.6382596492767334, + "learning_rate": 4.859726192732469e-05, + "loss": 7.1958, + "step": 1112 + }, + { + "epoch": 0.10732883317261331, + "grad_norm": 3.7873129844665527, + "learning_rate": 4.8594759547229355e-05, + "loss": 7.413, + "step": 1113 + }, + { + "epoch": 0.10742526518804243, + "grad_norm": 4.753648281097412, + "learning_rate": 4.8592255001636037e-05, + "loss": 7.37, + "step": 1114 + }, + { + "epoch": 0.10752169720347156, + "grad_norm": 3.558072805404663, + "learning_rate": 4.8589748290774584e-05, + "loss": 7.1445, + "step": 1115 + }, + { + "epoch": 0.10761812921890067, + "grad_norm": 7.94614315032959, + "learning_rate": 4.858723941487507e-05, + "loss": 7.0069, + "step": 1116 + }, + { + "epoch": 0.10771456123432979, + "grad_norm": 8.88119125366211, + "learning_rate": 4.858472837416776e-05, + "loss": 6.9416, + "step": 1117 + }, + { + "epoch": 0.10781099324975892, + "grad_norm": 6.284987449645996, + "learning_rate": 4.8582215168883094e-05, + "loss": 6.892, + "step": 1118 + }, + { + "epoch": 0.10790742526518804, + "grad_norm": 6.3007707595825195, + "learning_rate": 4.857969979925175e-05, + "loss": 6.7266, + "step": 1119 + }, + { + "epoch": 0.10800385728061716, + "grad_norm": 3.647733688354492, + "learning_rate": 4.857718226550458e-05, + "loss": 6.7647, + "step": 1120 + }, + { + "epoch": 0.10810028929604629, + "grad_norm": 6.711991310119629, + "learning_rate": 4.857466256787265e-05, + "loss": 6.7005, + "step": 1121 + }, + { + "epoch": 0.10819672131147541, + "grad_norm": 11.733789443969727, + "learning_rate": 4.85721407065872e-05, + "loss": 6.7707, + "step": 1122 + }, + { + "epoch": 0.10829315332690453, + "grad_norm": 13.492863655090332, + "learning_rate": 4.856961668187968e-05, + "loss": 6.6691, + "step": 1123 + }, + { + "epoch": 0.10838958534233366, + "grad_norm": 10.49193000793457, + "learning_rate": 4.856709049398176e-05, + "loss": 6.7702, + "step": 1124 + }, + { + "epoch": 0.10848601735776278, + "grad_norm": 6.084685802459717, + "learning_rate": 4.856456214312528e-05, + "loss": 6.8317, + "step": 1125 + }, + { + "epoch": 0.1085824493731919, + "grad_norm": 8.000066757202148, + "learning_rate": 4.856203162954229e-05, + "loss": 6.9151, + "step": 1126 + }, + { + "epoch": 0.10867888138862102, + "grad_norm": 13.285922050476074, + "learning_rate": 4.855949895346503e-05, + "loss": 6.6841, + "step": 1127 + }, + { + "epoch": 0.10877531340405014, + "grad_norm": 13.787715911865234, + "learning_rate": 4.855696411512596e-05, + "loss": 6.7575, + "step": 1128 + }, + { + "epoch": 0.10887174541947926, + "grad_norm": 11.233845710754395, + "learning_rate": 4.8554427114757715e-05, + "loss": 6.8754, + "step": 1129 + }, + { + "epoch": 0.10896817743490839, + "grad_norm": 10.238951683044434, + "learning_rate": 4.8551887952593144e-05, + "loss": 6.8286, + "step": 1130 + }, + { + "epoch": 0.10906460945033751, + "grad_norm": 6.515451908111572, + "learning_rate": 4.854934662886528e-05, + "loss": 6.7355, + "step": 1131 + }, + { + "epoch": 0.10916104146576663, + "grad_norm": 5.214569091796875, + "learning_rate": 4.8546803143807366e-05, + "loss": 6.7959, + "step": 1132 + }, + { + "epoch": 0.10925747348119576, + "grad_norm": 2.713520050048828, + "learning_rate": 4.854425749765285e-05, + "loss": 6.7086, + "step": 1133 + }, + { + "epoch": 0.10935390549662488, + "grad_norm": 6.776952743530273, + "learning_rate": 4.854170969063535e-05, + "loss": 6.6379, + "step": 1134 + }, + { + "epoch": 0.109450337512054, + "grad_norm": 8.255961418151855, + "learning_rate": 4.853915972298871e-05, + "loss": 6.5642, + "step": 1135 + }, + { + "epoch": 0.10954676952748313, + "grad_norm": 4.933233261108398, + "learning_rate": 4.853660759494696e-05, + "loss": 6.8076, + "step": 1136 + }, + { + "epoch": 0.10964320154291225, + "grad_norm": 4.555628776550293, + "learning_rate": 4.853405330674434e-05, + "loss": 6.7509, + "step": 1137 + }, + { + "epoch": 0.10973963355834138, + "grad_norm": 5.146445274353027, + "learning_rate": 4.853149685861527e-05, + "loss": 6.678, + "step": 1138 + }, + { + "epoch": 0.10983606557377049, + "grad_norm": 2.4799880981445312, + "learning_rate": 4.8528938250794387e-05, + "loss": 6.6555, + "step": 1139 + }, + { + "epoch": 0.10993249758919961, + "grad_norm": 4.029135227203369, + "learning_rate": 4.852637748351651e-05, + "loss": 6.6155, + "step": 1140 + }, + { + "epoch": 0.11002892960462873, + "grad_norm": 5.124574661254883, + "learning_rate": 4.8523814557016655e-05, + "loss": 6.5479, + "step": 1141 + }, + { + "epoch": 0.11012536162005786, + "grad_norm": 4.3882904052734375, + "learning_rate": 4.8521249471530055e-05, + "loss": 6.4721, + "step": 1142 + }, + { + "epoch": 0.11022179363548698, + "grad_norm": 4.328455448150635, + "learning_rate": 4.8518682227292134e-05, + "loss": 6.7956, + "step": 1143 + }, + { + "epoch": 0.1103182256509161, + "grad_norm": 4.07034969329834, + "learning_rate": 4.8516112824538504e-05, + "loss": 6.9876, + "step": 1144 + }, + { + "epoch": 0.11041465766634523, + "grad_norm": 4.679478168487549, + "learning_rate": 4.851354126350498e-05, + "loss": 7.0179, + "step": 1145 + }, + { + "epoch": 0.11051108968177435, + "grad_norm": 2.567866086959839, + "learning_rate": 4.8510967544427574e-05, + "loss": 6.6683, + "step": 1146 + }, + { + "epoch": 0.11060752169720348, + "grad_norm": 3.6043574810028076, + "learning_rate": 4.850839166754251e-05, + "loss": 6.5492, + "step": 1147 + }, + { + "epoch": 0.1107039537126326, + "grad_norm": 2.0683038234710693, + "learning_rate": 4.8505813633086195e-05, + "loss": 6.7284, + "step": 1148 + }, + { + "epoch": 0.11080038572806172, + "grad_norm": 8.007342338562012, + "learning_rate": 4.850323344129524e-05, + "loss": 7.3985, + "step": 1149 + }, + { + "epoch": 0.11089681774349083, + "grad_norm": 6.15872049331665, + "learning_rate": 4.8500651092406437e-05, + "loss": 7.3272, + "step": 1150 + }, + { + "epoch": 0.11099324975891996, + "grad_norm": 4.579965114593506, + "learning_rate": 4.8498066586656804e-05, + "loss": 7.4422, + "step": 1151 + }, + { + "epoch": 0.11108968177434908, + "grad_norm": 2.7223968505859375, + "learning_rate": 4.8495479924283544e-05, + "loss": 7.1468, + "step": 1152 + }, + { + "epoch": 0.1111861137897782, + "grad_norm": 3.5902514457702637, + "learning_rate": 4.849289110552405e-05, + "loss": 7.3167, + "step": 1153 + }, + { + "epoch": 0.11128254580520733, + "grad_norm": 3.0013396739959717, + "learning_rate": 4.849030013061593e-05, + "loss": 7.031, + "step": 1154 + }, + { + "epoch": 0.11137897782063645, + "grad_norm": 4.417283535003662, + "learning_rate": 4.8487706999796975e-05, + "loss": 6.9134, + "step": 1155 + }, + { + "epoch": 0.11147540983606558, + "grad_norm": 3.3868048191070557, + "learning_rate": 4.848511171330518e-05, + "loss": 7.156, + "step": 1156 + }, + { + "epoch": 0.1115718418514947, + "grad_norm": 2.6159629821777344, + "learning_rate": 4.8482514271378745e-05, + "loss": 7.1636, + "step": 1157 + }, + { + "epoch": 0.11166827386692382, + "grad_norm": 2.5438485145568848, + "learning_rate": 4.8479914674256035e-05, + "loss": 7.0879, + "step": 1158 + }, + { + "epoch": 0.11176470588235295, + "grad_norm": 3.1217610836029053, + "learning_rate": 4.847731292217566e-05, + "loss": 7.0655, + "step": 1159 + }, + { + "epoch": 0.11186113789778207, + "grad_norm": 2.8607540130615234, + "learning_rate": 4.8474709015376416e-05, + "loss": 7.2709, + "step": 1160 + }, + { + "epoch": 0.11195756991321118, + "grad_norm": 2.5065414905548096, + "learning_rate": 4.8472102954097266e-05, + "loss": 7.4469, + "step": 1161 + }, + { + "epoch": 0.1120540019286403, + "grad_norm": 1.94443678855896, + "learning_rate": 4.846949473857739e-05, + "loss": 7.2929, + "step": 1162 + }, + { + "epoch": 0.11215043394406943, + "grad_norm": 3.3388113975524902, + "learning_rate": 4.8466884369056177e-05, + "loss": 7.464, + "step": 1163 + }, + { + "epoch": 0.11224686595949855, + "grad_norm": 3.5323524475097656, + "learning_rate": 4.846427184577319e-05, + "loss": 6.9918, + "step": 1164 + }, + { + "epoch": 0.11234329797492767, + "grad_norm": 2.9090781211853027, + "learning_rate": 4.846165716896822e-05, + "loss": 6.7476, + "step": 1165 + }, + { + "epoch": 0.1124397299903568, + "grad_norm": 4.431588172912598, + "learning_rate": 4.8459040338881234e-05, + "loss": 6.625, + "step": 1166 + }, + { + "epoch": 0.11253616200578592, + "grad_norm": 7.303047180175781, + "learning_rate": 4.84564213557524e-05, + "loss": 6.6145, + "step": 1167 + }, + { + "epoch": 0.11263259402121505, + "grad_norm": 12.372384071350098, + "learning_rate": 4.845380021982208e-05, + "loss": 6.8156, + "step": 1168 + }, + { + "epoch": 0.11272902603664417, + "grad_norm": 7.8757710456848145, + "learning_rate": 4.845117693133084e-05, + "loss": 7.1785, + "step": 1169 + }, + { + "epoch": 0.1128254580520733, + "grad_norm": 7.070699214935303, + "learning_rate": 4.844855149051945e-05, + "loss": 7.5236, + "step": 1170 + }, + { + "epoch": 0.11292189006750242, + "grad_norm": 6.374688625335693, + "learning_rate": 4.844592389762887e-05, + "loss": 7.5396, + "step": 1171 + }, + { + "epoch": 0.11301832208293153, + "grad_norm": 6.454207897186279, + "learning_rate": 4.844329415290024e-05, + "loss": 7.4066, + "step": 1172 + }, + { + "epoch": 0.11311475409836065, + "grad_norm": 6.435431957244873, + "learning_rate": 4.8440662256574934e-05, + "loss": 7.189, + "step": 1173 + }, + { + "epoch": 0.11321118611378977, + "grad_norm": 5.790335655212402, + "learning_rate": 4.8438028208894496e-05, + "loss": 7.2767, + "step": 1174 + }, + { + "epoch": 0.1133076181292189, + "grad_norm": 4.803894519805908, + "learning_rate": 4.843539201010067e-05, + "loss": 7.0997, + "step": 1175 + }, + { + "epoch": 0.11340405014464802, + "grad_norm": 3.97212815284729, + "learning_rate": 4.843275366043542e-05, + "loss": 7.1628, + "step": 1176 + }, + { + "epoch": 0.11350048216007715, + "grad_norm": 2.9724020957946777, + "learning_rate": 4.8430113160140874e-05, + "loss": 7.0945, + "step": 1177 + }, + { + "epoch": 0.11359691417550627, + "grad_norm": 5.448117256164551, + "learning_rate": 4.8427470509459386e-05, + "loss": 7.0086, + "step": 1178 + }, + { + "epoch": 0.1136933461909354, + "grad_norm": 2.700082302093506, + "learning_rate": 4.842482570863348e-05, + "loss": 7.2316, + "step": 1179 + }, + { + "epoch": 0.11378977820636452, + "grad_norm": 2.921184539794922, + "learning_rate": 4.84221787579059e-05, + "loss": 7.2688, + "step": 1180 + }, + { + "epoch": 0.11388621022179364, + "grad_norm": 2.255180597305298, + "learning_rate": 4.841952965751959e-05, + "loss": 7.0905, + "step": 1181 + }, + { + "epoch": 0.11398264223722276, + "grad_norm": 2.1162147521972656, + "learning_rate": 4.841687840771767e-05, + "loss": 7.3258, + "step": 1182 + }, + { + "epoch": 0.11407907425265187, + "grad_norm": 2.3591182231903076, + "learning_rate": 4.841422500874348e-05, + "loss": 7.334, + "step": 1183 + }, + { + "epoch": 0.114175506268081, + "grad_norm": 3.349637269973755, + "learning_rate": 4.8411569460840524e-05, + "loss": 6.9337, + "step": 1184 + }, + { + "epoch": 0.11427193828351012, + "grad_norm": 4.811432361602783, + "learning_rate": 4.840891176425254e-05, + "loss": 6.7452, + "step": 1185 + }, + { + "epoch": 0.11436837029893925, + "grad_norm": 4.157739162445068, + "learning_rate": 4.840625191922344e-05, + "loss": 6.7969, + "step": 1186 + }, + { + "epoch": 0.11446480231436837, + "grad_norm": 3.4151101112365723, + "learning_rate": 4.840358992599736e-05, + "loss": 6.6916, + "step": 1187 + }, + { + "epoch": 0.11456123432979749, + "grad_norm": 3.4178223609924316, + "learning_rate": 4.8400925784818595e-05, + "loss": 6.6624, + "step": 1188 + }, + { + "epoch": 0.11465766634522662, + "grad_norm": 4.809190273284912, + "learning_rate": 4.8398259495931667e-05, + "loss": 6.6076, + "step": 1189 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 5.1235504150390625, + "learning_rate": 4.839559105958128e-05, + "loss": 6.6512, + "step": 1190 + }, + { + "epoch": 0.11485053037608486, + "grad_norm": 2.9320967197418213, + "learning_rate": 4.839292047601234e-05, + "loss": 6.6738, + "step": 1191 + }, + { + "epoch": 0.11494696239151399, + "grad_norm": 4.355123043060303, + "learning_rate": 4.8390247745469955e-05, + "loss": 6.6007, + "step": 1192 + }, + { + "epoch": 0.11504339440694311, + "grad_norm": 6.507326126098633, + "learning_rate": 4.8387572868199414e-05, + "loss": 6.5638, + "step": 1193 + }, + { + "epoch": 0.11513982642237222, + "grad_norm": 4.38944673538208, + "learning_rate": 4.838489584444623e-05, + "loss": 6.5498, + "step": 1194 + }, + { + "epoch": 0.11523625843780134, + "grad_norm": 4.491549968719482, + "learning_rate": 4.838221667445608e-05, + "loss": 6.402, + "step": 1195 + }, + { + "epoch": 0.11533269045323047, + "grad_norm": 2.8442740440368652, + "learning_rate": 4.8379535358474856e-05, + "loss": 6.7445, + "step": 1196 + }, + { + "epoch": 0.11542912246865959, + "grad_norm": 2.6438026428222656, + "learning_rate": 4.8376851896748665e-05, + "loss": 6.6878, + "step": 1197 + }, + { + "epoch": 0.11552555448408872, + "grad_norm": 3.202092409133911, + "learning_rate": 4.837416628952377e-05, + "loss": 6.5984, + "step": 1198 + }, + { + "epoch": 0.11562198649951784, + "grad_norm": 5.406285285949707, + "learning_rate": 4.837147853704667e-05, + "loss": 6.6195, + "step": 1199 + }, + { + "epoch": 0.11571841851494696, + "grad_norm": 6.5946125984191895, + "learning_rate": 4.8368788639564035e-05, + "loss": 6.6229, + "step": 1200 + }, + { + "epoch": 0.11581485053037609, + "grad_norm": 4.626385688781738, + "learning_rate": 4.836609659732273e-05, + "loss": 6.6131, + "step": 1201 + }, + { + "epoch": 0.11591128254580521, + "grad_norm": 6.064082622528076, + "learning_rate": 4.836340241056985e-05, + "loss": 6.5184, + "step": 1202 + }, + { + "epoch": 0.11600771456123433, + "grad_norm": 10.596954345703125, + "learning_rate": 4.8360706079552646e-05, + "loss": 6.4546, + "step": 1203 + }, + { + "epoch": 0.11610414657666346, + "grad_norm": 9.18006420135498, + "learning_rate": 4.835800760451859e-05, + "loss": 6.4718, + "step": 1204 + }, + { + "epoch": 0.11620057859209257, + "grad_norm": 4.23291015625, + "learning_rate": 4.8355306985715345e-05, + "loss": 6.5356, + "step": 1205 + }, + { + "epoch": 0.11629701060752169, + "grad_norm": 3.8353753089904785, + "learning_rate": 4.835260422339078e-05, + "loss": 6.5779, + "step": 1206 + }, + { + "epoch": 0.11639344262295082, + "grad_norm": 3.53157639503479, + "learning_rate": 4.834989931779294e-05, + "loss": 6.6403, + "step": 1207 + }, + { + "epoch": 0.11648987463837994, + "grad_norm": 5.773702144622803, + "learning_rate": 4.8347192269170064e-05, + "loss": 6.6148, + "step": 1208 + }, + { + "epoch": 0.11658630665380906, + "grad_norm": 5.96999979019165, + "learning_rate": 4.834448307777063e-05, + "loss": 6.7365, + "step": 1209 + }, + { + "epoch": 0.11668273866923819, + "grad_norm": 4.247237205505371, + "learning_rate": 4.8341771743843266e-05, + "loss": 6.5429, + "step": 1210 + }, + { + "epoch": 0.11677917068466731, + "grad_norm": 4.39473819732666, + "learning_rate": 4.833905826763683e-05, + "loss": 6.6776, + "step": 1211 + }, + { + "epoch": 0.11687560270009643, + "grad_norm": 5.228391170501709, + "learning_rate": 4.833634264940034e-05, + "loss": 6.6229, + "step": 1212 + }, + { + "epoch": 0.11697203471552556, + "grad_norm": 7.10579252243042, + "learning_rate": 4.833362488938305e-05, + "loss": 7.287, + "step": 1213 + }, + { + "epoch": 0.11706846673095468, + "grad_norm": 5.434371471405029, + "learning_rate": 4.8330904987834386e-05, + "loss": 7.2238, + "step": 1214 + }, + { + "epoch": 0.1171648987463838, + "grad_norm": 3.998452663421631, + "learning_rate": 4.8328182945003976e-05, + "loss": 7.392, + "step": 1215 + }, + { + "epoch": 0.11726133076181292, + "grad_norm": 2.10972261428833, + "learning_rate": 4.8325458761141653e-05, + "loss": 7.2575, + "step": 1216 + }, + { + "epoch": 0.11735776277724204, + "grad_norm": 5.405978679656982, + "learning_rate": 4.832273243649743e-05, + "loss": 7.1921, + "step": 1217 + }, + { + "epoch": 0.11745419479267116, + "grad_norm": 7.109873294830322, + "learning_rate": 4.832000397132153e-05, + "loss": 7.258, + "step": 1218 + }, + { + "epoch": 0.11755062680810029, + "grad_norm": 6.113857269287109, + "learning_rate": 4.831727336586437e-05, + "loss": 7.2583, + "step": 1219 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 3.9261600971221924, + "learning_rate": 4.8314540620376555e-05, + "loss": 7.258, + "step": 1220 + }, + { + "epoch": 0.11774349083895853, + "grad_norm": 4.4973249435424805, + "learning_rate": 4.8311805735108894e-05, + "loss": 7.3122, + "step": 1221 + }, + { + "epoch": 0.11783992285438766, + "grad_norm": 3.8392186164855957, + "learning_rate": 4.8309068710312404e-05, + "loss": 7.2705, + "step": 1222 + }, + { + "epoch": 0.11793635486981678, + "grad_norm": 3.8131377696990967, + "learning_rate": 4.8306329546238274e-05, + "loss": 7.147, + "step": 1223 + }, + { + "epoch": 0.1180327868852459, + "grad_norm": 4.648843765258789, + "learning_rate": 4.83035882431379e-05, + "loss": 7.0793, + "step": 1224 + }, + { + "epoch": 0.11812921890067503, + "grad_norm": 3.879821538925171, + "learning_rate": 4.830084480126288e-05, + "loss": 7.1832, + "step": 1225 + }, + { + "epoch": 0.11822565091610415, + "grad_norm": 3.932734966278076, + "learning_rate": 4.829809922086501e-05, + "loss": 7.0989, + "step": 1226 + }, + { + "epoch": 0.11832208293153326, + "grad_norm": 2.7236292362213135, + "learning_rate": 4.8295351502196264e-05, + "loss": 6.4874, + "step": 1227 + }, + { + "epoch": 0.11841851494696239, + "grad_norm": 2.184886932373047, + "learning_rate": 4.829260164550883e-05, + "loss": 7.1531, + "step": 1228 + }, + { + "epoch": 0.11851494696239151, + "grad_norm": 3.517503023147583, + "learning_rate": 4.828984965105508e-05, + "loss": 7.1213, + "step": 1229 + }, + { + "epoch": 0.11861137897782063, + "grad_norm": 2.312248945236206, + "learning_rate": 4.82870955190876e-05, + "loss": 7.1583, + "step": 1230 + }, + { + "epoch": 0.11870781099324976, + "grad_norm": 2.6807045936584473, + "learning_rate": 4.828433924985915e-05, + "loss": 7.3683, + "step": 1231 + }, + { + "epoch": 0.11880424300867888, + "grad_norm": 1.765264868736267, + "learning_rate": 4.8281580843622705e-05, + "loss": 7.3293, + "step": 1232 + }, + { + "epoch": 0.118900675024108, + "grad_norm": 2.2216410636901855, + "learning_rate": 4.8278820300631425e-05, + "loss": 7.2167, + "step": 1233 + }, + { + "epoch": 0.11899710703953713, + "grad_norm": 2.3777976036071777, + "learning_rate": 4.827605762113867e-05, + "loss": 7.3688, + "step": 1234 + }, + { + "epoch": 0.11909353905496625, + "grad_norm": 4.77785062789917, + "learning_rate": 4.8273292805397994e-05, + "loss": 6.8353, + "step": 1235 + }, + { + "epoch": 0.11918997107039538, + "grad_norm": 5.802145481109619, + "learning_rate": 4.827052585366315e-05, + "loss": 6.6216, + "step": 1236 + }, + { + "epoch": 0.1192864030858245, + "grad_norm": 3.524364709854126, + "learning_rate": 4.826775676618808e-05, + "loss": 6.6482, + "step": 1237 + }, + { + "epoch": 0.11938283510125361, + "grad_norm": 5.762240886688232, + "learning_rate": 4.8264985543226934e-05, + "loss": 6.8608, + "step": 1238 + }, + { + "epoch": 0.11947926711668273, + "grad_norm": 4.810267925262451, + "learning_rate": 4.8262212185034056e-05, + "loss": 6.6627, + "step": 1239 + }, + { + "epoch": 0.11957569913211186, + "grad_norm": 3.8413941860198975, + "learning_rate": 4.825943669186397e-05, + "loss": 6.7296, + "step": 1240 + }, + { + "epoch": 0.11967213114754098, + "grad_norm": 3.5716779232025146, + "learning_rate": 4.825665906397141e-05, + "loss": 6.7213, + "step": 1241 + }, + { + "epoch": 0.1197685631629701, + "grad_norm": 4.993801116943359, + "learning_rate": 4.825387930161131e-05, + "loss": 7.415, + "step": 1242 + }, + { + "epoch": 0.11986499517839923, + "grad_norm": 2.1277599334716797, + "learning_rate": 4.825109740503878e-05, + "loss": 7.3579, + "step": 1243 + }, + { + "epoch": 0.11996142719382835, + "grad_norm": 2.471595287322998, + "learning_rate": 4.824831337450916e-05, + "loss": 7.2987, + "step": 1244 + }, + { + "epoch": 0.12005785920925748, + "grad_norm": 2.5759222507476807, + "learning_rate": 4.824552721027794e-05, + "loss": 7.2461, + "step": 1245 + }, + { + "epoch": 0.1201542912246866, + "grad_norm": 2.8420872688293457, + "learning_rate": 4.8242738912600855e-05, + "loss": 7.088, + "step": 1246 + }, + { + "epoch": 0.12025072324011572, + "grad_norm": 2.525028944015503, + "learning_rate": 4.823994848173379e-05, + "loss": 7.1546, + "step": 1247 + }, + { + "epoch": 0.12034715525554485, + "grad_norm": 3.5775511264801025, + "learning_rate": 4.823715591793286e-05, + "loss": 6.6877, + "step": 1248 + }, + { + "epoch": 0.12044358727097397, + "grad_norm": 1.9440757036209106, + "learning_rate": 4.8234361221454365e-05, + "loss": 7.0268, + "step": 1249 + }, + { + "epoch": 0.12054001928640308, + "grad_norm": 3.359982490539551, + "learning_rate": 4.823156439255479e-05, + "loss": 7.178, + "step": 1250 + }, + { + "epoch": 0.1206364513018322, + "grad_norm": 2.539149284362793, + "learning_rate": 4.822876543149083e-05, + "loss": 7.0674, + "step": 1251 + }, + { + "epoch": 0.12073288331726133, + "grad_norm": 2.4249656200408936, + "learning_rate": 4.822596433851937e-05, + "loss": 7.3227, + "step": 1252 + }, + { + "epoch": 0.12082931533269045, + "grad_norm": 3.8289520740509033, + "learning_rate": 4.822316111389749e-05, + "loss": 7.2252, + "step": 1253 + }, + { + "epoch": 0.12092574734811957, + "grad_norm": 2.0398480892181396, + "learning_rate": 4.822035575788246e-05, + "loss": 7.1984, + "step": 1254 + }, + { + "epoch": 0.1210221793635487, + "grad_norm": 5.253793239593506, + "learning_rate": 4.821754827073176e-05, + "loss": 6.9679, + "step": 1255 + }, + { + "epoch": 0.12111861137897782, + "grad_norm": 10.3585844039917, + "learning_rate": 4.821473865270306e-05, + "loss": 6.9061, + "step": 1256 + }, + { + "epoch": 0.12121504339440695, + "grad_norm": 10.275850296020508, + "learning_rate": 4.821192690405421e-05, + "loss": 6.7738, + "step": 1257 + }, + { + "epoch": 0.12131147540983607, + "grad_norm": 6.445937633514404, + "learning_rate": 4.820911302504328e-05, + "loss": 6.8227, + "step": 1258 + }, + { + "epoch": 0.1214079074252652, + "grad_norm": 5.947268486022949, + "learning_rate": 4.8206297015928534e-05, + "loss": 6.6806, + "step": 1259 + }, + { + "epoch": 0.12150433944069432, + "grad_norm": 4.2752180099487305, + "learning_rate": 4.82034788769684e-05, + "loss": 6.6366, + "step": 1260 + }, + { + "epoch": 0.12160077145612343, + "grad_norm": 6.434248924255371, + "learning_rate": 4.8200658608421536e-05, + "loss": 6.5805, + "step": 1261 + }, + { + "epoch": 0.12169720347155255, + "grad_norm": 7.447798728942871, + "learning_rate": 4.819783621054678e-05, + "loss": 6.7877, + "step": 1262 + }, + { + "epoch": 0.12179363548698167, + "grad_norm": 8.219697952270508, + "learning_rate": 4.819501168360317e-05, + "loss": 6.7707, + "step": 1263 + }, + { + "epoch": 0.1218900675024108, + "grad_norm": 8.234067916870117, + "learning_rate": 4.8192185027849937e-05, + "loss": 6.5626, + "step": 1264 + }, + { + "epoch": 0.12198649951783992, + "grad_norm": 6.259660720825195, + "learning_rate": 4.81893562435465e-05, + "loss": 6.6431, + "step": 1265 + }, + { + "epoch": 0.12208293153326905, + "grad_norm": 4.509976387023926, + "learning_rate": 4.8186525330952495e-05, + "loss": 6.6832, + "step": 1266 + }, + { + "epoch": 0.12217936354869817, + "grad_norm": 3.2472777366638184, + "learning_rate": 4.818369229032773e-05, + "loss": 6.5647, + "step": 1267 + }, + { + "epoch": 0.1222757955641273, + "grad_norm": 4.092449188232422, + "learning_rate": 4.818085712193223e-05, + "loss": 6.4719, + "step": 1268 + }, + { + "epoch": 0.12237222757955642, + "grad_norm": 3.849250316619873, + "learning_rate": 4.817801982602618e-05, + "loss": 6.5778, + "step": 1269 + }, + { + "epoch": 0.12246865959498554, + "grad_norm": 2.8170604705810547, + "learning_rate": 4.817518040287001e-05, + "loss": 6.6495, + "step": 1270 + }, + { + "epoch": 0.12256509161041466, + "grad_norm": 1.3276063203811646, + "learning_rate": 4.8172338852724294e-05, + "loss": 6.6856, + "step": 1271 + }, + { + "epoch": 0.12266152362584377, + "grad_norm": 3.258204460144043, + "learning_rate": 4.8169495175849854e-05, + "loss": 6.5746, + "step": 1272 + }, + { + "epoch": 0.1227579556412729, + "grad_norm": 4.602853298187256, + "learning_rate": 4.8166649372507656e-05, + "loss": 6.5194, + "step": 1273 + }, + { + "epoch": 0.12285438765670202, + "grad_norm": 3.696258068084717, + "learning_rate": 4.816380144295889e-05, + "loss": 6.6471, + "step": 1274 + }, + { + "epoch": 0.12295081967213115, + "grad_norm": 2.5696563720703125, + "learning_rate": 4.8160951387464937e-05, + "loss": 6.5277, + "step": 1275 + }, + { + "epoch": 0.12304725168756027, + "grad_norm": 3.6009633541107178, + "learning_rate": 4.815809920628738e-05, + "loss": 6.7199, + "step": 1276 + }, + { + "epoch": 0.12314368370298939, + "grad_norm": 3.2129015922546387, + "learning_rate": 4.8155244899687974e-05, + "loss": 6.603, + "step": 1277 + }, + { + "epoch": 0.12324011571841852, + "grad_norm": 2.389948844909668, + "learning_rate": 4.815238846792869e-05, + "loss": 6.5334, + "step": 1278 + }, + { + "epoch": 0.12333654773384764, + "grad_norm": 3.4943597316741943, + "learning_rate": 4.814952991127169e-05, + "loss": 6.4366, + "step": 1279 + }, + { + "epoch": 0.12343297974927676, + "grad_norm": 2.575087547302246, + "learning_rate": 4.8146669229979324e-05, + "loss": 6.5319, + "step": 1280 + }, + { + "epoch": 0.12352941176470589, + "grad_norm": 1.9913874864578247, + "learning_rate": 4.8143806424314156e-05, + "loss": 6.6306, + "step": 1281 + }, + { + "epoch": 0.12362584378013501, + "grad_norm": 5.44051456451416, + "learning_rate": 4.814094149453891e-05, + "loss": 7.1164, + "step": 1282 + }, + { + "epoch": 0.12372227579556412, + "grad_norm": 2.977384090423584, + "learning_rate": 4.813807444091654e-05, + "loss": 7.0548, + "step": 1283 + }, + { + "epoch": 0.12381870781099324, + "grad_norm": 1.7310867309570312, + "learning_rate": 4.813520526371017e-05, + "loss": 6.8765, + "step": 1284 + }, + { + "epoch": 0.12391513982642237, + "grad_norm": 3.261918544769287, + "learning_rate": 4.813233396318314e-05, + "loss": 7.0957, + "step": 1285 + }, + { + "epoch": 0.12401157184185149, + "grad_norm": 4.313853740692139, + "learning_rate": 4.812946053959897e-05, + "loss": 7.1395, + "step": 1286 + }, + { + "epoch": 0.12410800385728062, + "grad_norm": 2.687566041946411, + "learning_rate": 4.812658499322138e-05, + "loss": 6.7397, + "step": 1287 + }, + { + "epoch": 0.12420443587270974, + "grad_norm": 2.2373409271240234, + "learning_rate": 4.812370732431428e-05, + "loss": 7.0582, + "step": 1288 + }, + { + "epoch": 0.12430086788813886, + "grad_norm": 2.885615348815918, + "learning_rate": 4.812082753314179e-05, + "loss": 6.5414, + "step": 1289 + }, + { + "epoch": 0.12439729990356799, + "grad_norm": 2.7554588317871094, + "learning_rate": 4.8117945619968194e-05, + "loss": 7.0637, + "step": 1290 + }, + { + "epoch": 0.12449373191899711, + "grad_norm": 3.7643444538116455, + "learning_rate": 4.811506158505802e-05, + "loss": 6.9949, + "step": 1291 + }, + { + "epoch": 0.12459016393442623, + "grad_norm": 4.225304126739502, + "learning_rate": 4.811217542867593e-05, + "loss": 7.3143, + "step": 1292 + }, + { + "epoch": 0.12468659594985536, + "grad_norm": 4.254619598388672, + "learning_rate": 4.810928715108683e-05, + "loss": 7.4941, + "step": 1293 + }, + { + "epoch": 0.12478302796528447, + "grad_norm": 1.9917078018188477, + "learning_rate": 4.810639675255579e-05, + "loss": 7.4564, + "step": 1294 + }, + { + "epoch": 0.12487945998071359, + "grad_norm": 3.8132834434509277, + "learning_rate": 4.8103504233348106e-05, + "loss": 7.3548, + "step": 1295 + }, + { + "epoch": 0.12497589199614272, + "grad_norm": 5.26384973526001, + "learning_rate": 4.810060959372924e-05, + "loss": 6.8394, + "step": 1296 + }, + { + "epoch": 0.12507232401157184, + "grad_norm": 5.4780707359313965, + "learning_rate": 4.809771283396485e-05, + "loss": 6.5325, + "step": 1297 + }, + { + "epoch": 0.12516875602700098, + "grad_norm": 2.8659932613372803, + "learning_rate": 4.809481395432081e-05, + "loss": 6.7165, + "step": 1298 + }, + { + "epoch": 0.1252651880424301, + "grad_norm": 4.3884100914001465, + "learning_rate": 4.809191295506317e-05, + "loss": 6.5711, + "step": 1299 + }, + { + "epoch": 0.1253616200578592, + "grad_norm": 3.38352108001709, + "learning_rate": 4.808900983645818e-05, + "loss": 6.641, + "step": 1300 + }, + { + "epoch": 0.12545805207328833, + "grad_norm": 3.6138923168182373, + "learning_rate": 4.8086104598772284e-05, + "loss": 6.594, + "step": 1301 + }, + { + "epoch": 0.12555448408871744, + "grad_norm": 3.6488115787506104, + "learning_rate": 4.8083197242272124e-05, + "loss": 6.5238, + "step": 1302 + }, + { + "epoch": 0.12565091610414658, + "grad_norm": 3.6168932914733887, + "learning_rate": 4.808028776722453e-05, + "loss": 6.4832, + "step": 1303 + }, + { + "epoch": 0.1257473481195757, + "grad_norm": 4.474188327789307, + "learning_rate": 4.807737617389654e-05, + "loss": 6.6218, + "step": 1304 + }, + { + "epoch": 0.12584378013500483, + "grad_norm": 2.7623989582061768, + "learning_rate": 4.807446246255536e-05, + "loss": 6.5617, + "step": 1305 + }, + { + "epoch": 0.12594021215043394, + "grad_norm": 1.667325496673584, + "learning_rate": 4.807154663346841e-05, + "loss": 6.6618, + "step": 1306 + }, + { + "epoch": 0.12603664416586308, + "grad_norm": 3.507291316986084, + "learning_rate": 4.806862868690332e-05, + "loss": 6.7554, + "step": 1307 + }, + { + "epoch": 0.1261330761812922, + "grad_norm": 10.000195503234863, + "learning_rate": 4.806570862312787e-05, + "loss": 7.4987, + "step": 1308 + }, + { + "epoch": 0.12622950819672132, + "grad_norm": 9.4988374710083, + "learning_rate": 4.806278644241008e-05, + "loss": 7.598, + "step": 1309 + }, + { + "epoch": 0.12632594021215043, + "grad_norm": 5.235955715179443, + "learning_rate": 4.805986214501813e-05, + "loss": 7.4784, + "step": 1310 + }, + { + "epoch": 0.12642237222757954, + "grad_norm": 3.8744800090789795, + "learning_rate": 4.8056935731220416e-05, + "loss": 7.301, + "step": 1311 + }, + { + "epoch": 0.12651880424300868, + "grad_norm": 3.733215808868408, + "learning_rate": 4.8054007201285525e-05, + "loss": 7.1579, + "step": 1312 + }, + { + "epoch": 0.1266152362584378, + "grad_norm": 2.711620569229126, + "learning_rate": 4.8051076555482214e-05, + "loss": 7.1606, + "step": 1313 + }, + { + "epoch": 0.12671166827386693, + "grad_norm": 3.936290740966797, + "learning_rate": 4.804814379407948e-05, + "loss": 7.0097, + "step": 1314 + }, + { + "epoch": 0.12680810028929604, + "grad_norm": 4.357621669769287, + "learning_rate": 4.8045208917346474e-05, + "loss": 7.1115, + "step": 1315 + }, + { + "epoch": 0.12690453230472518, + "grad_norm": 3.453382968902588, + "learning_rate": 4.8042271925552556e-05, + "loss": 7.041, + "step": 1316 + }, + { + "epoch": 0.12700096432015429, + "grad_norm": 4.170401573181152, + "learning_rate": 4.803933281896728e-05, + "loss": 6.7981, + "step": 1317 + }, + { + "epoch": 0.12709739633558342, + "grad_norm": 2.1517434120178223, + "learning_rate": 4.80363915978604e-05, + "loss": 6.9912, + "step": 1318 + }, + { + "epoch": 0.12719382835101253, + "grad_norm": 3.458941698074341, + "learning_rate": 4.8033448262501844e-05, + "loss": 6.7619, + "step": 1319 + }, + { + "epoch": 0.12729026036644167, + "grad_norm": 3.7802538871765137, + "learning_rate": 4.8030502813161766e-05, + "loss": 7.1542, + "step": 1320 + }, + { + "epoch": 0.12738669238187078, + "grad_norm": 5.178071975708008, + "learning_rate": 4.802755525011048e-05, + "loss": 7.06, + "step": 1321 + }, + { + "epoch": 0.1274831243972999, + "grad_norm": 3.4711318016052246, + "learning_rate": 4.802460557361852e-05, + "loss": 7.1224, + "step": 1322 + }, + { + "epoch": 0.12757955641272903, + "grad_norm": 4.636763572692871, + "learning_rate": 4.8021653783956585e-05, + "loss": 7.1857, + "step": 1323 + }, + { + "epoch": 0.12767598842815814, + "grad_norm": 3.860690116882324, + "learning_rate": 4.8018699881395616e-05, + "loss": 7.145, + "step": 1324 + }, + { + "epoch": 0.12777242044358728, + "grad_norm": 3.0911622047424316, + "learning_rate": 4.8015743866206696e-05, + "loss": 7.1366, + "step": 1325 + }, + { + "epoch": 0.12786885245901639, + "grad_norm": 4.485548973083496, + "learning_rate": 4.8012785738661134e-05, + "loss": 7.0236, + "step": 1326 + }, + { + "epoch": 0.12796528447444552, + "grad_norm": 9.610434532165527, + "learning_rate": 4.800982549903043e-05, + "loss": 6.9051, + "step": 1327 + }, + { + "epoch": 0.12806171648987463, + "grad_norm": 10.58263874053955, + "learning_rate": 4.800686314758624e-05, + "loss": 6.7681, + "step": 1328 + }, + { + "epoch": 0.12815814850530377, + "grad_norm": 7.696735858917236, + "learning_rate": 4.800389868460049e-05, + "loss": 6.7771, + "step": 1329 + }, + { + "epoch": 0.12825458052073288, + "grad_norm": 6.268252849578857, + "learning_rate": 4.800093211034522e-05, + "loss": 6.8019, + "step": 1330 + }, + { + "epoch": 0.12835101253616202, + "grad_norm": 10.353989601135254, + "learning_rate": 4.7997963425092716e-05, + "loss": 6.7361, + "step": 1331 + }, + { + "epoch": 0.12844744455159113, + "grad_norm": 11.671670913696289, + "learning_rate": 4.7994992629115434e-05, + "loss": 6.7238, + "step": 1332 + }, + { + "epoch": 0.12854387656702024, + "grad_norm": 11.043278694152832, + "learning_rate": 4.799201972268602e-05, + "loss": 6.6548, + "step": 1333 + }, + { + "epoch": 0.12864030858244938, + "grad_norm": 7.280984401702881, + "learning_rate": 4.7989044706077345e-05, + "loss": 6.4995, + "step": 1334 + }, + { + "epoch": 0.12873674059787849, + "grad_norm": 4.012502670288086, + "learning_rate": 4.798606757956244e-05, + "loss": 6.5013, + "step": 1335 + }, + { + "epoch": 0.12883317261330762, + "grad_norm": 5.816833972930908, + "learning_rate": 4.7983088343414545e-05, + "loss": 6.5633, + "step": 1336 + }, + { + "epoch": 0.12892960462873673, + "grad_norm": 8.176048278808594, + "learning_rate": 4.798010699790709e-05, + "loss": 6.5913, + "step": 1337 + }, + { + "epoch": 0.12902603664416587, + "grad_norm": 5.177139759063721, + "learning_rate": 4.797712354331369e-05, + "loss": 6.7659, + "step": 1338 + }, + { + "epoch": 0.12912246865959498, + "grad_norm": 4.561075687408447, + "learning_rate": 4.7974137979908185e-05, + "loss": 6.5695, + "step": 1339 + }, + { + "epoch": 0.12921890067502412, + "grad_norm": 5.0588178634643555, + "learning_rate": 4.797115030796456e-05, + "loss": 6.4787, + "step": 1340 + }, + { + "epoch": 0.12931533269045323, + "grad_norm": 3.8341989517211914, + "learning_rate": 4.796816052775705e-05, + "loss": 6.4004, + "step": 1341 + }, + { + "epoch": 0.12941176470588237, + "grad_norm": 8.982300758361816, + "learning_rate": 4.796516863956002e-05, + "loss": 7.3066, + "step": 1342 + }, + { + "epoch": 0.12950819672131147, + "grad_norm": 7.610286712646484, + "learning_rate": 4.796217464364808e-05, + "loss": 6.608, + "step": 1343 + }, + { + "epoch": 0.12960462873674058, + "grad_norm": 6.940805912017822, + "learning_rate": 4.795917854029601e-05, + "loss": 6.3849, + "step": 1344 + }, + { + "epoch": 0.12970106075216972, + "grad_norm": 5.499588966369629, + "learning_rate": 4.79561803297788e-05, + "loss": 6.3262, + "step": 1345 + }, + { + "epoch": 0.12979749276759883, + "grad_norm": 4.05429220199585, + "learning_rate": 4.795318001237161e-05, + "loss": 6.284, + "step": 1346 + }, + { + "epoch": 0.12989392478302797, + "grad_norm": 4.348995685577393, + "learning_rate": 4.795017758834981e-05, + "loss": 6.458, + "step": 1347 + }, + { + "epoch": 0.12999035679845708, + "grad_norm": 3.527035713195801, + "learning_rate": 4.794717305798896e-05, + "loss": 6.5564, + "step": 1348 + }, + { + "epoch": 0.13008678881388622, + "grad_norm": 2.8064236640930176, + "learning_rate": 4.794416642156481e-05, + "loss": 6.5844, + "step": 1349 + }, + { + "epoch": 0.13018322082931533, + "grad_norm": 3.77030086517334, + "learning_rate": 4.79411576793533e-05, + "loss": 6.2607, + "step": 1350 + }, + { + "epoch": 0.13027965284474446, + "grad_norm": 4.653589725494385, + "learning_rate": 4.7938146831630575e-05, + "loss": 6.5126, + "step": 1351 + }, + { + "epoch": 0.13037608486017357, + "grad_norm": 3.8799610137939453, + "learning_rate": 4.793513387867297e-05, + "loss": 6.5141, + "step": 1352 + }, + { + "epoch": 0.1304725168756027, + "grad_norm": 2.8023629188537598, + "learning_rate": 4.793211882075701e-05, + "loss": 6.629, + "step": 1353 + }, + { + "epoch": 0.13056894889103182, + "grad_norm": 4.008967876434326, + "learning_rate": 4.7929101658159404e-05, + "loss": 6.401, + "step": 1354 + }, + { + "epoch": 0.13066538090646093, + "grad_norm": 2.9612205028533936, + "learning_rate": 4.792608239115707e-05, + "loss": 6.5417, + "step": 1355 + }, + { + "epoch": 0.13076181292189007, + "grad_norm": 3.341012477874756, + "learning_rate": 4.792306102002711e-05, + "loss": 6.6363, + "step": 1356 + }, + { + "epoch": 0.13085824493731918, + "grad_norm": 4.2262282371521, + "learning_rate": 4.792003754504682e-05, + "loss": 6.652, + "step": 1357 + }, + { + "epoch": 0.13095467695274832, + "grad_norm": 4.115860462188721, + "learning_rate": 4.7917011966493696e-05, + "loss": 7.086, + "step": 1358 + }, + { + "epoch": 0.13105110896817743, + "grad_norm": 3.645376205444336, + "learning_rate": 4.791398428464543e-05, + "loss": 7.1192, + "step": 1359 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 2.2866406440734863, + "learning_rate": 4.7910954499779875e-05, + "loss": 7.1557, + "step": 1360 + }, + { + "epoch": 0.13124397299903567, + "grad_norm": 3.266589641571045, + "learning_rate": 4.790792261217512e-05, + "loss": 7.2873, + "step": 1361 + }, + { + "epoch": 0.1313404050144648, + "grad_norm": 4.173161506652832, + "learning_rate": 4.790488862210942e-05, + "loss": 7.1248, + "step": 1362 + }, + { + "epoch": 0.13143683702989392, + "grad_norm": 3.030715227127075, + "learning_rate": 4.790185252986124e-05, + "loss": 7.1702, + "step": 1363 + }, + { + "epoch": 0.13153326904532306, + "grad_norm": 2.4789199829101562, + "learning_rate": 4.789881433570922e-05, + "loss": 6.7595, + "step": 1364 + }, + { + "epoch": 0.13162970106075217, + "grad_norm": 2.2338547706604004, + "learning_rate": 4.789577403993221e-05, + "loss": 7.0684, + "step": 1365 + }, + { + "epoch": 0.13172613307618128, + "grad_norm": 4.547529697418213, + "learning_rate": 4.789273164280923e-05, + "loss": 7.0858, + "step": 1366 + }, + { + "epoch": 0.13182256509161042, + "grad_norm": 3.721975326538086, + "learning_rate": 4.788968714461951e-05, + "loss": 7.1512, + "step": 1367 + }, + { + "epoch": 0.13191899710703953, + "grad_norm": 3.155782461166382, + "learning_rate": 4.788664054564249e-05, + "loss": 7.2329, + "step": 1368 + }, + { + "epoch": 0.13201542912246866, + "grad_norm": 3.9552950859069824, + "learning_rate": 4.7883591846157764e-05, + "loss": 6.5618, + "step": 1369 + }, + { + "epoch": 0.13211186113789777, + "grad_norm": 2.4105300903320312, + "learning_rate": 4.788054104644515e-05, + "loss": 7.2543, + "step": 1370 + }, + { + "epoch": 0.1322082931533269, + "grad_norm": 3.005694627761841, + "learning_rate": 4.7877488146784634e-05, + "loss": 7.2564, + "step": 1371 + }, + { + "epoch": 0.13230472516875602, + "grad_norm": 5.246753215789795, + "learning_rate": 4.787443314745641e-05, + "loss": 7.1225, + "step": 1372 + }, + { + "epoch": 0.13240115718418516, + "grad_norm": 7.475520610809326, + "learning_rate": 4.787137604874087e-05, + "loss": 6.7555, + "step": 1373 + }, + { + "epoch": 0.13249758919961427, + "grad_norm": 6.792036533355713, + "learning_rate": 4.786831685091858e-05, + "loss": 6.574, + "step": 1374 + }, + { + "epoch": 0.1325940212150434, + "grad_norm": 3.259542465209961, + "learning_rate": 4.786525555427033e-05, + "loss": 6.6078, + "step": 1375 + }, + { + "epoch": 0.13269045323047252, + "grad_norm": 2.9490342140197754, + "learning_rate": 4.786219215907706e-05, + "loss": 6.5749, + "step": 1376 + }, + { + "epoch": 0.13278688524590163, + "grad_norm": 3.3047258853912354, + "learning_rate": 4.7859126665619925e-05, + "loss": 6.5939, + "step": 1377 + }, + { + "epoch": 0.13288331726133076, + "grad_norm": 5.018588542938232, + "learning_rate": 4.785605907418029e-05, + "loss": 6.4933, + "step": 1378 + }, + { + "epoch": 0.13297974927675987, + "grad_norm": 7.035295009613037, + "learning_rate": 4.7852989385039684e-05, + "loss": 6.5217, + "step": 1379 + }, + { + "epoch": 0.133076181292189, + "grad_norm": 5.070717811584473, + "learning_rate": 4.7849917598479834e-05, + "loss": 6.6128, + "step": 1380 + }, + { + "epoch": 0.13317261330761812, + "grad_norm": 3.4700448513031006, + "learning_rate": 4.784684371478267e-05, + "loss": 6.4712, + "step": 1381 + }, + { + "epoch": 0.13326904532304726, + "grad_norm": 5.184774398803711, + "learning_rate": 4.784376773423032e-05, + "loss": 6.7023, + "step": 1382 + }, + { + "epoch": 0.13336547733847637, + "grad_norm": 7.974812030792236, + "learning_rate": 4.784068965710507e-05, + "loss": 7.3045, + "step": 1383 + }, + { + "epoch": 0.1334619093539055, + "grad_norm": 8.240985870361328, + "learning_rate": 4.783760948368944e-05, + "loss": 7.2747, + "step": 1384 + }, + { + "epoch": 0.13355834136933462, + "grad_norm": 5.724721908569336, + "learning_rate": 4.783452721426612e-05, + "loss": 7.4663, + "step": 1385 + }, + { + "epoch": 0.13365477338476375, + "grad_norm": 3.690551519393921, + "learning_rate": 4.7831442849118e-05, + "loss": 6.2953, + "step": 1386 + }, + { + "epoch": 0.13375120540019286, + "grad_norm": 3.8149590492248535, + "learning_rate": 4.7828356388528155e-05, + "loss": 7.0379, + "step": 1387 + }, + { + "epoch": 0.13384763741562197, + "grad_norm": 3.9142963886260986, + "learning_rate": 4.7825267832779864e-05, + "loss": 7.2393, + "step": 1388 + }, + { + "epoch": 0.1339440694310511, + "grad_norm": 5.532604694366455, + "learning_rate": 4.782217718215657e-05, + "loss": 7.1492, + "step": 1389 + }, + { + "epoch": 0.13404050144648022, + "grad_norm": 5.381924629211426, + "learning_rate": 4.781908443694196e-05, + "loss": 6.984, + "step": 1390 + }, + { + "epoch": 0.13413693346190936, + "grad_norm": 4.69866943359375, + "learning_rate": 4.7815989597419853e-05, + "loss": 7.0973, + "step": 1391 + }, + { + "epoch": 0.13423336547733847, + "grad_norm": 4.605631351470947, + "learning_rate": 4.7812892663874306e-05, + "loss": 6.4037, + "step": 1392 + }, + { + "epoch": 0.1343297974927676, + "grad_norm": 4.039719104766846, + "learning_rate": 4.780979363658955e-05, + "loss": 7.0561, + "step": 1393 + }, + { + "epoch": 0.13442622950819672, + "grad_norm": 3.6495237350463867, + "learning_rate": 4.780669251585001e-05, + "loss": 6.9704, + "step": 1394 + }, + { + "epoch": 0.13452266152362585, + "grad_norm": 6.504954814910889, + "learning_rate": 4.7803589301940304e-05, + "loss": 7.1719, + "step": 1395 + }, + { + "epoch": 0.13461909353905496, + "grad_norm": 6.954169750213623, + "learning_rate": 4.7800483995145234e-05, + "loss": 7.2251, + "step": 1396 + }, + { + "epoch": 0.1347155255544841, + "grad_norm": 4.336275100708008, + "learning_rate": 4.779737659574981e-05, + "loss": 6.9763, + "step": 1397 + }, + { + "epoch": 0.1348119575699132, + "grad_norm": 4.977863311767578, + "learning_rate": 4.7794267104039214e-05, + "loss": 7.0902, + "step": 1398 + }, + { + "epoch": 0.13490838958534232, + "grad_norm": 5.362631320953369, + "learning_rate": 4.779115552029884e-05, + "loss": 7.149, + "step": 1399 + }, + { + "epoch": 0.13500482160077146, + "grad_norm": 3.300624132156372, + "learning_rate": 4.7788041844814266e-05, + "loss": 7.1824, + "step": 1400 + }, + { + "epoch": 0.13510125361620057, + "grad_norm": 2.5886294841766357, + "learning_rate": 4.7784926077871264e-05, + "loss": 6.9527, + "step": 1401 + }, + { + "epoch": 0.1351976856316297, + "grad_norm": 5.781499862670898, + "learning_rate": 4.778180821975578e-05, + "loss": 6.8657, + "step": 1402 + }, + { + "epoch": 0.13529411764705881, + "grad_norm": 5.229467868804932, + "learning_rate": 4.777868827075398e-05, + "loss": 6.6969, + "step": 1403 + }, + { + "epoch": 0.13539054966248795, + "grad_norm": 3.074295997619629, + "learning_rate": 4.777556623115221e-05, + "loss": 6.661, + "step": 1404 + }, + { + "epoch": 0.13548698167791706, + "grad_norm": 3.2602787017822266, + "learning_rate": 4.7772442101237004e-05, + "loss": 7.0979, + "step": 1405 + }, + { + "epoch": 0.1355834136933462, + "grad_norm": 3.376274347305298, + "learning_rate": 4.776931588129508e-05, + "loss": 7.1303, + "step": 1406 + }, + { + "epoch": 0.1356798457087753, + "grad_norm": 2.0409255027770996, + "learning_rate": 4.776618757161338e-05, + "loss": 7.1206, + "step": 1407 + }, + { + "epoch": 0.13577627772420445, + "grad_norm": 3.3393664360046387, + "learning_rate": 4.776305717247901e-05, + "loss": 7.3534, + "step": 1408 + }, + { + "epoch": 0.13587270973963356, + "grad_norm": 4.252486705780029, + "learning_rate": 4.775992468417926e-05, + "loss": 7.1964, + "step": 1409 + }, + { + "epoch": 0.1359691417550627, + "grad_norm": 4.581374645233154, + "learning_rate": 4.7756790107001636e-05, + "loss": 7.2303, + "step": 1410 + }, + { + "epoch": 0.1360655737704918, + "grad_norm": 3.57556414604187, + "learning_rate": 4.775365344123382e-05, + "loss": 7.1888, + "step": 1411 + }, + { + "epoch": 0.13616200578592091, + "grad_norm": 2.3019704818725586, + "learning_rate": 4.775051468716371e-05, + "loss": 7.1305, + "step": 1412 + }, + { + "epoch": 0.13625843780135005, + "grad_norm": 2.7968618869781494, + "learning_rate": 4.774737384507936e-05, + "loss": 7.1917, + "step": 1413 + }, + { + "epoch": 0.13635486981677916, + "grad_norm": 3.1241393089294434, + "learning_rate": 4.7744230915269025e-05, + "loss": 7.2081, + "step": 1414 + }, + { + "epoch": 0.1364513018322083, + "grad_norm": 2.304201364517212, + "learning_rate": 4.774108589802118e-05, + "loss": 7.1006, + "step": 1415 + }, + { + "epoch": 0.1365477338476374, + "grad_norm": 2.5283093452453613, + "learning_rate": 4.773793879362446e-05, + "loss": 7.2975, + "step": 1416 + }, + { + "epoch": 0.13664416586306655, + "grad_norm": 1.7724400758743286, + "learning_rate": 4.77347896023677e-05, + "loss": 7.2892, + "step": 1417 + }, + { + "epoch": 0.13674059787849566, + "grad_norm": 1.4972214698791504, + "learning_rate": 4.7731638324539935e-05, + "loss": 7.2416, + "step": 1418 + }, + { + "epoch": 0.1368370298939248, + "grad_norm": 1.259045124053955, + "learning_rate": 4.772848496043039e-05, + "loss": 7.2406, + "step": 1419 + }, + { + "epoch": 0.1369334619093539, + "grad_norm": 1.8162273168563843, + "learning_rate": 4.7725329510328455e-05, + "loss": 7.2201, + "step": 1420 + }, + { + "epoch": 0.13702989392478304, + "grad_norm": 3.1181929111480713, + "learning_rate": 4.7722171974523755e-05, + "loss": 7.1385, + "step": 1421 + }, + { + "epoch": 0.13712632594021215, + "grad_norm": 4.2943220138549805, + "learning_rate": 4.7719012353306076e-05, + "loss": 7.0451, + "step": 1422 + }, + { + "epoch": 0.13722275795564126, + "grad_norm": 2.929924726486206, + "learning_rate": 4.77158506469654e-05, + "loss": 7.1015, + "step": 1423 + }, + { + "epoch": 0.1373191899710704, + "grad_norm": 2.7517452239990234, + "learning_rate": 4.771268685579192e-05, + "loss": 7.181, + "step": 1424 + }, + { + "epoch": 0.1374156219864995, + "grad_norm": 2.29756236076355, + "learning_rate": 4.7709520980075995e-05, + "loss": 7.2194, + "step": 1425 + }, + { + "epoch": 0.13751205400192865, + "grad_norm": 1.7548775672912598, + "learning_rate": 4.770635302010818e-05, + "loss": 7.1097, + "step": 1426 + }, + { + "epoch": 0.13760848601735776, + "grad_norm": 2.09104323387146, + "learning_rate": 4.770318297617923e-05, + "loss": 7.1428, + "step": 1427 + }, + { + "epoch": 0.1377049180327869, + "grad_norm": 3.059793710708618, + "learning_rate": 4.77000108485801e-05, + "loss": 6.979, + "step": 1428 + }, + { + "epoch": 0.137801350048216, + "grad_norm": 2.2625718116760254, + "learning_rate": 4.76968366376019e-05, + "loss": 6.7327, + "step": 1429 + }, + { + "epoch": 0.13789778206364514, + "grad_norm": 4.647804260253906, + "learning_rate": 4.769366034353598e-05, + "loss": 7.215, + "step": 1430 + }, + { + "epoch": 0.13799421407907425, + "grad_norm": 3.493544816970825, + "learning_rate": 4.769048196667384e-05, + "loss": 7.149, + "step": 1431 + }, + { + "epoch": 0.1380906460945034, + "grad_norm": 3.0944204330444336, + "learning_rate": 4.768730150730719e-05, + "loss": 6.8781, + "step": 1432 + }, + { + "epoch": 0.1381870781099325, + "grad_norm": 4.288383960723877, + "learning_rate": 4.768411896572794e-05, + "loss": 7.3091, + "step": 1433 + }, + { + "epoch": 0.1382835101253616, + "grad_norm": 3.1691699028015137, + "learning_rate": 4.7680934342228164e-05, + "loss": 7.365, + "step": 1434 + }, + { + "epoch": 0.13837994214079075, + "grad_norm": 2.5559675693511963, + "learning_rate": 4.7677747637100156e-05, + "loss": 7.2967, + "step": 1435 + }, + { + "epoch": 0.13847637415621986, + "grad_norm": 2.71998929977417, + "learning_rate": 4.767455885063637e-05, + "loss": 7.1901, + "step": 1436 + }, + { + "epoch": 0.138572806171649, + "grad_norm": 2.1928396224975586, + "learning_rate": 4.767136798312949e-05, + "loss": 7.1663, + "step": 1437 + }, + { + "epoch": 0.1386692381870781, + "grad_norm": 1.6902536153793335, + "learning_rate": 4.766817503487236e-05, + "loss": 7.2698, + "step": 1438 + }, + { + "epoch": 0.13876567020250724, + "grad_norm": 2.9109914302825928, + "learning_rate": 4.766498000615802e-05, + "loss": 7.1788, + "step": 1439 + }, + { + "epoch": 0.13886210221793635, + "grad_norm": 3.4184374809265137, + "learning_rate": 4.7661782897279717e-05, + "loss": 7.1144, + "step": 1440 + }, + { + "epoch": 0.1389585342333655, + "grad_norm": 2.7066588401794434, + "learning_rate": 4.765858370853087e-05, + "loss": 7.0084, + "step": 1441 + }, + { + "epoch": 0.1390549662487946, + "grad_norm": 3.7095606327056885, + "learning_rate": 4.765538244020509e-05, + "loss": 7.2031, + "step": 1442 + }, + { + "epoch": 0.13915139826422374, + "grad_norm": 3.9012858867645264, + "learning_rate": 4.7652179092596205e-05, + "loss": 7.1357, + "step": 1443 + }, + { + "epoch": 0.13924783027965285, + "grad_norm": 4.444908618927002, + "learning_rate": 4.76489736659982e-05, + "loss": 7.0805, + "step": 1444 + }, + { + "epoch": 0.13934426229508196, + "grad_norm": 2.8493432998657227, + "learning_rate": 4.764576616070527e-05, + "loss": 7.1966, + "step": 1445 + }, + { + "epoch": 0.1394406943105111, + "grad_norm": 1.7852654457092285, + "learning_rate": 4.764255657701179e-05, + "loss": 6.8927, + "step": 1446 + }, + { + "epoch": 0.1395371263259402, + "grad_norm": 3.0515248775482178, + "learning_rate": 4.7639344915212334e-05, + "loss": 6.8634, + "step": 1447 + }, + { + "epoch": 0.13963355834136934, + "grad_norm": 4.251338005065918, + "learning_rate": 4.7636131175601674e-05, + "loss": 7.1871, + "step": 1448 + }, + { + "epoch": 0.13972999035679845, + "grad_norm": 3.037013530731201, + "learning_rate": 4.763291535847475e-05, + "loss": 7.178, + "step": 1449 + }, + { + "epoch": 0.1398264223722276, + "grad_norm": 3.9990122318267822, + "learning_rate": 4.762969746412671e-05, + "loss": 6.8645, + "step": 1450 + }, + { + "epoch": 0.1399228543876567, + "grad_norm": 5.325549602508545, + "learning_rate": 4.7626477492852896e-05, + "loss": 6.9198, + "step": 1451 + }, + { + "epoch": 0.14001928640308584, + "grad_norm": 4.666787147521973, + "learning_rate": 4.7623255444948825e-05, + "loss": 7.2674, + "step": 1452 + }, + { + "epoch": 0.14011571841851495, + "grad_norm": 6.384138107299805, + "learning_rate": 4.76200313207102e-05, + "loss": 7.1155, + "step": 1453 + }, + { + "epoch": 0.14021215043394408, + "grad_norm": 5.110367774963379, + "learning_rate": 4.761680512043296e-05, + "loss": 6.8294, + "step": 1454 + }, + { + "epoch": 0.1403085824493732, + "grad_norm": 3.4851973056793213, + "learning_rate": 4.761357684441317e-05, + "loss": 6.8922, + "step": 1455 + }, + { + "epoch": 0.1404050144648023, + "grad_norm": 4.939188480377197, + "learning_rate": 4.7610346492947134e-05, + "loss": 6.6418, + "step": 1456 + }, + { + "epoch": 0.14050144648023144, + "grad_norm": 3.324134588241577, + "learning_rate": 4.7607114066331325e-05, + "loss": 6.8658, + "step": 1457 + }, + { + "epoch": 0.14059787849566055, + "grad_norm": 3.577562093734741, + "learning_rate": 4.7603879564862416e-05, + "loss": 6.8796, + "step": 1458 + }, + { + "epoch": 0.1406943105110897, + "grad_norm": 2.6762855052948, + "learning_rate": 4.760064298883726e-05, + "loss": 6.6494, + "step": 1459 + }, + { + "epoch": 0.1407907425265188, + "grad_norm": 2.546217203140259, + "learning_rate": 4.7597404338552906e-05, + "loss": 7.043, + "step": 1460 + }, + { + "epoch": 0.14088717454194793, + "grad_norm": 3.610311985015869, + "learning_rate": 4.75941636143066e-05, + "loss": 7.1054, + "step": 1461 + }, + { + "epoch": 0.14098360655737704, + "grad_norm": 5.125514507293701, + "learning_rate": 4.759092081639576e-05, + "loss": 6.9286, + "step": 1462 + }, + { + "epoch": 0.14108003857280618, + "grad_norm": 6.223405361175537, + "learning_rate": 4.758767594511801e-05, + "loss": 6.951, + "step": 1463 + }, + { + "epoch": 0.1411764705882353, + "grad_norm": 3.3683412075042725, + "learning_rate": 4.758442900077116e-05, + "loss": 7.0333, + "step": 1464 + }, + { + "epoch": 0.14127290260366443, + "grad_norm": 3.6488282680511475, + "learning_rate": 4.758117998365322e-05, + "loss": 7.167, + "step": 1465 + }, + { + "epoch": 0.14136933461909354, + "grad_norm": 3.380436897277832, + "learning_rate": 4.757792889406237e-05, + "loss": 6.8792, + "step": 1466 + }, + { + "epoch": 0.14146576663452265, + "grad_norm": 3.2010347843170166, + "learning_rate": 4.7574675732297004e-05, + "loss": 6.9116, + "step": 1467 + }, + { + "epoch": 0.1415621986499518, + "grad_norm": 7.415546894073486, + "learning_rate": 4.757142049865568e-05, + "loss": 6.7717, + "step": 1468 + }, + { + "epoch": 0.1416586306653809, + "grad_norm": 6.159844398498535, + "learning_rate": 4.756816319343717e-05, + "loss": 6.9268, + "step": 1469 + }, + { + "epoch": 0.14175506268081003, + "grad_norm": 5.585522174835205, + "learning_rate": 4.7564903816940405e-05, + "loss": 7.1597, + "step": 1470 + }, + { + "epoch": 0.14185149469623914, + "grad_norm": 4.073063850402832, + "learning_rate": 4.756164236946454e-05, + "loss": 7.2202, + "step": 1471 + }, + { + "epoch": 0.14194792671166828, + "grad_norm": 3.3549818992614746, + "learning_rate": 4.7558378851308914e-05, + "loss": 7.0217, + "step": 1472 + }, + { + "epoch": 0.1420443587270974, + "grad_norm": 3.6694483757019043, + "learning_rate": 4.755511326277304e-05, + "loss": 7.1157, + "step": 1473 + }, + { + "epoch": 0.14214079074252653, + "grad_norm": 4.128441333770752, + "learning_rate": 4.755184560415664e-05, + "loss": 7.0776, + "step": 1474 + }, + { + "epoch": 0.14223722275795564, + "grad_norm": 3.937960147857666, + "learning_rate": 4.75485758757596e-05, + "loss": 7.0882, + "step": 1475 + }, + { + "epoch": 0.14233365477338478, + "grad_norm": 2.5523972511291504, + "learning_rate": 4.754530407788202e-05, + "loss": 6.9827, + "step": 1476 + }, + { + "epoch": 0.1424300867888139, + "grad_norm": 2.586390972137451, + "learning_rate": 4.754203021082418e-05, + "loss": 6.9247, + "step": 1477 + }, + { + "epoch": 0.142526518804243, + "grad_norm": 2.6148838996887207, + "learning_rate": 4.753875427488655e-05, + "loss": 7.2124, + "step": 1478 + }, + { + "epoch": 0.14262295081967213, + "grad_norm": 2.317155361175537, + "learning_rate": 4.753547627036979e-05, + "loss": 7.3128, + "step": 1479 + }, + { + "epoch": 0.14271938283510124, + "grad_norm": 2.8585944175720215, + "learning_rate": 4.7532196197574766e-05, + "loss": 7.0598, + "step": 1480 + }, + { + "epoch": 0.14281581485053038, + "grad_norm": 2.436184883117676, + "learning_rate": 4.75289140568025e-05, + "loss": 7.0934, + "step": 1481 + }, + { + "epoch": 0.1429122468659595, + "grad_norm": 1.9885950088500977, + "learning_rate": 4.7525629848354234e-05, + "loss": 7.2593, + "step": 1482 + }, + { + "epoch": 0.14300867888138863, + "grad_norm": 3.1847314834594727, + "learning_rate": 4.7522343572531384e-05, + "loss": 7.3926, + "step": 1483 + }, + { + "epoch": 0.14310511089681774, + "grad_norm": 3.5565595626831055, + "learning_rate": 4.751905522963556e-05, + "loss": 6.8907, + "step": 1484 + }, + { + "epoch": 0.14320154291224688, + "grad_norm": 4.189769268035889, + "learning_rate": 4.751576481996857e-05, + "loss": 7.21, + "step": 1485 + }, + { + "epoch": 0.143297974927676, + "grad_norm": 1.826671838760376, + "learning_rate": 4.7512472343832384e-05, + "loss": 7.2789, + "step": 1486 + }, + { + "epoch": 0.14339440694310512, + "grad_norm": 2.1048407554626465, + "learning_rate": 4.7509177801529205e-05, + "loss": 7.2592, + "step": 1487 + }, + { + "epoch": 0.14349083895853423, + "grad_norm": 5.093955993652344, + "learning_rate": 4.750588119336138e-05, + "loss": 6.9952, + "step": 1488 + }, + { + "epoch": 0.14358727097396334, + "grad_norm": 1.973682165145874, + "learning_rate": 4.750258251963149e-05, + "loss": 6.9564, + "step": 1489 + }, + { + "epoch": 0.14368370298939248, + "grad_norm": 4.32019567489624, + "learning_rate": 4.749928178064228e-05, + "loss": 6.8551, + "step": 1490 + }, + { + "epoch": 0.1437801350048216, + "grad_norm": 2.3317105770111084, + "learning_rate": 4.749597897669666e-05, + "loss": 6.8383, + "step": 1491 + }, + { + "epoch": 0.14387656702025073, + "grad_norm": 3.29215669631958, + "learning_rate": 4.749267410809779e-05, + "loss": 7.0025, + "step": 1492 + }, + { + "epoch": 0.14397299903567984, + "grad_norm": 3.4216411113739014, + "learning_rate": 4.748936717514897e-05, + "loss": 7.1433, + "step": 1493 + }, + { + "epoch": 0.14406943105110898, + "grad_norm": 2.344186782836914, + "learning_rate": 4.7486058178153713e-05, + "loss": 7.1066, + "step": 1494 + }, + { + "epoch": 0.14416586306653809, + "grad_norm": 2.2700610160827637, + "learning_rate": 4.7482747117415714e-05, + "loss": 7.0219, + "step": 1495 + }, + { + "epoch": 0.14426229508196722, + "grad_norm": 2.045239210128784, + "learning_rate": 4.747943399323885e-05, + "loss": 7.1325, + "step": 1496 + }, + { + "epoch": 0.14435872709739633, + "grad_norm": 2.7328357696533203, + "learning_rate": 4.747611880592721e-05, + "loss": 7.1747, + "step": 1497 + }, + { + "epoch": 0.14445515911282547, + "grad_norm": 2.2133545875549316, + "learning_rate": 4.747280155578505e-05, + "loss": 6.9338, + "step": 1498 + }, + { + "epoch": 0.14455159112825458, + "grad_norm": 2.1961286067962646, + "learning_rate": 4.7469482243116814e-05, + "loss": 6.9549, + "step": 1499 + }, + { + "epoch": 0.1446480231436837, + "grad_norm": 2.502500295639038, + "learning_rate": 4.746616086822716e-05, + "loss": 7.0853, + "step": 1500 + }, + { + "epoch": 0.14474445515911283, + "grad_norm": 4.178162097930908, + "learning_rate": 4.7462837431420914e-05, + "loss": 7.1286, + "step": 1501 + }, + { + "epoch": 0.14484088717454194, + "grad_norm": 2.5148813724517822, + "learning_rate": 4.745951193300309e-05, + "loss": 7.1966, + "step": 1502 + }, + { + "epoch": 0.14493731918997108, + "grad_norm": 3.525099039077759, + "learning_rate": 4.7456184373278906e-05, + "loss": 7.2012, + "step": 1503 + }, + { + "epoch": 0.14503375120540019, + "grad_norm": 4.232686996459961, + "learning_rate": 4.7452854752553756e-05, + "loss": 7.0317, + "step": 1504 + }, + { + "epoch": 0.14513018322082932, + "grad_norm": 5.112872123718262, + "learning_rate": 4.744952307113324e-05, + "loss": 7.2129, + "step": 1505 + }, + { + "epoch": 0.14522661523625843, + "grad_norm": 1.8115507364273071, + "learning_rate": 4.744618932932312e-05, + "loss": 7.048, + "step": 1506 + }, + { + "epoch": 0.14532304725168757, + "grad_norm": 2.016359567642212, + "learning_rate": 4.7442853527429375e-05, + "loss": 7.1005, + "step": 1507 + }, + { + "epoch": 0.14541947926711668, + "grad_norm": 2.147184371948242, + "learning_rate": 4.743951566575815e-05, + "loss": 7.0633, + "step": 1508 + }, + { + "epoch": 0.14551591128254582, + "grad_norm": 3.319457769393921, + "learning_rate": 4.7436175744615805e-05, + "loss": 6.8854, + "step": 1509 + }, + { + "epoch": 0.14561234329797493, + "grad_norm": 2.833875894546509, + "learning_rate": 4.743283376430886e-05, + "loss": 7.0719, + "step": 1510 + }, + { + "epoch": 0.14570877531340404, + "grad_norm": 2.463930606842041, + "learning_rate": 4.742948972514404e-05, + "loss": 7.1921, + "step": 1511 + }, + { + "epoch": 0.14580520732883318, + "grad_norm": 1.9980497360229492, + "learning_rate": 4.742614362742826e-05, + "loss": 7.1128, + "step": 1512 + }, + { + "epoch": 0.14590163934426228, + "grad_norm": 3.2323105335235596, + "learning_rate": 4.742279547146862e-05, + "loss": 7.1635, + "step": 1513 + }, + { + "epoch": 0.14599807135969142, + "grad_norm": 3.606111526489258, + "learning_rate": 4.741944525757242e-05, + "loss": 7.322, + "step": 1514 + }, + { + "epoch": 0.14609450337512053, + "grad_norm": 2.694765567779541, + "learning_rate": 4.741609298604711e-05, + "loss": 7.286, + "step": 1515 + }, + { + "epoch": 0.14619093539054967, + "grad_norm": 2.9078474044799805, + "learning_rate": 4.741273865720039e-05, + "loss": 7.2747, + "step": 1516 + }, + { + "epoch": 0.14628736740597878, + "grad_norm": 2.3657166957855225, + "learning_rate": 4.7409382271340084e-05, + "loss": 6.9932, + "step": 1517 + }, + { + "epoch": 0.14638379942140792, + "grad_norm": 3.235893726348877, + "learning_rate": 4.7406023828774274e-05, + "loss": 6.7878, + "step": 1518 + }, + { + "epoch": 0.14648023143683703, + "grad_norm": 2.6321499347686768, + "learning_rate": 4.740266332981116e-05, + "loss": 7.1272, + "step": 1519 + }, + { + "epoch": 0.14657666345226616, + "grad_norm": 2.250006675720215, + "learning_rate": 4.739930077475919e-05, + "loss": 7.1594, + "step": 1520 + }, + { + "epoch": 0.14667309546769527, + "grad_norm": 6.42025089263916, + "learning_rate": 4.739593616392696e-05, + "loss": 7.095, + "step": 1521 + }, + { + "epoch": 0.14676952748312438, + "grad_norm": 3.1445677280426025, + "learning_rate": 4.739256949762327e-05, + "loss": 6.9062, + "step": 1522 + }, + { + "epoch": 0.14686595949855352, + "grad_norm": 3.0555357933044434, + "learning_rate": 4.738920077615712e-05, + "loss": 6.8099, + "step": 1523 + }, + { + "epoch": 0.14696239151398263, + "grad_norm": 3.8681418895721436, + "learning_rate": 4.738582999983767e-05, + "loss": 6.9816, + "step": 1524 + }, + { + "epoch": 0.14705882352941177, + "grad_norm": 3.3748562335968018, + "learning_rate": 4.73824571689743e-05, + "loss": 7.0532, + "step": 1525 + }, + { + "epoch": 0.14715525554484088, + "grad_norm": 2.6328518390655518, + "learning_rate": 4.7379082283876566e-05, + "loss": 7.0938, + "step": 1526 + }, + { + "epoch": 0.14725168756027002, + "grad_norm": 2.7508952617645264, + "learning_rate": 4.737570534485419e-05, + "loss": 6.8891, + "step": 1527 + }, + { + "epoch": 0.14734811957569913, + "grad_norm": 3.5398001670837402, + "learning_rate": 4.737232635221713e-05, + "loss": 7.0213, + "step": 1528 + }, + { + "epoch": 0.14744455159112826, + "grad_norm": 4.408915042877197, + "learning_rate": 4.736894530627548e-05, + "loss": 7.04, + "step": 1529 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 3.889435291290283, + "learning_rate": 4.736556220733957e-05, + "loss": 6.9013, + "step": 1530 + }, + { + "epoch": 0.1476374156219865, + "grad_norm": 2.8994550704956055, + "learning_rate": 4.736217705571989e-05, + "loss": 7.0701, + "step": 1531 + }, + { + "epoch": 0.14773384763741562, + "grad_norm": 3.843791961669922, + "learning_rate": 4.7358789851727124e-05, + "loss": 6.6753, + "step": 1532 + }, + { + "epoch": 0.14783027965284473, + "grad_norm": 3.47680926322937, + "learning_rate": 4.7355400595672136e-05, + "loss": 6.9555, + "step": 1533 + }, + { + "epoch": 0.14792671166827387, + "grad_norm": 3.0597856044769287, + "learning_rate": 4.7352009287866005e-05, + "loss": 7.0627, + "step": 1534 + }, + { + "epoch": 0.14802314368370298, + "grad_norm": 2.4877219200134277, + "learning_rate": 4.7348615928619964e-05, + "loss": 6.7942, + "step": 1535 + }, + { + "epoch": 0.14811957569913212, + "grad_norm": 3.395073652267456, + "learning_rate": 4.734522051824547e-05, + "loss": 6.7946, + "step": 1536 + }, + { + "epoch": 0.14821600771456123, + "grad_norm": 2.152977705001831, + "learning_rate": 4.734182305705414e-05, + "loss": 6.9322, + "step": 1537 + }, + { + "epoch": 0.14831243972999036, + "grad_norm": 2.6432666778564453, + "learning_rate": 4.733842354535778e-05, + "loss": 6.9876, + "step": 1538 + }, + { + "epoch": 0.14840887174541947, + "grad_norm": 2.7355806827545166, + "learning_rate": 4.7335021983468404e-05, + "loss": 6.9369, + "step": 1539 + }, + { + "epoch": 0.1485053037608486, + "grad_norm": 2.7209529876708984, + "learning_rate": 4.733161837169819e-05, + "loss": 6.9797, + "step": 1540 + }, + { + "epoch": 0.14860173577627772, + "grad_norm": 2.5985488891601562, + "learning_rate": 4.7328212710359544e-05, + "loss": 7.0173, + "step": 1541 + }, + { + "epoch": 0.14869816779170686, + "grad_norm": 2.771484136581421, + "learning_rate": 4.732480499976502e-05, + "loss": 7.1221, + "step": 1542 + }, + { + "epoch": 0.14879459980713597, + "grad_norm": 2.820298910140991, + "learning_rate": 4.732139524022735e-05, + "loss": 6.953, + "step": 1543 + }, + { + "epoch": 0.14889103182256508, + "grad_norm": 3.7108495235443115, + "learning_rate": 4.731798343205951e-05, + "loss": 7.0491, + "step": 1544 + }, + { + "epoch": 0.14898746383799422, + "grad_norm": 2.9169538021087646, + "learning_rate": 4.731456957557462e-05, + "loss": 7.0473, + "step": 1545 + }, + { + "epoch": 0.14908389585342333, + "grad_norm": 2.635296106338501, + "learning_rate": 4.7311153671085996e-05, + "loss": 6.9669, + "step": 1546 + }, + { + "epoch": 0.14918032786885246, + "grad_norm": 3.720771551132202, + "learning_rate": 4.7307735718907156e-05, + "loss": 7.03, + "step": 1547 + }, + { + "epoch": 0.14927675988428157, + "grad_norm": 4.228583335876465, + "learning_rate": 4.730431571935178e-05, + "loss": 6.7952, + "step": 1548 + }, + { + "epoch": 0.1493731918997107, + "grad_norm": 2.331291675567627, + "learning_rate": 4.730089367273376e-05, + "loss": 6.5438, + "step": 1549 + }, + { + "epoch": 0.14946962391513982, + "grad_norm": 2.1174187660217285, + "learning_rate": 4.729746957936717e-05, + "loss": 6.7996, + "step": 1550 + }, + { + "epoch": 0.14956605593056896, + "grad_norm": 3.270839214324951, + "learning_rate": 4.729404343956626e-05, + "loss": 6.9129, + "step": 1551 + }, + { + "epoch": 0.14966248794599807, + "grad_norm": 5.356337070465088, + "learning_rate": 4.7290615253645485e-05, + "loss": 6.8964, + "step": 1552 + }, + { + "epoch": 0.1497589199614272, + "grad_norm": 5.158341407775879, + "learning_rate": 4.728718502191948e-05, + "loss": 6.6968, + "step": 1553 + }, + { + "epoch": 0.14985535197685632, + "grad_norm": 3.7917656898498535, + "learning_rate": 4.728375274470307e-05, + "loss": 6.9891, + "step": 1554 + }, + { + "epoch": 0.14995178399228543, + "grad_norm": 2.5613656044006348, + "learning_rate": 4.728031842231125e-05, + "loss": 6.9302, + "step": 1555 + }, + { + "epoch": 0.15004821600771456, + "grad_norm": 5.328647613525391, + "learning_rate": 4.7276882055059234e-05, + "loss": 6.8544, + "step": 1556 + }, + { + "epoch": 0.15014464802314367, + "grad_norm": 4.885748863220215, + "learning_rate": 4.727344364326239e-05, + "loss": 7.1123, + "step": 1557 + }, + { + "epoch": 0.1502410800385728, + "grad_norm": 3.3411920070648193, + "learning_rate": 4.727000318723632e-05, + "loss": 6.9181, + "step": 1558 + }, + { + "epoch": 0.15033751205400192, + "grad_norm": 5.265377998352051, + "learning_rate": 4.726656068729676e-05, + "loss": 7.1734, + "step": 1559 + }, + { + "epoch": 0.15043394406943106, + "grad_norm": 5.009007930755615, + "learning_rate": 4.726311614375967e-05, + "loss": 6.9489, + "step": 1560 + }, + { + "epoch": 0.15053037608486017, + "grad_norm": 2.928086519241333, + "learning_rate": 4.7259669556941164e-05, + "loss": 7.0994, + "step": 1561 + }, + { + "epoch": 0.1506268081002893, + "grad_norm": 3.287931203842163, + "learning_rate": 4.72562209271576e-05, + "loss": 7.1265, + "step": 1562 + }, + { + "epoch": 0.15072324011571842, + "grad_norm": 2.827148199081421, + "learning_rate": 4.725277025472547e-05, + "loss": 7.155, + "step": 1563 + }, + { + "epoch": 0.15081967213114755, + "grad_norm": 3.79838228225708, + "learning_rate": 4.7249317539961476e-05, + "loss": 7.06, + "step": 1564 + }, + { + "epoch": 0.15091610414657666, + "grad_norm": 2.9123003482818604, + "learning_rate": 4.7245862783182496e-05, + "loss": 7.0515, + "step": 1565 + }, + { + "epoch": 0.15101253616200577, + "grad_norm": 2.509793996810913, + "learning_rate": 4.724240598470562e-05, + "loss": 6.9376, + "step": 1566 + }, + { + "epoch": 0.1511089681774349, + "grad_norm": 1.8083585500717163, + "learning_rate": 4.723894714484809e-05, + "loss": 6.8187, + "step": 1567 + }, + { + "epoch": 0.15120540019286402, + "grad_norm": 2.7149314880371094, + "learning_rate": 4.7235486263927356e-05, + "loss": 6.8061, + "step": 1568 + }, + { + "epoch": 0.15130183220829316, + "grad_norm": 2.155200958251953, + "learning_rate": 4.723202334226107e-05, + "loss": 6.7965, + "step": 1569 + }, + { + "epoch": 0.15139826422372227, + "grad_norm": 2.133831739425659, + "learning_rate": 4.7228558380167044e-05, + "loss": 6.9249, + "step": 1570 + }, + { + "epoch": 0.1514946962391514, + "grad_norm": 2.291635751724243, + "learning_rate": 4.722509137796329e-05, + "loss": 6.8915, + "step": 1571 + }, + { + "epoch": 0.15159112825458051, + "grad_norm": 4.1780104637146, + "learning_rate": 4.722162233596801e-05, + "loss": 6.7312, + "step": 1572 + }, + { + "epoch": 0.15168756027000965, + "grad_norm": 4.519463539123535, + "learning_rate": 4.721815125449957e-05, + "loss": 6.9729, + "step": 1573 + }, + { + "epoch": 0.15178399228543876, + "grad_norm": 3.9464690685272217, + "learning_rate": 4.721467813387655e-05, + "loss": 6.8626, + "step": 1574 + }, + { + "epoch": 0.1518804243008679, + "grad_norm": 3.181406259536743, + "learning_rate": 4.721120297441773e-05, + "loss": 7.0478, + "step": 1575 + }, + { + "epoch": 0.151976856316297, + "grad_norm": 3.5450611114501953, + "learning_rate": 4.720772577644203e-05, + "loss": 7.0863, + "step": 1576 + }, + { + "epoch": 0.15207328833172612, + "grad_norm": 3.3168418407440186, + "learning_rate": 4.720424654026859e-05, + "loss": 6.9869, + "step": 1577 + }, + { + "epoch": 0.15216972034715526, + "grad_norm": 2.109327793121338, + "learning_rate": 4.720076526621674e-05, + "loss": 6.8925, + "step": 1578 + }, + { + "epoch": 0.15226615236258437, + "grad_norm": 2.8523335456848145, + "learning_rate": 4.7197281954605966e-05, + "loss": 6.6067, + "step": 1579 + }, + { + "epoch": 0.1523625843780135, + "grad_norm": 3.8801989555358887, + "learning_rate": 4.719379660575598e-05, + "loss": 6.9992, + "step": 1580 + }, + { + "epoch": 0.15245901639344261, + "grad_norm": 4.2881598472595215, + "learning_rate": 4.719030921998666e-05, + "loss": 7.1354, + "step": 1581 + }, + { + "epoch": 0.15255544840887175, + "grad_norm": 1.7773375511169434, + "learning_rate": 4.718681979761807e-05, + "loss": 7.0703, + "step": 1582 + }, + { + "epoch": 0.15265188042430086, + "grad_norm": 3.5569357872009277, + "learning_rate": 4.718332833897046e-05, + "loss": 6.7526, + "step": 1583 + }, + { + "epoch": 0.15274831243973, + "grad_norm": 4.004058361053467, + "learning_rate": 4.717983484436429e-05, + "loss": 7.0691, + "step": 1584 + }, + { + "epoch": 0.1528447444551591, + "grad_norm": 4.683288097381592, + "learning_rate": 4.717633931412017e-05, + "loss": 6.9783, + "step": 1585 + }, + { + "epoch": 0.15294117647058825, + "grad_norm": 6.068997383117676, + "learning_rate": 4.717284174855892e-05, + "loss": 6.7816, + "step": 1586 + }, + { + "epoch": 0.15303760848601736, + "grad_norm": 2.441666603088379, + "learning_rate": 4.716934214800155e-05, + "loss": 6.7126, + "step": 1587 + }, + { + "epoch": 0.15313404050144647, + "grad_norm": 2.9009077548980713, + "learning_rate": 4.716584051276924e-05, + "loss": 6.8113, + "step": 1588 + }, + { + "epoch": 0.1532304725168756, + "grad_norm": 2.57731556892395, + "learning_rate": 4.716233684318337e-05, + "loss": 6.9901, + "step": 1589 + }, + { + "epoch": 0.15332690453230471, + "grad_norm": 2.480916976928711, + "learning_rate": 4.715883113956551e-05, + "loss": 6.9397, + "step": 1590 + }, + { + "epoch": 0.15342333654773385, + "grad_norm": 2.773987293243408, + "learning_rate": 4.7155323402237395e-05, + "loss": 6.8992, + "step": 1591 + }, + { + "epoch": 0.15351976856316296, + "grad_norm": 2.0383121967315674, + "learning_rate": 4.7151813631520966e-05, + "loss": 6.9128, + "step": 1592 + }, + { + "epoch": 0.1536162005785921, + "grad_norm": 2.1904091835021973, + "learning_rate": 4.7148301827738336e-05, + "loss": 6.9987, + "step": 1593 + }, + { + "epoch": 0.1537126325940212, + "grad_norm": 2.650275230407715, + "learning_rate": 4.714478799121184e-05, + "loss": 6.8966, + "step": 1594 + }, + { + "epoch": 0.15380906460945035, + "grad_norm": 2.7271928787231445, + "learning_rate": 4.714127212226396e-05, + "loss": 7.038, + "step": 1595 + }, + { + "epoch": 0.15390549662487946, + "grad_norm": 3.3188552856445312, + "learning_rate": 4.713775422121737e-05, + "loss": 6.8297, + "step": 1596 + }, + { + "epoch": 0.1540019286403086, + "grad_norm": 3.1841588020324707, + "learning_rate": 4.713423428839494e-05, + "loss": 6.7668, + "step": 1597 + }, + { + "epoch": 0.1540983606557377, + "grad_norm": 4.0068278312683105, + "learning_rate": 4.713071232411973e-05, + "loss": 6.7274, + "step": 1598 + }, + { + "epoch": 0.1541947926711668, + "grad_norm": 2.3689117431640625, + "learning_rate": 4.7127188328714986e-05, + "loss": 6.8638, + "step": 1599 + }, + { + "epoch": 0.15429122468659595, + "grad_norm": 2.4484241008758545, + "learning_rate": 4.7123662302504136e-05, + "loss": 6.8676, + "step": 1600 + }, + { + "epoch": 0.15438765670202506, + "grad_norm": 3.3646347522735596, + "learning_rate": 4.712013424581077e-05, + "loss": 6.7072, + "step": 1601 + }, + { + "epoch": 0.1544840887174542, + "grad_norm": 2.221536636352539, + "learning_rate": 4.7116604158958736e-05, + "loss": 6.8321, + "step": 1602 + }, + { + "epoch": 0.1545805207328833, + "grad_norm": 1.9830851554870605, + "learning_rate": 4.711307204227198e-05, + "loss": 6.7963, + "step": 1603 + }, + { + "epoch": 0.15467695274831245, + "grad_norm": 2.740218162536621, + "learning_rate": 4.7109537896074686e-05, + "loss": 6.9255, + "step": 1604 + }, + { + "epoch": 0.15477338476374156, + "grad_norm": 2.7812211513519287, + "learning_rate": 4.7106001720691216e-05, + "loss": 6.8663, + "step": 1605 + }, + { + "epoch": 0.1548698167791707, + "grad_norm": 2.1307225227355957, + "learning_rate": 4.7102463516446116e-05, + "loss": 6.7293, + "step": 1606 + }, + { + "epoch": 0.1549662487945998, + "grad_norm": 3.054504156112671, + "learning_rate": 4.709892328366412e-05, + "loss": 6.9155, + "step": 1607 + }, + { + "epoch": 0.15506268081002894, + "grad_norm": 3.748260021209717, + "learning_rate": 4.709538102267015e-05, + "loss": 7.1579, + "step": 1608 + }, + { + "epoch": 0.15515911282545805, + "grad_norm": 2.8804075717926025, + "learning_rate": 4.70918367337893e-05, + "loss": 6.945, + "step": 1609 + }, + { + "epoch": 0.15525554484088716, + "grad_norm": 3.248958110809326, + "learning_rate": 4.708829041734687e-05, + "loss": 6.8125, + "step": 1610 + }, + { + "epoch": 0.1553519768563163, + "grad_norm": 4.654376983642578, + "learning_rate": 4.708474207366832e-05, + "loss": 6.6209, + "step": 1611 + }, + { + "epoch": 0.1554484088717454, + "grad_norm": 3.6690824031829834, + "learning_rate": 4.708119170307933e-05, + "loss": 6.9446, + "step": 1612 + }, + { + "epoch": 0.15554484088717455, + "grad_norm": 1.9863308668136597, + "learning_rate": 4.707763930590575e-05, + "loss": 6.7781, + "step": 1613 + }, + { + "epoch": 0.15564127290260366, + "grad_norm": 2.091304063796997, + "learning_rate": 4.70740848824736e-05, + "loss": 6.8623, + "step": 1614 + }, + { + "epoch": 0.1557377049180328, + "grad_norm": 2.683678150177002, + "learning_rate": 4.7070528433109114e-05, + "loss": 6.8652, + "step": 1615 + }, + { + "epoch": 0.1558341369334619, + "grad_norm": 2.613877534866333, + "learning_rate": 4.706696995813868e-05, + "loss": 7.0244, + "step": 1616 + }, + { + "epoch": 0.15593056894889104, + "grad_norm": 2.199794292449951, + "learning_rate": 4.706340945788892e-05, + "loss": 6.8632, + "step": 1617 + }, + { + "epoch": 0.15602700096432015, + "grad_norm": 3.0507731437683105, + "learning_rate": 4.705984693268659e-05, + "loss": 6.8654, + "step": 1618 + }, + { + "epoch": 0.1561234329797493, + "grad_norm": 3.031449556350708, + "learning_rate": 4.705628238285866e-05, + "loss": 7.1051, + "step": 1619 + }, + { + "epoch": 0.1562198649951784, + "grad_norm": 2.8776912689208984, + "learning_rate": 4.705271580873227e-05, + "loss": 6.8034, + "step": 1620 + }, + { + "epoch": 0.15631629701060754, + "grad_norm": 2.8407130241394043, + "learning_rate": 4.704914721063478e-05, + "loss": 6.7207, + "step": 1621 + }, + { + "epoch": 0.15641272902603665, + "grad_norm": 2.6617517471313477, + "learning_rate": 4.704557658889368e-05, + "loss": 6.7979, + "step": 1622 + }, + { + "epoch": 0.15650916104146576, + "grad_norm": 2.5329413414001465, + "learning_rate": 4.704200394383671e-05, + "loss": 6.9826, + "step": 1623 + }, + { + "epoch": 0.1566055930568949, + "grad_norm": 2.523305892944336, + "learning_rate": 4.7038429275791745e-05, + "loss": 6.9724, + "step": 1624 + }, + { + "epoch": 0.156702025072324, + "grad_norm": 2.4610893726348877, + "learning_rate": 4.7034852585086864e-05, + "loss": 7.0174, + "step": 1625 + }, + { + "epoch": 0.15679845708775314, + "grad_norm": 3.0245485305786133, + "learning_rate": 4.703127387205033e-05, + "loss": 7.0304, + "step": 1626 + }, + { + "epoch": 0.15689488910318225, + "grad_norm": 3.221019744873047, + "learning_rate": 4.70276931370106e-05, + "loss": 6.9979, + "step": 1627 + }, + { + "epoch": 0.1569913211186114, + "grad_norm": 4.046229362487793, + "learning_rate": 4.70241103802963e-05, + "loss": 7.0019, + "step": 1628 + }, + { + "epoch": 0.1570877531340405, + "grad_norm": 2.163240432739258, + "learning_rate": 4.702052560223627e-05, + "loss": 6.8053, + "step": 1629 + }, + { + "epoch": 0.15718418514946964, + "grad_norm": 1.8968032598495483, + "learning_rate": 4.701693880315949e-05, + "loss": 6.9102, + "step": 1630 + }, + { + "epoch": 0.15728061716489874, + "grad_norm": 1.761291265487671, + "learning_rate": 4.701334998339517e-05, + "loss": 6.9958, + "step": 1631 + }, + { + "epoch": 0.15737704918032788, + "grad_norm": 2.6070263385772705, + "learning_rate": 4.700975914327268e-05, + "loss": 7.0822, + "step": 1632 + }, + { + "epoch": 0.157473481195757, + "grad_norm": 2.4770348072052, + "learning_rate": 4.700616628312158e-05, + "loss": 6.9201, + "step": 1633 + }, + { + "epoch": 0.1575699132111861, + "grad_norm": 2.6471874713897705, + "learning_rate": 4.700257140327163e-05, + "loss": 6.7779, + "step": 1634 + }, + { + "epoch": 0.15766634522661524, + "grad_norm": 2.7054290771484375, + "learning_rate": 4.699897450405276e-05, + "loss": 6.9681, + "step": 1635 + }, + { + "epoch": 0.15776277724204435, + "grad_norm": 1.930306077003479, + "learning_rate": 4.699537558579509e-05, + "loss": 6.9443, + "step": 1636 + }, + { + "epoch": 0.1578592092574735, + "grad_norm": 1.8372762203216553, + "learning_rate": 4.699177464882891e-05, + "loss": 7.0514, + "step": 1637 + }, + { + "epoch": 0.1579556412729026, + "grad_norm": 2.627546787261963, + "learning_rate": 4.698817169348473e-05, + "loss": 6.8908, + "step": 1638 + }, + { + "epoch": 0.15805207328833173, + "grad_norm": 1.9001567363739014, + "learning_rate": 4.6984566720093206e-05, + "loss": 6.8812, + "step": 1639 + }, + { + "epoch": 0.15814850530376084, + "grad_norm": 2.5219228267669678, + "learning_rate": 4.698095972898521e-05, + "loss": 6.776, + "step": 1640 + }, + { + "epoch": 0.15824493731918998, + "grad_norm": 3.3942065238952637, + "learning_rate": 4.697735072049179e-05, + "loss": 7.0083, + "step": 1641 + }, + { + "epoch": 0.1583413693346191, + "grad_norm": 2.8214495182037354, + "learning_rate": 4.6973739694944164e-05, + "loss": 6.9523, + "step": 1642 + }, + { + "epoch": 0.15843780135004823, + "grad_norm": 2.943143844604492, + "learning_rate": 4.6970126652673754e-05, + "loss": 6.8362, + "step": 1643 + }, + { + "epoch": 0.15853423336547734, + "grad_norm": 3.516000986099243, + "learning_rate": 4.6966511594012166e-05, + "loss": 6.8266, + "step": 1644 + }, + { + "epoch": 0.15863066538090645, + "grad_norm": 4.347445964813232, + "learning_rate": 4.6962894519291174e-05, + "loss": 7.1132, + "step": 1645 + }, + { + "epoch": 0.1587270973963356, + "grad_norm": 3.741752862930298, + "learning_rate": 4.6959275428842753e-05, + "loss": 6.9586, + "step": 1646 + }, + { + "epoch": 0.1588235294117647, + "grad_norm": 3.444396495819092, + "learning_rate": 4.695565432299906e-05, + "loss": 6.918, + "step": 1647 + }, + { + "epoch": 0.15891996142719383, + "grad_norm": 3.5643725395202637, + "learning_rate": 4.695203120209245e-05, + "loss": 6.8936, + "step": 1648 + }, + { + "epoch": 0.15901639344262294, + "grad_norm": 6.198506832122803, + "learning_rate": 4.694840606645543e-05, + "loss": 7.0359, + "step": 1649 + }, + { + "epoch": 0.15911282545805208, + "grad_norm": 3.5435996055603027, + "learning_rate": 4.6944778916420705e-05, + "loss": 6.9961, + "step": 1650 + }, + { + "epoch": 0.1592092574734812, + "grad_norm": 3.2507553100585938, + "learning_rate": 4.694114975232119e-05, + "loss": 7.0148, + "step": 1651 + }, + { + "epoch": 0.15930568948891033, + "grad_norm": 3.607332468032837, + "learning_rate": 4.693751857448996e-05, + "loss": 6.9489, + "step": 1652 + }, + { + "epoch": 0.15940212150433944, + "grad_norm": 5.474004745483398, + "learning_rate": 4.6933885383260264e-05, + "loss": 6.9209, + "step": 1653 + }, + { + "epoch": 0.15949855351976858, + "grad_norm": 3.435805559158325, + "learning_rate": 4.693025017896557e-05, + "loss": 6.9862, + "step": 1654 + }, + { + "epoch": 0.1595949855351977, + "grad_norm": 2.951266288757324, + "learning_rate": 4.6926612961939504e-05, + "loss": 6.7939, + "step": 1655 + }, + { + "epoch": 0.1596914175506268, + "grad_norm": 2.997162103652954, + "learning_rate": 4.692297373251589e-05, + "loss": 7.0622, + "step": 1656 + }, + { + "epoch": 0.15978784956605593, + "grad_norm": 3.3229732513427734, + "learning_rate": 4.691933249102873e-05, + "loss": 6.8946, + "step": 1657 + }, + { + "epoch": 0.15988428158148504, + "grad_norm": 3.9964592456817627, + "learning_rate": 4.691568923781221e-05, + "loss": 6.6976, + "step": 1658 + }, + { + "epoch": 0.15998071359691418, + "grad_norm": 4.348190784454346, + "learning_rate": 4.6912043973200705e-05, + "loss": 6.8022, + "step": 1659 + }, + { + "epoch": 0.1600771456123433, + "grad_norm": 3.855637550354004, + "learning_rate": 4.690839669752878e-05, + "loss": 6.6563, + "step": 1660 + }, + { + "epoch": 0.16017357762777243, + "grad_norm": 3.8486812114715576, + "learning_rate": 4.690474741113116e-05, + "loss": 6.907, + "step": 1661 + }, + { + "epoch": 0.16027000964320154, + "grad_norm": 3.7076330184936523, + "learning_rate": 4.690109611434279e-05, + "loss": 6.9296, + "step": 1662 + }, + { + "epoch": 0.16036644165863068, + "grad_norm": 3.476219415664673, + "learning_rate": 4.689744280749877e-05, + "loss": 6.7422, + "step": 1663 + }, + { + "epoch": 0.16046287367405979, + "grad_norm": 2.8202948570251465, + "learning_rate": 4.689378749093441e-05, + "loss": 6.7275, + "step": 1664 + }, + { + "epoch": 0.16055930568948892, + "grad_norm": 3.468275547027588, + "learning_rate": 4.689013016498517e-05, + "loss": 7.0127, + "step": 1665 + }, + { + "epoch": 0.16065573770491803, + "grad_norm": 2.8182497024536133, + "learning_rate": 4.688647082998673e-05, + "loss": 7.0471, + "step": 1666 + }, + { + "epoch": 0.16075216972034714, + "grad_norm": 3.2250266075134277, + "learning_rate": 4.6882809486274936e-05, + "loss": 7.0106, + "step": 1667 + }, + { + "epoch": 0.16084860173577628, + "grad_norm": 2.5745849609375, + "learning_rate": 4.6879146134185823e-05, + "loss": 7.086, + "step": 1668 + }, + { + "epoch": 0.1609450337512054, + "grad_norm": 3.8508846759796143, + "learning_rate": 4.6875480774055605e-05, + "loss": 6.9813, + "step": 1669 + }, + { + "epoch": 0.16104146576663453, + "grad_norm": 4.274245262145996, + "learning_rate": 4.687181340622068e-05, + "loss": 6.9713, + "step": 1670 + }, + { + "epoch": 0.16113789778206364, + "grad_norm": 2.0990660190582275, + "learning_rate": 4.686814403101765e-05, + "loss": 6.7939, + "step": 1671 + }, + { + "epoch": 0.16123432979749278, + "grad_norm": 2.294264793395996, + "learning_rate": 4.686447264878327e-05, + "loss": 6.9833, + "step": 1672 + }, + { + "epoch": 0.16133076181292189, + "grad_norm": 2.57818865776062, + "learning_rate": 4.68607992598545e-05, + "loss": 7.0612, + "step": 1673 + }, + { + "epoch": 0.16142719382835102, + "grad_norm": 4.098738193511963, + "learning_rate": 4.6857123864568486e-05, + "loss": 6.8661, + "step": 1674 + }, + { + "epoch": 0.16152362584378013, + "grad_norm": 4.164111137390137, + "learning_rate": 4.685344646326254e-05, + "loss": 6.8918, + "step": 1675 + }, + { + "epoch": 0.16162005785920927, + "grad_norm": 3.5518853664398193, + "learning_rate": 4.684976705627418e-05, + "loss": 6.8223, + "step": 1676 + }, + { + "epoch": 0.16171648987463838, + "grad_norm": 3.277531385421753, + "learning_rate": 4.684608564394108e-05, + "loss": 6.638, + "step": 1677 + }, + { + "epoch": 0.1618129218900675, + "grad_norm": 3.2259178161621094, + "learning_rate": 4.6842402226601144e-05, + "loss": 6.5645, + "step": 1678 + }, + { + "epoch": 0.16190935390549663, + "grad_norm": 4.025298118591309, + "learning_rate": 4.683871680459241e-05, + "loss": 6.6364, + "step": 1679 + }, + { + "epoch": 0.16200578592092574, + "grad_norm": 4.341943740844727, + "learning_rate": 4.683502937825313e-05, + "loss": 6.8052, + "step": 1680 + }, + { + "epoch": 0.16210221793635488, + "grad_norm": 2.9898223876953125, + "learning_rate": 4.6831339947921713e-05, + "loss": 7.0294, + "step": 1681 + }, + { + "epoch": 0.16219864995178399, + "grad_norm": 2.850348949432373, + "learning_rate": 4.6827648513936804e-05, + "loss": 7.239, + "step": 1682 + }, + { + "epoch": 0.16229508196721312, + "grad_norm": 4.301338195800781, + "learning_rate": 4.682395507663717e-05, + "loss": 6.8061, + "step": 1683 + }, + { + "epoch": 0.16239151398264223, + "grad_norm": 4.267773151397705, + "learning_rate": 4.68202596363618e-05, + "loss": 6.9403, + "step": 1684 + }, + { + "epoch": 0.16248794599807137, + "grad_norm": 4.272030353546143, + "learning_rate": 4.681656219344986e-05, + "loss": 6.9877, + "step": 1685 + }, + { + "epoch": 0.16258437801350048, + "grad_norm": 2.732760429382324, + "learning_rate": 4.681286274824069e-05, + "loss": 6.7994, + "step": 1686 + }, + { + "epoch": 0.16268081002892962, + "grad_norm": 2.0142369270324707, + "learning_rate": 4.680916130107383e-05, + "loss": 6.9087, + "step": 1687 + }, + { + "epoch": 0.16277724204435873, + "grad_norm": 2.278698682785034, + "learning_rate": 4.680545785228898e-05, + "loss": 6.7846, + "step": 1688 + }, + { + "epoch": 0.16287367405978784, + "grad_norm": 3.518625259399414, + "learning_rate": 4.6801752402226055e-05, + "loss": 6.7589, + "step": 1689 + }, + { + "epoch": 0.16297010607521698, + "grad_norm": 3.399460554122925, + "learning_rate": 4.679804495122513e-05, + "loss": 6.8197, + "step": 1690 + }, + { + "epoch": 0.16306653809064608, + "grad_norm": 3.3650505542755127, + "learning_rate": 4.679433549962646e-05, + "loss": 6.8089, + "step": 1691 + }, + { + "epoch": 0.16316297010607522, + "grad_norm": 3.463127851486206, + "learning_rate": 4.67906240477705e-05, + "loss": 6.6434, + "step": 1692 + }, + { + "epoch": 0.16325940212150433, + "grad_norm": 2.8286216259002686, + "learning_rate": 4.678691059599789e-05, + "loss": 6.7976, + "step": 1693 + }, + { + "epoch": 0.16335583413693347, + "grad_norm": 3.8683648109436035, + "learning_rate": 4.6783195144649445e-05, + "loss": 6.6936, + "step": 1694 + }, + { + "epoch": 0.16345226615236258, + "grad_norm": 2.9772119522094727, + "learning_rate": 4.6779477694066155e-05, + "loss": 6.8484, + "step": 1695 + }, + { + "epoch": 0.16354869816779172, + "grad_norm": 2.1739227771759033, + "learning_rate": 4.6775758244589205e-05, + "loss": 6.8358, + "step": 1696 + }, + { + "epoch": 0.16364513018322083, + "grad_norm": 2.5116984844207764, + "learning_rate": 4.677203679655998e-05, + "loss": 6.9531, + "step": 1697 + }, + { + "epoch": 0.16374156219864996, + "grad_norm": 3.394111156463623, + "learning_rate": 4.6768313350320006e-05, + "loss": 6.8678, + "step": 1698 + }, + { + "epoch": 0.16383799421407907, + "grad_norm": 2.801257371902466, + "learning_rate": 4.676458790621102e-05, + "loss": 6.7263, + "step": 1699 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 2.1140689849853516, + "learning_rate": 4.6760860464574944e-05, + "loss": 6.5651, + "step": 1700 + }, + { + "epoch": 0.16403085824493732, + "grad_norm": 2.5506951808929443, + "learning_rate": 4.6757131025753886e-05, + "loss": 6.7298, + "step": 1701 + }, + { + "epoch": 0.16412729026036643, + "grad_norm": 3.766819715499878, + "learning_rate": 4.675339959009011e-05, + "loss": 6.8251, + "step": 1702 + }, + { + "epoch": 0.16422372227579557, + "grad_norm": 3.2410454750061035, + "learning_rate": 4.6749666157926106e-05, + "loss": 6.7931, + "step": 1703 + }, + { + "epoch": 0.16432015429122468, + "grad_norm": 3.242682456970215, + "learning_rate": 4.6745930729604505e-05, + "loss": 6.5962, + "step": 1704 + }, + { + "epoch": 0.16441658630665382, + "grad_norm": 2.8443756103515625, + "learning_rate": 4.674219330546814e-05, + "loss": 6.4858, + "step": 1705 + }, + { + "epoch": 0.16451301832208293, + "grad_norm": 3.051806926727295, + "learning_rate": 4.673845388586005e-05, + "loss": 6.4105, + "step": 1706 + }, + { + "epoch": 0.16460945033751206, + "grad_norm": 2.2035932540893555, + "learning_rate": 4.673471247112341e-05, + "loss": 6.2425, + "step": 1707 + }, + { + "epoch": 0.16470588235294117, + "grad_norm": 3.1453332901000977, + "learning_rate": 4.67309690616016e-05, + "loss": 6.4494, + "step": 1708 + }, + { + "epoch": 0.1648023143683703, + "grad_norm": 3.4628853797912598, + "learning_rate": 4.672722365763821e-05, + "loss": 6.7826, + "step": 1709 + }, + { + "epoch": 0.16489874638379942, + "grad_norm": 2.5954222679138184, + "learning_rate": 4.672347625957697e-05, + "loss": 6.761, + "step": 1710 + }, + { + "epoch": 0.16499517839922853, + "grad_norm": 2.623720407485962, + "learning_rate": 4.671972686776181e-05, + "loss": 6.8231, + "step": 1711 + }, + { + "epoch": 0.16509161041465767, + "grad_norm": 2.902787208557129, + "learning_rate": 4.6715975482536865e-05, + "loss": 6.7575, + "step": 1712 + }, + { + "epoch": 0.16518804243008678, + "grad_norm": 2.8463971614837646, + "learning_rate": 4.671222210424641e-05, + "loss": 6.8401, + "step": 1713 + }, + { + "epoch": 0.16528447444551592, + "grad_norm": 2.888240098953247, + "learning_rate": 4.6708466733234934e-05, + "loss": 6.5894, + "step": 1714 + }, + { + "epoch": 0.16538090646094503, + "grad_norm": 2.4944489002227783, + "learning_rate": 4.67047093698471e-05, + "loss": 6.8348, + "step": 1715 + }, + { + "epoch": 0.16547733847637416, + "grad_norm": 2.93617844581604, + "learning_rate": 4.670095001442775e-05, + "loss": 6.7656, + "step": 1716 + }, + { + "epoch": 0.16557377049180327, + "grad_norm": 3.363741397857666, + "learning_rate": 4.669718866732192e-05, + "loss": 6.4719, + "step": 1717 + }, + { + "epoch": 0.1656702025072324, + "grad_norm": 2.8156955242156982, + "learning_rate": 4.6693425328874824e-05, + "loss": 6.7192, + "step": 1718 + }, + { + "epoch": 0.16576663452266152, + "grad_norm": 3.7404417991638184, + "learning_rate": 4.668965999943184e-05, + "loss": 6.5424, + "step": 1719 + }, + { + "epoch": 0.16586306653809066, + "grad_norm": 3.821254253387451, + "learning_rate": 4.6685892679338575e-05, + "loss": 6.8179, + "step": 1720 + }, + { + "epoch": 0.16595949855351977, + "grad_norm": 2.963926315307617, + "learning_rate": 4.668212336894075e-05, + "loss": 6.9147, + "step": 1721 + }, + { + "epoch": 0.16605593056894888, + "grad_norm": 3.609245538711548, + "learning_rate": 4.6678352068584345e-05, + "loss": 6.5972, + "step": 1722 + }, + { + "epoch": 0.16615236258437802, + "grad_norm": 2.8455381393432617, + "learning_rate": 4.667457877861546e-05, + "loss": 6.7999, + "step": 1723 + }, + { + "epoch": 0.16624879459980713, + "grad_norm": 2.6152100563049316, + "learning_rate": 4.667080349938042e-05, + "loss": 6.8828, + "step": 1724 + }, + { + "epoch": 0.16634522661523626, + "grad_norm": 4.973761081695557, + "learning_rate": 4.66670262312257e-05, + "loss": 7.0205, + "step": 1725 + }, + { + "epoch": 0.16644165863066537, + "grad_norm": 4.087960720062256, + "learning_rate": 4.666324697449798e-05, + "loss": 6.91, + "step": 1726 + }, + { + "epoch": 0.1665380906460945, + "grad_norm": 3.2376015186309814, + "learning_rate": 4.6659465729544125e-05, + "loss": 6.8025, + "step": 1727 + }, + { + "epoch": 0.16663452266152362, + "grad_norm": 3.276843309402466, + "learning_rate": 4.665568249671115e-05, + "loss": 6.6779, + "step": 1728 + }, + { + "epoch": 0.16673095467695276, + "grad_norm": 3.174675703048706, + "learning_rate": 4.66518972763463e-05, + "loss": 6.6956, + "step": 1729 + }, + { + "epoch": 0.16682738669238187, + "grad_norm": 2.990255832672119, + "learning_rate": 4.6648110068796963e-05, + "loss": 6.7521, + "step": 1730 + }, + { + "epoch": 0.166923818707811, + "grad_norm": 3.934302568435669, + "learning_rate": 4.664432087441073e-05, + "loss": 6.9302, + "step": 1731 + }, + { + "epoch": 0.16702025072324012, + "grad_norm": 4.301646709442139, + "learning_rate": 4.664052969353536e-05, + "loss": 6.6708, + "step": 1732 + }, + { + "epoch": 0.16711668273866923, + "grad_norm": 3.18221378326416, + "learning_rate": 4.663673652651882e-05, + "loss": 6.5725, + "step": 1733 + }, + { + "epoch": 0.16721311475409836, + "grad_norm": 3.1600844860076904, + "learning_rate": 4.6632941373709226e-05, + "loss": 6.8613, + "step": 1734 + }, + { + "epoch": 0.16730954676952747, + "grad_norm": 3.0984249114990234, + "learning_rate": 4.66291442354549e-05, + "loss": 6.6276, + "step": 1735 + }, + { + "epoch": 0.1674059787849566, + "grad_norm": 2.8610117435455322, + "learning_rate": 4.6625345112104335e-05, + "loss": 6.6424, + "step": 1736 + }, + { + "epoch": 0.16750241080038572, + "grad_norm": 2.35729718208313, + "learning_rate": 4.6621544004006216e-05, + "loss": 6.6411, + "step": 1737 + }, + { + "epoch": 0.16759884281581486, + "grad_norm": 3.3267345428466797, + "learning_rate": 4.66177409115094e-05, + "loss": 6.4968, + "step": 1738 + }, + { + "epoch": 0.16769527483124397, + "grad_norm": 4.192264080047607, + "learning_rate": 4.661393583496293e-05, + "loss": 6.8108, + "step": 1739 + }, + { + "epoch": 0.1677917068466731, + "grad_norm": 2.8315610885620117, + "learning_rate": 4.661012877471603e-05, + "loss": 6.9113, + "step": 1740 + }, + { + "epoch": 0.16788813886210222, + "grad_norm": 2.0838427543640137, + "learning_rate": 4.660631973111811e-05, + "loss": 6.7509, + "step": 1741 + }, + { + "epoch": 0.16798457087753135, + "grad_norm": 3.0729565620422363, + "learning_rate": 4.6602508704518754e-05, + "loss": 6.7968, + "step": 1742 + }, + { + "epoch": 0.16808100289296046, + "grad_norm": 2.424053192138672, + "learning_rate": 4.659869569526774e-05, + "loss": 6.776, + "step": 1743 + }, + { + "epoch": 0.16817743490838957, + "grad_norm": 1.5183104276657104, + "learning_rate": 4.659488070371502e-05, + "loss": 6.9094, + "step": 1744 + }, + { + "epoch": 0.1682738669238187, + "grad_norm": 3.3052682876586914, + "learning_rate": 4.6591063730210726e-05, + "loss": 6.9513, + "step": 1745 + }, + { + "epoch": 0.16837029893924782, + "grad_norm": 3.155778408050537, + "learning_rate": 4.658724477510518e-05, + "loss": 6.8164, + "step": 1746 + }, + { + "epoch": 0.16846673095467696, + "grad_norm": 2.696176052093506, + "learning_rate": 4.658342383874887e-05, + "loss": 6.7022, + "step": 1747 + }, + { + "epoch": 0.16856316297010607, + "grad_norm": 1.7938921451568604, + "learning_rate": 4.65796009214925e-05, + "loss": 6.8752, + "step": 1748 + }, + { + "epoch": 0.1686595949855352, + "grad_norm": 2.7208251953125, + "learning_rate": 4.6575776023686914e-05, + "loss": 6.6531, + "step": 1749 + }, + { + "epoch": 0.16875602700096431, + "grad_norm": 2.019643545150757, + "learning_rate": 4.657194914568315e-05, + "loss": 6.7624, + "step": 1750 + }, + { + "epoch": 0.16885245901639345, + "grad_norm": 3.464484214782715, + "learning_rate": 4.656812028783245e-05, + "loss": 6.6328, + "step": 1751 + }, + { + "epoch": 0.16894889103182256, + "grad_norm": 3.3729140758514404, + "learning_rate": 4.656428945048621e-05, + "loss": 6.4768, + "step": 1752 + }, + { + "epoch": 0.1690453230472517, + "grad_norm": 3.5049057006835938, + "learning_rate": 4.656045663399604e-05, + "loss": 6.5064, + "step": 1753 + }, + { + "epoch": 0.1691417550626808, + "grad_norm": 1.9124177694320679, + "learning_rate": 4.655662183871368e-05, + "loss": 6.6484, + "step": 1754 + }, + { + "epoch": 0.16923818707810992, + "grad_norm": 3.292079210281372, + "learning_rate": 4.655278506499111e-05, + "loss": 6.5536, + "step": 1755 + }, + { + "epoch": 0.16933461909353906, + "grad_norm": 3.518204689025879, + "learning_rate": 4.6548946313180454e-05, + "loss": 6.373, + "step": 1756 + }, + { + "epoch": 0.16943105110896817, + "grad_norm": 2.4326331615448, + "learning_rate": 4.654510558363402e-05, + "loss": 6.5519, + "step": 1757 + }, + { + "epoch": 0.1695274831243973, + "grad_norm": 4.064035892486572, + "learning_rate": 4.654126287670432e-05, + "loss": 6.7788, + "step": 1758 + }, + { + "epoch": 0.16962391513982641, + "grad_norm": 2.5215048789978027, + "learning_rate": 4.653741819274402e-05, + "loss": 7.1075, + "step": 1759 + }, + { + "epoch": 0.16972034715525555, + "grad_norm": 4.238713264465332, + "learning_rate": 4.6533571532105994e-05, + "loss": 7.2149, + "step": 1760 + }, + { + "epoch": 0.16981677917068466, + "grad_norm": 3.6842148303985596, + "learning_rate": 4.6529722895143276e-05, + "loss": 6.8069, + "step": 1761 + }, + { + "epoch": 0.1699132111861138, + "grad_norm": 4.00356388092041, + "learning_rate": 4.652587228220908e-05, + "loss": 6.9474, + "step": 1762 + }, + { + "epoch": 0.1700096432015429, + "grad_norm": 3.301279067993164, + "learning_rate": 4.6522019693656834e-05, + "loss": 6.7696, + "step": 1763 + }, + { + "epoch": 0.17010607521697205, + "grad_norm": 3.49251127243042, + "learning_rate": 4.65181651298401e-05, + "loss": 6.7827, + "step": 1764 + }, + { + "epoch": 0.17020250723240116, + "grad_norm": 4.175543785095215, + "learning_rate": 4.6514308591112654e-05, + "loss": 6.1671, + "step": 1765 + }, + { + "epoch": 0.17029893924783027, + "grad_norm": 3.468759775161743, + "learning_rate": 4.651045007782845e-05, + "loss": 6.7517, + "step": 1766 + }, + { + "epoch": 0.1703953712632594, + "grad_norm": 4.090583324432373, + "learning_rate": 4.650658959034161e-05, + "loss": 6.5511, + "step": 1767 + }, + { + "epoch": 0.17049180327868851, + "grad_norm": 3.698301076889038, + "learning_rate": 4.650272712900645e-05, + "loss": 6.8888, + "step": 1768 + }, + { + "epoch": 0.17058823529411765, + "grad_norm": 5.542658805847168, + "learning_rate": 4.649886269417746e-05, + "loss": 6.8252, + "step": 1769 + }, + { + "epoch": 0.17068466730954676, + "grad_norm": 3.339907169342041, + "learning_rate": 4.649499628620931e-05, + "loss": 7.0623, + "step": 1770 + }, + { + "epoch": 0.1707810993249759, + "grad_norm": 3.1558456420898438, + "learning_rate": 4.649112790545686e-05, + "loss": 6.8415, + "step": 1771 + }, + { + "epoch": 0.170877531340405, + "grad_norm": 4.553060531616211, + "learning_rate": 4.6487257552275145e-05, + "loss": 6.1036, + "step": 1772 + }, + { + "epoch": 0.17097396335583415, + "grad_norm": 4.8865742683410645, + "learning_rate": 4.6483385227019374e-05, + "loss": 6.5828, + "step": 1773 + }, + { + "epoch": 0.17107039537126326, + "grad_norm": 4.217456340789795, + "learning_rate": 4.647951093004494e-05, + "loss": 6.7691, + "step": 1774 + }, + { + "epoch": 0.1711668273866924, + "grad_norm": 3.604764223098755, + "learning_rate": 4.647563466170744e-05, + "loss": 6.6685, + "step": 1775 + }, + { + "epoch": 0.1712632594021215, + "grad_norm": 3.3134117126464844, + "learning_rate": 4.647175642236262e-05, + "loss": 6.8595, + "step": 1776 + }, + { + "epoch": 0.1713596914175506, + "grad_norm": 2.9917445182800293, + "learning_rate": 4.646787621236643e-05, + "loss": 6.8709, + "step": 1777 + }, + { + "epoch": 0.17145612343297975, + "grad_norm": 2.9994051456451416, + "learning_rate": 4.6463994032074974e-05, + "loss": 6.8643, + "step": 1778 + }, + { + "epoch": 0.17155255544840886, + "grad_norm": 2.801222801208496, + "learning_rate": 4.646010988184457e-05, + "loss": 6.7975, + "step": 1779 + }, + { + "epoch": 0.171648987463838, + "grad_norm": 2.531203508377075, + "learning_rate": 4.6456223762031694e-05, + "loss": 6.7907, + "step": 1780 + }, + { + "epoch": 0.1717454194792671, + "grad_norm": 2.5393569469451904, + "learning_rate": 4.645233567299301e-05, + "loss": 6.656, + "step": 1781 + }, + { + "epoch": 0.17184185149469625, + "grad_norm": 3.339008331298828, + "learning_rate": 4.644844561508536e-05, + "loss": 6.5213, + "step": 1782 + }, + { + "epoch": 0.17193828351012536, + "grad_norm": 2.761211395263672, + "learning_rate": 4.6444553588665764e-05, + "loss": 6.5992, + "step": 1783 + }, + { + "epoch": 0.1720347155255545, + "grad_norm": 2.136714220046997, + "learning_rate": 4.644065959409144e-05, + "loss": 6.6064, + "step": 1784 + }, + { + "epoch": 0.1721311475409836, + "grad_norm": 2.7016704082489014, + "learning_rate": 4.643676363171977e-05, + "loss": 6.1891, + "step": 1785 + }, + { + "epoch": 0.17222757955641274, + "grad_norm": 3.385812759399414, + "learning_rate": 4.643286570190831e-05, + "loss": 6.6027, + "step": 1786 + }, + { + "epoch": 0.17232401157184185, + "grad_norm": 2.5297656059265137, + "learning_rate": 4.6428965805014824e-05, + "loss": 6.777, + "step": 1787 + }, + { + "epoch": 0.17242044358727096, + "grad_norm": 4.4374260902404785, + "learning_rate": 4.6425063941397225e-05, + "loss": 6.4545, + "step": 1788 + }, + { + "epoch": 0.1725168756027001, + "grad_norm": 3.023895263671875, + "learning_rate": 4.642116011141363e-05, + "loss": 6.4883, + "step": 1789 + }, + { + "epoch": 0.1726133076181292, + "grad_norm": 6.461230278015137, + "learning_rate": 4.641725431542232e-05, + "loss": 6.7227, + "step": 1790 + }, + { + "epoch": 0.17270973963355835, + "grad_norm": 4.800800323486328, + "learning_rate": 4.6413346553781776e-05, + "loss": 6.3338, + "step": 1791 + }, + { + "epoch": 0.17280617164898746, + "grad_norm": 4.821989059448242, + "learning_rate": 4.640943682685064e-05, + "loss": 6.0424, + "step": 1792 + }, + { + "epoch": 0.1729026036644166, + "grad_norm": 6.212090015411377, + "learning_rate": 4.640552513498774e-05, + "loss": 6.5222, + "step": 1793 + }, + { + "epoch": 0.1729990356798457, + "grad_norm": 6.602437496185303, + "learning_rate": 4.640161147855209e-05, + "loss": 6.8622, + "step": 1794 + }, + { + "epoch": 0.17309546769527484, + "grad_norm": 4.9440155029296875, + "learning_rate": 4.6397695857902885e-05, + "loss": 6.5968, + "step": 1795 + }, + { + "epoch": 0.17319189971070395, + "grad_norm": 5.154370307922363, + "learning_rate": 4.639377827339948e-05, + "loss": 6.805, + "step": 1796 + }, + { + "epoch": 0.1732883317261331, + "grad_norm": 5.05769157409668, + "learning_rate": 4.638985872540145e-05, + "loss": 6.9017, + "step": 1797 + }, + { + "epoch": 0.1733847637415622, + "grad_norm": 3.2863757610321045, + "learning_rate": 4.63859372142685e-05, + "loss": 7.1281, + "step": 1798 + }, + { + "epoch": 0.1734811957569913, + "grad_norm": 6.0192341804504395, + "learning_rate": 4.638201374036056e-05, + "loss": 7.0922, + "step": 1799 + }, + { + "epoch": 0.17357762777242045, + "grad_norm": 6.826696395874023, + "learning_rate": 4.6378088304037716e-05, + "loss": 6.7801, + "step": 1800 + }, + { + "epoch": 0.17367405978784956, + "grad_norm": 4.124227046966553, + "learning_rate": 4.6374160905660236e-05, + "loss": 6.7945, + "step": 1801 + }, + { + "epoch": 0.1737704918032787, + "grad_norm": 3.6522083282470703, + "learning_rate": 4.637023154558858e-05, + "loss": 6.7298, + "step": 1802 + }, + { + "epoch": 0.1738669238187078, + "grad_norm": 3.29876708984375, + "learning_rate": 4.6366300224183374e-05, + "loss": 6.6708, + "step": 1803 + }, + { + "epoch": 0.17396335583413694, + "grad_norm": 2.099921703338623, + "learning_rate": 4.6362366941805426e-05, + "loss": 6.7335, + "step": 1804 + }, + { + "epoch": 0.17405978784956605, + "grad_norm": 2.4487175941467285, + "learning_rate": 4.635843169881574e-05, + "loss": 6.777, + "step": 1805 + }, + { + "epoch": 0.1741562198649952, + "grad_norm": 2.6894896030426025, + "learning_rate": 4.635449449557548e-05, + "loss": 6.7327, + "step": 1806 + }, + { + "epoch": 0.1742526518804243, + "grad_norm": 2.4546377658843994, + "learning_rate": 4.635055533244599e-05, + "loss": 6.8282, + "step": 1807 + }, + { + "epoch": 0.17434908389585344, + "grad_norm": 3.022937536239624, + "learning_rate": 4.634661420978882e-05, + "loss": 6.7573, + "step": 1808 + }, + { + "epoch": 0.17444551591128254, + "grad_norm": 3.180574655532837, + "learning_rate": 4.6342671127965665e-05, + "loss": 6.5646, + "step": 1809 + }, + { + "epoch": 0.17454194792671165, + "grad_norm": 2.6597626209259033, + "learning_rate": 4.633872608733842e-05, + "loss": 6.6257, + "step": 1810 + }, + { + "epoch": 0.1746383799421408, + "grad_norm": 1.9660979509353638, + "learning_rate": 4.633477908826917e-05, + "loss": 6.7335, + "step": 1811 + }, + { + "epoch": 0.1747348119575699, + "grad_norm": 2.308392286300659, + "learning_rate": 4.6330830131120154e-05, + "loss": 6.7251, + "step": 1812 + }, + { + "epoch": 0.17483124397299904, + "grad_norm": 1.885748028755188, + "learning_rate": 4.6326879216253794e-05, + "loss": 6.7208, + "step": 1813 + }, + { + "epoch": 0.17492767598842815, + "grad_norm": 1.9920825958251953, + "learning_rate": 4.6322926344032705e-05, + "loss": 6.7319, + "step": 1814 + }, + { + "epoch": 0.1750241080038573, + "grad_norm": 2.4807252883911133, + "learning_rate": 4.631897151481969e-05, + "loss": 6.8134, + "step": 1815 + }, + { + "epoch": 0.1751205400192864, + "grad_norm": 2.5571765899658203, + "learning_rate": 4.63150147289777e-05, + "loss": 6.8133, + "step": 1816 + }, + { + "epoch": 0.17521697203471553, + "grad_norm": 3.2055392265319824, + "learning_rate": 4.63110559868699e-05, + "loss": 6.1964, + "step": 1817 + }, + { + "epoch": 0.17531340405014464, + "grad_norm": 3.1816554069519043, + "learning_rate": 4.630709528885961e-05, + "loss": 6.7764, + "step": 1818 + }, + { + "epoch": 0.17540983606557378, + "grad_norm": 2.9863979816436768, + "learning_rate": 4.6303132635310335e-05, + "loss": 6.66, + "step": 1819 + }, + { + "epoch": 0.1755062680810029, + "grad_norm": 1.929006814956665, + "learning_rate": 4.6299168026585774e-05, + "loss": 6.7928, + "step": 1820 + }, + { + "epoch": 0.175602700096432, + "grad_norm": 4.313773155212402, + "learning_rate": 4.629520146304978e-05, + "loss": 6.8417, + "step": 1821 + }, + { + "epoch": 0.17569913211186114, + "grad_norm": 5.085310459136963, + "learning_rate": 4.629123294506641e-05, + "loss": 6.66, + "step": 1822 + }, + { + "epoch": 0.17579556412729025, + "grad_norm": 2.6358962059020996, + "learning_rate": 4.6287262472999883e-05, + "loss": 6.6077, + "step": 1823 + }, + { + "epoch": 0.1758919961427194, + "grad_norm": 4.9965386390686035, + "learning_rate": 4.62832900472146e-05, + "loss": 6.7726, + "step": 1824 + }, + { + "epoch": 0.1759884281581485, + "grad_norm": 6.871905326843262, + "learning_rate": 4.6279315668075164e-05, + "loss": 6.6524, + "step": 1825 + }, + { + "epoch": 0.17608486017357763, + "grad_norm": 5.650396823883057, + "learning_rate": 4.627533933594632e-05, + "loss": 6.5873, + "step": 1826 + }, + { + "epoch": 0.17618129218900674, + "grad_norm": 2.423068046569824, + "learning_rate": 4.627136105119302e-05, + "loss": 6.7303, + "step": 1827 + }, + { + "epoch": 0.17627772420443588, + "grad_norm": 5.125938415527344, + "learning_rate": 4.6267380814180384e-05, + "loss": 6.648, + "step": 1828 + }, + { + "epoch": 0.176374156219865, + "grad_norm": 5.275598049163818, + "learning_rate": 4.626339862527371e-05, + "loss": 6.713, + "step": 1829 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 5.160116195678711, + "learning_rate": 4.625941448483848e-05, + "loss": 6.7367, + "step": 1830 + }, + { + "epoch": 0.17656702025072324, + "grad_norm": 3.893692970275879, + "learning_rate": 4.625542839324036e-05, + "loss": 6.6631, + "step": 1831 + }, + { + "epoch": 0.17666345226615235, + "grad_norm": 3.826209783554077, + "learning_rate": 4.625144035084518e-05, + "loss": 6.6977, + "step": 1832 + }, + { + "epoch": 0.1767598842815815, + "grad_norm": 2.2182793617248535, + "learning_rate": 4.624745035801896e-05, + "loss": 6.5779, + "step": 1833 + }, + { + "epoch": 0.1768563162970106, + "grad_norm": 7.468666076660156, + "learning_rate": 4.62434584151279e-05, + "loss": 6.3578, + "step": 1834 + }, + { + "epoch": 0.17695274831243973, + "grad_norm": 6.440155982971191, + "learning_rate": 4.6239464522538364e-05, + "loss": 6.3047, + "step": 1835 + }, + { + "epoch": 0.17704918032786884, + "grad_norm": 3.493818521499634, + "learning_rate": 4.623546868061692e-05, + "loss": 6.8253, + "step": 1836 + }, + { + "epoch": 0.17714561234329798, + "grad_norm": 3.002912998199463, + "learning_rate": 4.623147088973031e-05, + "loss": 6.9946, + "step": 1837 + }, + { + "epoch": 0.1772420443587271, + "grad_norm": 4.058692932128906, + "learning_rate": 4.622747115024542e-05, + "loss": 6.9305, + "step": 1838 + }, + { + "epoch": 0.17733847637415623, + "grad_norm": 4.55403470993042, + "learning_rate": 4.622346946252936e-05, + "loss": 6.8238, + "step": 1839 + }, + { + "epoch": 0.17743490838958534, + "grad_norm": 3.6636924743652344, + "learning_rate": 4.6219465826949394e-05, + "loss": 6.8041, + "step": 1840 + }, + { + "epoch": 0.17753134040501448, + "grad_norm": 3.342630624771118, + "learning_rate": 4.6215460243872965e-05, + "loss": 6.7101, + "step": 1841 + }, + { + "epoch": 0.17762777242044359, + "grad_norm": 2.7435898780822754, + "learning_rate": 4.6211452713667716e-05, + "loss": 6.8619, + "step": 1842 + }, + { + "epoch": 0.17772420443587272, + "grad_norm": 2.3932104110717773, + "learning_rate": 4.6207443236701445e-05, + "loss": 6.7474, + "step": 1843 + }, + { + "epoch": 0.17782063645130183, + "grad_norm": 2.56235408782959, + "learning_rate": 4.620343181334213e-05, + "loss": 6.9129, + "step": 1844 + }, + { + "epoch": 0.17791706846673094, + "grad_norm": 3.2361178398132324, + "learning_rate": 4.619941844395794e-05, + "loss": 6.8903, + "step": 1845 + }, + { + "epoch": 0.17801350048216008, + "grad_norm": 3.558311700820923, + "learning_rate": 4.619540312891721e-05, + "loss": 6.7057, + "step": 1846 + }, + { + "epoch": 0.1781099324975892, + "grad_norm": 2.219431161880493, + "learning_rate": 4.6191385868588484e-05, + "loss": 6.7751, + "step": 1847 + }, + { + "epoch": 0.17820636451301833, + "grad_norm": 4.211676597595215, + "learning_rate": 4.6187366663340434e-05, + "loss": 6.8185, + "step": 1848 + }, + { + "epoch": 0.17830279652844744, + "grad_norm": 4.450948715209961, + "learning_rate": 4.618334551354195e-05, + "loss": 6.6675, + "step": 1849 + }, + { + "epoch": 0.17839922854387658, + "grad_norm": 3.4464941024780273, + "learning_rate": 4.61793224195621e-05, + "loss": 6.6469, + "step": 1850 + }, + { + "epoch": 0.17849566055930569, + "grad_norm": 3.085902452468872, + "learning_rate": 4.6175297381770086e-05, + "loss": 6.4485, + "step": 1851 + }, + { + "epoch": 0.17859209257473482, + "grad_norm": 3.271071195602417, + "learning_rate": 4.617127040053536e-05, + "loss": 6.365, + "step": 1852 + }, + { + "epoch": 0.17868852459016393, + "grad_norm": 1.9916082620620728, + "learning_rate": 4.616724147622748e-05, + "loss": 6.321, + "step": 1853 + }, + { + "epoch": 0.17878495660559307, + "grad_norm": 2.654414653778076, + "learning_rate": 4.6163210609216236e-05, + "loss": 6.7179, + "step": 1854 + }, + { + "epoch": 0.17888138862102218, + "grad_norm": 1.8616728782653809, + "learning_rate": 4.615917779987157e-05, + "loss": 6.7969, + "step": 1855 + }, + { + "epoch": 0.1789778206364513, + "grad_norm": 4.685477256774902, + "learning_rate": 4.615514304856361e-05, + "loss": 6.5673, + "step": 1856 + }, + { + "epoch": 0.17907425265188043, + "grad_norm": 3.652576208114624, + "learning_rate": 4.615110635566265e-05, + "loss": 6.4324, + "step": 1857 + }, + { + "epoch": 0.17917068466730954, + "grad_norm": 3.98230242729187, + "learning_rate": 4.6147067721539196e-05, + "loss": 6.5041, + "step": 1858 + }, + { + "epoch": 0.17926711668273868, + "grad_norm": 5.15512228012085, + "learning_rate": 4.614302714656388e-05, + "loss": 6.668, + "step": 1859 + }, + { + "epoch": 0.17936354869816779, + "grad_norm": 4.375883102416992, + "learning_rate": 4.613898463110756e-05, + "loss": 6.7039, + "step": 1860 + }, + { + "epoch": 0.17945998071359692, + "grad_norm": 5.34254789352417, + "learning_rate": 4.613494017554125e-05, + "loss": 6.6501, + "step": 1861 + }, + { + "epoch": 0.17955641272902603, + "grad_norm": 5.297417640686035, + "learning_rate": 4.613089378023615e-05, + "loss": 6.645, + "step": 1862 + }, + { + "epoch": 0.17965284474445517, + "grad_norm": 1.9949544668197632, + "learning_rate": 4.612684544556361e-05, + "loss": 6.9273, + "step": 1863 + }, + { + "epoch": 0.17974927675988428, + "grad_norm": 3.2581427097320557, + "learning_rate": 4.612279517189522e-05, + "loss": 6.8036, + "step": 1864 + }, + { + "epoch": 0.17984570877531342, + "grad_norm": 2.4915285110473633, + "learning_rate": 4.611874295960267e-05, + "loss": 6.6926, + "step": 1865 + }, + { + "epoch": 0.17994214079074253, + "grad_norm": 2.0818004608154297, + "learning_rate": 4.6114688809057887e-05, + "loss": 6.6442, + "step": 1866 + }, + { + "epoch": 0.18003857280617164, + "grad_norm": 3.1741490364074707, + "learning_rate": 4.611063272063296e-05, + "loss": 6.7808, + "step": 1867 + }, + { + "epoch": 0.18013500482160077, + "grad_norm": 2.946621894836426, + "learning_rate": 4.6106574694700135e-05, + "loss": 6.7404, + "step": 1868 + }, + { + "epoch": 0.18023143683702988, + "grad_norm": 4.022921085357666, + "learning_rate": 4.610251473163187e-05, + "loss": 6.948, + "step": 1869 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 4.8261590003967285, + "learning_rate": 4.6098452831800775e-05, + "loss": 7.0327, + "step": 1870 + }, + { + "epoch": 0.18042430086788813, + "grad_norm": 2.9970901012420654, + "learning_rate": 4.609438899557964e-05, + "loss": 6.9613, + "step": 1871 + }, + { + "epoch": 0.18052073288331727, + "grad_norm": 4.434769153594971, + "learning_rate": 4.609032322334146e-05, + "loss": 6.864, + "step": 1872 + }, + { + "epoch": 0.18061716489874638, + "grad_norm": 4.098046779632568, + "learning_rate": 4.608625551545936e-05, + "loss": 6.6975, + "step": 1873 + }, + { + "epoch": 0.18071359691417552, + "grad_norm": 2.3095858097076416, + "learning_rate": 4.6082185872306685e-05, + "loss": 6.6454, + "step": 1874 + }, + { + "epoch": 0.18081002892960463, + "grad_norm": 4.007388114929199, + "learning_rate": 4.6078114294256935e-05, + "loss": 6.8409, + "step": 1875 + }, + { + "epoch": 0.18090646094503376, + "grad_norm": 4.9945292472839355, + "learning_rate": 4.60740407816838e-05, + "loss": 6.7012, + "step": 1876 + }, + { + "epoch": 0.18100289296046287, + "grad_norm": 2.996701955795288, + "learning_rate": 4.606996533496115e-05, + "loss": 6.7471, + "step": 1877 + }, + { + "epoch": 0.18109932497589198, + "grad_norm": 2.9677741527557373, + "learning_rate": 4.6065887954463e-05, + "loss": 6.6159, + "step": 1878 + }, + { + "epoch": 0.18119575699132112, + "grad_norm": 3.2264904975891113, + "learning_rate": 4.60618086405636e-05, + "loss": 6.7005, + "step": 1879 + }, + { + "epoch": 0.18129218900675023, + "grad_norm": 3.075927257537842, + "learning_rate": 4.6057727393637303e-05, + "loss": 6.6623, + "step": 1880 + }, + { + "epoch": 0.18138862102217937, + "grad_norm": 3.067009210586548, + "learning_rate": 4.605364421405872e-05, + "loss": 6.6242, + "step": 1881 + }, + { + "epoch": 0.18148505303760848, + "grad_norm": 3.0260260105133057, + "learning_rate": 4.604955910220258e-05, + "loss": 5.7413, + "step": 1882 + }, + { + "epoch": 0.18158148505303762, + "grad_norm": 1.6929268836975098, + "learning_rate": 4.604547205844381e-05, + "loss": 6.1172, + "step": 1883 + }, + { + "epoch": 0.18167791706846673, + "grad_norm": 2.5973410606384277, + "learning_rate": 4.604138308315752e-05, + "loss": 6.4974, + "step": 1884 + }, + { + "epoch": 0.18177434908389586, + "grad_norm": 5.32592248916626, + "learning_rate": 4.603729217671899e-05, + "loss": 6.8363, + "step": 1885 + }, + { + "epoch": 0.18187078109932497, + "grad_norm": 4.3958964347839355, + "learning_rate": 4.6033199339503676e-05, + "loss": 6.857, + "step": 1886 + }, + { + "epoch": 0.1819672131147541, + "grad_norm": 3.971256732940674, + "learning_rate": 4.6029104571887204e-05, + "loss": 6.7181, + "step": 1887 + }, + { + "epoch": 0.18206364513018322, + "grad_norm": 5.071672439575195, + "learning_rate": 4.6025007874245405e-05, + "loss": 6.465, + "step": 1888 + }, + { + "epoch": 0.18216007714561233, + "grad_norm": 4.467513561248779, + "learning_rate": 4.602090924695427e-05, + "loss": 6.1996, + "step": 1889 + }, + { + "epoch": 0.18225650916104147, + "grad_norm": 2.6286141872406006, + "learning_rate": 4.601680869038994e-05, + "loss": 6.3721, + "step": 1890 + }, + { + "epoch": 0.18235294117647058, + "grad_norm": 3.71329402923584, + "learning_rate": 4.601270620492879e-05, + "loss": 6.8069, + "step": 1891 + }, + { + "epoch": 0.18244937319189972, + "grad_norm": 6.329822063446045, + "learning_rate": 4.600860179094732e-05, + "loss": 6.5924, + "step": 1892 + }, + { + "epoch": 0.18254580520732883, + "grad_norm": 6.220271110534668, + "learning_rate": 4.6004495448822236e-05, + "loss": 6.2637, + "step": 1893 + }, + { + "epoch": 0.18264223722275796, + "grad_norm": 4.1072540283203125, + "learning_rate": 4.6000387178930416e-05, + "loss": 6.0956, + "step": 1894 + }, + { + "epoch": 0.18273866923818707, + "grad_norm": 8.070725440979004, + "learning_rate": 4.59962769816489e-05, + "loss": 6.2101, + "step": 1895 + }, + { + "epoch": 0.1828351012536162, + "grad_norm": 7.4923834800720215, + "learning_rate": 4.599216485735493e-05, + "loss": 6.9106, + "step": 1896 + }, + { + "epoch": 0.18293153326904532, + "grad_norm": 6.408032417297363, + "learning_rate": 4.59880508064259e-05, + "loss": 6.8222, + "step": 1897 + }, + { + "epoch": 0.18302796528447446, + "grad_norm": 3.044995069503784, + "learning_rate": 4.598393482923941e-05, + "loss": 6.52, + "step": 1898 + }, + { + "epoch": 0.18312439729990357, + "grad_norm": 5.0433549880981445, + "learning_rate": 4.59798169261732e-05, + "loss": 6.5988, + "step": 1899 + }, + { + "epoch": 0.18322082931533268, + "grad_norm": 5.320488929748535, + "learning_rate": 4.597569709760522e-05, + "loss": 6.6812, + "step": 1900 + }, + { + "epoch": 0.18331726133076182, + "grad_norm": 4.690096378326416, + "learning_rate": 4.5971575343913575e-05, + "loss": 6.829, + "step": 1901 + }, + { + "epoch": 0.18341369334619093, + "grad_norm": 4.540584087371826, + "learning_rate": 4.596745166547656e-05, + "loss": 6.8305, + "step": 1902 + }, + { + "epoch": 0.18351012536162006, + "grad_norm": 6.600525856018066, + "learning_rate": 4.596332606267263e-05, + "loss": 7.1156, + "step": 1903 + }, + { + "epoch": 0.18360655737704917, + "grad_norm": 4.634177207946777, + "learning_rate": 4.595919853588044e-05, + "loss": 6.9616, + "step": 1904 + }, + { + "epoch": 0.1837029893924783, + "grad_norm": 3.680494546890259, + "learning_rate": 4.595506908547881e-05, + "loss": 6.7262, + "step": 1905 + }, + { + "epoch": 0.18379942140790742, + "grad_norm": 3.7979342937469482, + "learning_rate": 4.595093771184672e-05, + "loss": 6.7801, + "step": 1906 + }, + { + "epoch": 0.18389585342333656, + "grad_norm": 3.634589910507202, + "learning_rate": 4.5946804415363364e-05, + "loss": 6.9066, + "step": 1907 + }, + { + "epoch": 0.18399228543876567, + "grad_norm": 2.5350704193115234, + "learning_rate": 4.5942669196408074e-05, + "loss": 6.6807, + "step": 1908 + }, + { + "epoch": 0.1840887174541948, + "grad_norm": 3.668820381164551, + "learning_rate": 4.593853205536038e-05, + "loss": 6.9091, + "step": 1909 + }, + { + "epoch": 0.18418514946962392, + "grad_norm": 3.6371541023254395, + "learning_rate": 4.593439299259998e-05, + "loss": 6.8156, + "step": 1910 + }, + { + "epoch": 0.18428158148505303, + "grad_norm": 2.820383071899414, + "learning_rate": 4.593025200850678e-05, + "loss": 6.7944, + "step": 1911 + }, + { + "epoch": 0.18437801350048216, + "grad_norm": 1.737941861152649, + "learning_rate": 4.592610910346079e-05, + "loss": 6.6536, + "step": 1912 + }, + { + "epoch": 0.18447444551591127, + "grad_norm": 3.321199417114258, + "learning_rate": 4.592196427784226e-05, + "loss": 6.6242, + "step": 1913 + }, + { + "epoch": 0.1845708775313404, + "grad_norm": 3.09264874458313, + "learning_rate": 4.5917817532031615e-05, + "loss": 6.5888, + "step": 1914 + }, + { + "epoch": 0.18466730954676952, + "grad_norm": 3.953774929046631, + "learning_rate": 4.591366886640942e-05, + "loss": 6.5406, + "step": 1915 + }, + { + "epoch": 0.18476374156219866, + "grad_norm": 3.125641345977783, + "learning_rate": 4.590951828135643e-05, + "loss": 6.8417, + "step": 1916 + }, + { + "epoch": 0.18486017357762777, + "grad_norm": 2.9089720249176025, + "learning_rate": 4.590536577725359e-05, + "loss": 6.6561, + "step": 1917 + }, + { + "epoch": 0.1849566055930569, + "grad_norm": 2.167332649230957, + "learning_rate": 4.590121135448201e-05, + "loss": 6.7359, + "step": 1918 + }, + { + "epoch": 0.18505303760848602, + "grad_norm": 6.057506561279297, + "learning_rate": 4.5897055013422973e-05, + "loss": 6.84, + "step": 1919 + }, + { + "epoch": 0.18514946962391515, + "grad_norm": 4.288477420806885, + "learning_rate": 4.5892896754457957e-05, + "loss": 6.8882, + "step": 1920 + }, + { + "epoch": 0.18524590163934426, + "grad_norm": 3.5342321395874023, + "learning_rate": 4.588873657796858e-05, + "loss": 6.705, + "step": 1921 + }, + { + "epoch": 0.18534233365477337, + "grad_norm": 4.417941093444824, + "learning_rate": 4.588457448433667e-05, + "loss": 6.6816, + "step": 1922 + }, + { + "epoch": 0.1854387656702025, + "grad_norm": 3.652714490890503, + "learning_rate": 4.588041047394424e-05, + "loss": 6.5709, + "step": 1923 + }, + { + "epoch": 0.18553519768563162, + "grad_norm": 3.7423577308654785, + "learning_rate": 4.587624454717341e-05, + "loss": 6.8221, + "step": 1924 + }, + { + "epoch": 0.18563162970106076, + "grad_norm": 4.479109287261963, + "learning_rate": 4.587207670440656e-05, + "loss": 6.7188, + "step": 1925 + }, + { + "epoch": 0.18572806171648987, + "grad_norm": 4.667773723602295, + "learning_rate": 4.5867906946026194e-05, + "loss": 6.7766, + "step": 1926 + }, + { + "epoch": 0.185824493731919, + "grad_norm": 4.772796630859375, + "learning_rate": 4.586373527241502e-05, + "loss": 6.7447, + "step": 1927 + }, + { + "epoch": 0.18592092574734811, + "grad_norm": 4.235954761505127, + "learning_rate": 4.58595616839559e-05, + "loss": 6.6708, + "step": 1928 + }, + { + "epoch": 0.18601735776277725, + "grad_norm": 4.129576683044434, + "learning_rate": 4.585538618103188e-05, + "loss": 6.4177, + "step": 1929 + }, + { + "epoch": 0.18611378977820636, + "grad_norm": 5.595423221588135, + "learning_rate": 4.585120876402619e-05, + "loss": 6.6691, + "step": 1930 + }, + { + "epoch": 0.1862102217936355, + "grad_norm": 4.282949447631836, + "learning_rate": 4.5847029433322224e-05, + "loss": 6.799, + "step": 1931 + }, + { + "epoch": 0.1863066538090646, + "grad_norm": 4.0319132804870605, + "learning_rate": 4.584284818930355e-05, + "loss": 6.6476, + "step": 1932 + }, + { + "epoch": 0.18640308582449372, + "grad_norm": 3.9748241901397705, + "learning_rate": 4.5838665032353924e-05, + "loss": 6.6369, + "step": 1933 + }, + { + "epoch": 0.18649951783992286, + "grad_norm": 4.843644142150879, + "learning_rate": 4.5834479962857275e-05, + "loss": 6.7178, + "step": 1934 + }, + { + "epoch": 0.18659594985535197, + "grad_norm": 2.606294631958008, + "learning_rate": 4.5830292981197686e-05, + "loss": 6.7625, + "step": 1935 + }, + { + "epoch": 0.1866923818707811, + "grad_norm": 3.0530929565429688, + "learning_rate": 4.582610408775946e-05, + "loss": 6.6757, + "step": 1936 + }, + { + "epoch": 0.18678881388621021, + "grad_norm": 2.983153820037842, + "learning_rate": 4.582191328292702e-05, + "loss": 6.6844, + "step": 1937 + }, + { + "epoch": 0.18688524590163935, + "grad_norm": 3.6324238777160645, + "learning_rate": 4.5817720567085016e-05, + "loss": 6.7209, + "step": 1938 + }, + { + "epoch": 0.18698167791706846, + "grad_norm": 2.775887966156006, + "learning_rate": 4.5813525940618236e-05, + "loss": 6.7004, + "step": 1939 + }, + { + "epoch": 0.1870781099324976, + "grad_norm": 3.4130659103393555, + "learning_rate": 4.580932940391166e-05, + "loss": 6.6234, + "step": 1940 + }, + { + "epoch": 0.1871745419479267, + "grad_norm": 2.4547019004821777, + "learning_rate": 4.580513095735045e-05, + "loss": 6.5176, + "step": 1941 + }, + { + "epoch": 0.18727097396335585, + "grad_norm": 3.3145437240600586, + "learning_rate": 4.5800930601319924e-05, + "loss": 6.6146, + "step": 1942 + }, + { + "epoch": 0.18736740597878496, + "grad_norm": 2.4775993824005127, + "learning_rate": 4.579672833620559e-05, + "loss": 6.8278, + "step": 1943 + }, + { + "epoch": 0.18746383799421407, + "grad_norm": 2.6371209621429443, + "learning_rate": 4.579252416239313e-05, + "loss": 6.6683, + "step": 1944 + }, + { + "epoch": 0.1875602700096432, + "grad_norm": 2.589205741882324, + "learning_rate": 4.5788318080268385e-05, + "loss": 6.567, + "step": 1945 + }, + { + "epoch": 0.18765670202507231, + "grad_norm": 2.518470525741577, + "learning_rate": 4.57841100902174e-05, + "loss": 6.7148, + "step": 1946 + }, + { + "epoch": 0.18775313404050145, + "grad_norm": 3.1604647636413574, + "learning_rate": 4.577990019262637e-05, + "loss": 6.6349, + "step": 1947 + }, + { + "epoch": 0.18784956605593056, + "grad_norm": 2.5713584423065186, + "learning_rate": 4.577568838788168e-05, + "loss": 6.9214, + "step": 1948 + }, + { + "epoch": 0.1879459980713597, + "grad_norm": 2.8227789402008057, + "learning_rate": 4.577147467636987e-05, + "loss": 6.6627, + "step": 1949 + }, + { + "epoch": 0.1880424300867888, + "grad_norm": 3.509096384048462, + "learning_rate": 4.5767259058477683e-05, + "loss": 6.6254, + "step": 1950 + }, + { + "epoch": 0.18813886210221795, + "grad_norm": 4.060064792633057, + "learning_rate": 4.5763041534592025e-05, + "loss": 6.5094, + "step": 1951 + }, + { + "epoch": 0.18823529411764706, + "grad_norm": 2.733124256134033, + "learning_rate": 4.575882210509997e-05, + "loss": 6.4949, + "step": 1952 + }, + { + "epoch": 0.1883317261330762, + "grad_norm": 2.22452449798584, + "learning_rate": 4.575460077038877e-05, + "loss": 6.4687, + "step": 1953 + }, + { + "epoch": 0.1884281581485053, + "grad_norm": 3.78707218170166, + "learning_rate": 4.5750377530845853e-05, + "loss": 6.5389, + "step": 1954 + }, + { + "epoch": 0.1885245901639344, + "grad_norm": 2.3839995861053467, + "learning_rate": 4.5746152386858834e-05, + "loss": 6.566, + "step": 1955 + }, + { + "epoch": 0.18862102217936355, + "grad_norm": 3.407989978790283, + "learning_rate": 4.5741925338815474e-05, + "loss": 6.2773, + "step": 1956 + }, + { + "epoch": 0.18871745419479266, + "grad_norm": 2.357642889022827, + "learning_rate": 4.573769638710374e-05, + "loss": 6.6488, + "step": 1957 + }, + { + "epoch": 0.1888138862102218, + "grad_norm": 2.9385507106781006, + "learning_rate": 4.573346553211175e-05, + "loss": 6.8081, + "step": 1958 + }, + { + "epoch": 0.1889103182256509, + "grad_norm": 2.3021044731140137, + "learning_rate": 4.5729232774227826e-05, + "loss": 6.7857, + "step": 1959 + }, + { + "epoch": 0.18900675024108005, + "grad_norm": 1.6245763301849365, + "learning_rate": 4.572499811384042e-05, + "loss": 6.81, + "step": 1960 + }, + { + "epoch": 0.18910318225650916, + "grad_norm": 3.0398809909820557, + "learning_rate": 4.5720761551338195e-05, + "loss": 6.5986, + "step": 1961 + }, + { + "epoch": 0.1891996142719383, + "grad_norm": 2.206817626953125, + "learning_rate": 4.5716523087109984e-05, + "loss": 6.6095, + "step": 1962 + }, + { + "epoch": 0.1892960462873674, + "grad_norm": 1.7520995140075684, + "learning_rate": 4.571228272154478e-05, + "loss": 6.4235, + "step": 1963 + }, + { + "epoch": 0.18939247830279654, + "grad_norm": 3.457282543182373, + "learning_rate": 4.5708040455031764e-05, + "loss": 6.6134, + "step": 1964 + }, + { + "epoch": 0.18948891031822565, + "grad_norm": 2.514634370803833, + "learning_rate": 4.570379628796028e-05, + "loss": 6.7416, + "step": 1965 + }, + { + "epoch": 0.18958534233365476, + "grad_norm": 2.1677279472351074, + "learning_rate": 4.569955022071985e-05, + "loss": 6.5688, + "step": 1966 + }, + { + "epoch": 0.1896817743490839, + "grad_norm": 3.5108561515808105, + "learning_rate": 4.569530225370018e-05, + "loss": 6.5062, + "step": 1967 + }, + { + "epoch": 0.189778206364513, + "grad_norm": 3.1009058952331543, + "learning_rate": 4.569105238729115e-05, + "loss": 6.6084, + "step": 1968 + }, + { + "epoch": 0.18987463837994215, + "grad_norm": 4.551214694976807, + "learning_rate": 4.568680062188278e-05, + "loss": 6.5767, + "step": 1969 + }, + { + "epoch": 0.18997107039537126, + "grad_norm": 2.9119515419006348, + "learning_rate": 4.5682546957865326e-05, + "loss": 6.4599, + "step": 1970 + }, + { + "epoch": 0.1900675024108004, + "grad_norm": 3.3139243125915527, + "learning_rate": 4.5678291395629155e-05, + "loss": 6.5936, + "step": 1971 + }, + { + "epoch": 0.1901639344262295, + "grad_norm": 5.15049409866333, + "learning_rate": 4.5674033935564865e-05, + "loss": 6.4971, + "step": 1972 + }, + { + "epoch": 0.19026036644165864, + "grad_norm": 5.516653537750244, + "learning_rate": 4.5669774578063174e-05, + "loss": 6.2912, + "step": 1973 + }, + { + "epoch": 0.19035679845708775, + "grad_norm": 4.262955665588379, + "learning_rate": 4.566551332351502e-05, + "loss": 5.648, + "step": 1974 + }, + { + "epoch": 0.1904532304725169, + "grad_norm": 3.425905704498291, + "learning_rate": 4.5661250172311485e-05, + "loss": 5.6219, + "step": 1975 + }, + { + "epoch": 0.190549662487946, + "grad_norm": 4.45957612991333, + "learning_rate": 4.565698512484384e-05, + "loss": 5.6248, + "step": 1976 + }, + { + "epoch": 0.1906460945033751, + "grad_norm": 4.143961429595947, + "learning_rate": 4.565271818150353e-05, + "loss": 6.3773, + "step": 1977 + }, + { + "epoch": 0.19074252651880425, + "grad_norm": 3.76819109916687, + "learning_rate": 4.564844934268216e-05, + "loss": 6.7065, + "step": 1978 + }, + { + "epoch": 0.19083895853423335, + "grad_norm": 2.3572380542755127, + "learning_rate": 4.564417860877152e-05, + "loss": 6.6825, + "step": 1979 + }, + { + "epoch": 0.1909353905496625, + "grad_norm": 4.021049499511719, + "learning_rate": 4.563990598016358e-05, + "loss": 6.6536, + "step": 1980 + }, + { + "epoch": 0.1910318225650916, + "grad_norm": 4.5370402336120605, + "learning_rate": 4.5635631457250474e-05, + "loss": 6.9506, + "step": 1981 + }, + { + "epoch": 0.19112825458052074, + "grad_norm": 5.122212886810303, + "learning_rate": 4.563135504042451e-05, + "loss": 7.0367, + "step": 1982 + }, + { + "epoch": 0.19122468659594985, + "grad_norm": 3.729797840118408, + "learning_rate": 4.5627076730078165e-05, + "loss": 7.0163, + "step": 1983 + }, + { + "epoch": 0.191321118611379, + "grad_norm": 4.8285346031188965, + "learning_rate": 4.562279652660412e-05, + "loss": 6.6561, + "step": 1984 + }, + { + "epoch": 0.1914175506268081, + "grad_norm": 5.3200554847717285, + "learning_rate": 4.561851443039519e-05, + "loss": 6.8546, + "step": 1985 + }, + { + "epoch": 0.19151398264223723, + "grad_norm": 3.3440425395965576, + "learning_rate": 4.561423044184437e-05, + "loss": 6.7345, + "step": 1986 + }, + { + "epoch": 0.19161041465766634, + "grad_norm": 2.9901301860809326, + "learning_rate": 4.560994456134487e-05, + "loss": 6.709, + "step": 1987 + }, + { + "epoch": 0.19170684667309545, + "grad_norm": 5.58203125, + "learning_rate": 4.560565678929001e-05, + "loss": 6.7538, + "step": 1988 + }, + { + "epoch": 0.1918032786885246, + "grad_norm": 4.698734760284424, + "learning_rate": 4.5601367126073336e-05, + "loss": 6.9604, + "step": 1989 + }, + { + "epoch": 0.1918997107039537, + "grad_norm": 2.5897645950317383, + "learning_rate": 4.559707557208854e-05, + "loss": 6.6261, + "step": 1990 + }, + { + "epoch": 0.19199614271938284, + "grad_norm": 4.499833583831787, + "learning_rate": 4.5592782127729505e-05, + "loss": 6.6162, + "step": 1991 + }, + { + "epoch": 0.19209257473481195, + "grad_norm": 5.535737991333008, + "learning_rate": 4.558848679339027e-05, + "loss": 6.7608, + "step": 1992 + }, + { + "epoch": 0.1921890067502411, + "grad_norm": 2.9236104488372803, + "learning_rate": 4.5584189569465054e-05, + "loss": 6.497, + "step": 1993 + }, + { + "epoch": 0.1922854387656702, + "grad_norm": 2.1590728759765625, + "learning_rate": 4.557989045634826e-05, + "loss": 6.6363, + "step": 1994 + }, + { + "epoch": 0.19238187078109933, + "grad_norm": 2.854942798614502, + "learning_rate": 4.5575589454434444e-05, + "loss": 6.852, + "step": 1995 + }, + { + "epoch": 0.19247830279652844, + "grad_norm": 2.327202796936035, + "learning_rate": 4.557128656411836e-05, + "loss": 6.2995, + "step": 1996 + }, + { + "epoch": 0.19257473481195758, + "grad_norm": 2.562225580215454, + "learning_rate": 4.556698178579491e-05, + "loss": 6.5688, + "step": 1997 + }, + { + "epoch": 0.1926711668273867, + "grad_norm": 4.050666809082031, + "learning_rate": 4.556267511985919e-05, + "loss": 6.5336, + "step": 1998 + }, + { + "epoch": 0.1927675988428158, + "grad_norm": 2.833083152770996, + "learning_rate": 4.5558366566706446e-05, + "loss": 6.5811, + "step": 1999 + }, + { + "epoch": 0.19286403085824494, + "grad_norm": 4.567006587982178, + "learning_rate": 4.5554056126732125e-05, + "loss": 6.4705, + "step": 2000 + }, + { + "epoch": 0.19296046287367405, + "grad_norm": 3.991163492202759, + "learning_rate": 4.554974380033183e-05, + "loss": 6.5175, + "step": 2001 + }, + { + "epoch": 0.1930568948891032, + "grad_norm": 4.607915878295898, + "learning_rate": 4.554542958790135e-05, + "loss": 6.5381, + "step": 2002 + }, + { + "epoch": 0.1931533269045323, + "grad_norm": 5.254535675048828, + "learning_rate": 4.5541113489836625e-05, + "loss": 6.6731, + "step": 2003 + }, + { + "epoch": 0.19324975891996143, + "grad_norm": 3.7127881050109863, + "learning_rate": 4.553679550653378e-05, + "loss": 6.4104, + "step": 2004 + }, + { + "epoch": 0.19334619093539054, + "grad_norm": 3.5790226459503174, + "learning_rate": 4.5532475638389125e-05, + "loss": 6.6684, + "step": 2005 + }, + { + "epoch": 0.19344262295081968, + "grad_norm": 4.493264198303223, + "learning_rate": 4.552815388579913e-05, + "loss": 6.4826, + "step": 2006 + }, + { + "epoch": 0.1935390549662488, + "grad_norm": 4.028301239013672, + "learning_rate": 4.5523830249160435e-05, + "loss": 6.383, + "step": 2007 + }, + { + "epoch": 0.19363548698167793, + "grad_norm": 3.8462374210357666, + "learning_rate": 4.551950472886986e-05, + "loss": 6.3707, + "step": 2008 + }, + { + "epoch": 0.19373191899710704, + "grad_norm": 3.2923877239227295, + "learning_rate": 4.55151773253244e-05, + "loss": 6.0547, + "step": 2009 + }, + { + "epoch": 0.19382835101253615, + "grad_norm": 2.3692831993103027, + "learning_rate": 4.551084803892121e-05, + "loss": 6.2343, + "step": 2010 + }, + { + "epoch": 0.1939247830279653, + "grad_norm": 2.894170045852661, + "learning_rate": 4.550651687005763e-05, + "loss": 6.5131, + "step": 2011 + }, + { + "epoch": 0.1940212150433944, + "grad_norm": 2.807115077972412, + "learning_rate": 4.550218381913118e-05, + "loss": 6.4702, + "step": 2012 + }, + { + "epoch": 0.19411764705882353, + "grad_norm": 3.0858802795410156, + "learning_rate": 4.549784888653952e-05, + "loss": 6.3201, + "step": 2013 + }, + { + "epoch": 0.19421407907425264, + "grad_norm": 2.5225064754486084, + "learning_rate": 4.5493512072680536e-05, + "loss": 6.3584, + "step": 2014 + }, + { + "epoch": 0.19431051108968178, + "grad_norm": 3.609409809112549, + "learning_rate": 4.5489173377952224e-05, + "loss": 6.6658, + "step": 2015 + }, + { + "epoch": 0.1944069431051109, + "grad_norm": 2.863551139831543, + "learning_rate": 4.54848328027528e-05, + "loss": 6.6208, + "step": 2016 + }, + { + "epoch": 0.19450337512054003, + "grad_norm": 3.3810954093933105, + "learning_rate": 4.548049034748063e-05, + "loss": 6.6655, + "step": 2017 + }, + { + "epoch": 0.19459980713596914, + "grad_norm": 1.905274510383606, + "learning_rate": 4.5476146012534274e-05, + "loss": 6.6247, + "step": 2018 + }, + { + "epoch": 0.19469623915139828, + "grad_norm": 2.6747255325317383, + "learning_rate": 4.547179979831243e-05, + "loss": 6.7691, + "step": 2019 + }, + { + "epoch": 0.19479267116682739, + "grad_norm": 4.00029182434082, + "learning_rate": 4.5467451705214e-05, + "loss": 6.7112, + "step": 2020 + }, + { + "epoch": 0.1948891031822565, + "grad_norm": 4.100391387939453, + "learning_rate": 4.546310173363805e-05, + "loss": 6.8658, + "step": 2021 + }, + { + "epoch": 0.19498553519768563, + "grad_norm": 2.729137659072876, + "learning_rate": 4.54587498839838e-05, + "loss": 6.9534, + "step": 2022 + }, + { + "epoch": 0.19508196721311474, + "grad_norm": 3.7016916275024414, + "learning_rate": 4.5454396156650665e-05, + "loss": 6.7122, + "step": 2023 + }, + { + "epoch": 0.19517839922854388, + "grad_norm": 4.539555072784424, + "learning_rate": 4.545004055203823e-05, + "loss": 6.4312, + "step": 2024 + }, + { + "epoch": 0.195274831243973, + "grad_norm": 3.1200063228607178, + "learning_rate": 4.544568307054624e-05, + "loss": 6.5124, + "step": 2025 + }, + { + "epoch": 0.19537126325940213, + "grad_norm": 2.894911527633667, + "learning_rate": 4.544132371257463e-05, + "loss": 6.5548, + "step": 2026 + }, + { + "epoch": 0.19546769527483124, + "grad_norm": 4.085165500640869, + "learning_rate": 4.543696247852348e-05, + "loss": 6.5023, + "step": 2027 + }, + { + "epoch": 0.19556412729026038, + "grad_norm": 3.491859197616577, + "learning_rate": 4.543259936879307e-05, + "loss": 6.3684, + "step": 2028 + }, + { + "epoch": 0.19566055930568949, + "grad_norm": 3.6481053829193115, + "learning_rate": 4.542823438378384e-05, + "loss": 6.5922, + "step": 2029 + }, + { + "epoch": 0.19575699132111862, + "grad_norm": 4.157027244567871, + "learning_rate": 4.542386752389641e-05, + "loss": 6.5923, + "step": 2030 + }, + { + "epoch": 0.19585342333654773, + "grad_norm": 4.627676963806152, + "learning_rate": 4.541949878953154e-05, + "loss": 5.6628, + "step": 2031 + }, + { + "epoch": 0.19594985535197684, + "grad_norm": 3.5895862579345703, + "learning_rate": 4.541512818109022e-05, + "loss": 5.803, + "step": 2032 + }, + { + "epoch": 0.19604628736740598, + "grad_norm": 3.699622869491577, + "learning_rate": 4.541075569897355e-05, + "loss": 6.4636, + "step": 2033 + }, + { + "epoch": 0.1961427193828351, + "grad_norm": 4.402612209320068, + "learning_rate": 4.540638134358285e-05, + "loss": 6.7638, + "step": 2034 + }, + { + "epoch": 0.19623915139826423, + "grad_norm": 3.627237319946289, + "learning_rate": 4.540200511531958e-05, + "loss": 6.7084, + "step": 2035 + }, + { + "epoch": 0.19633558341369334, + "grad_norm": 3.0867578983306885, + "learning_rate": 4.53976270145854e-05, + "loss": 6.7073, + "step": 2036 + }, + { + "epoch": 0.19643201542912248, + "grad_norm": 2.2256317138671875, + "learning_rate": 4.539324704178211e-05, + "loss": 6.8035, + "step": 2037 + }, + { + "epoch": 0.19652844744455158, + "grad_norm": 3.0736629962921143, + "learning_rate": 4.53888651973117e-05, + "loss": 6.5303, + "step": 2038 + }, + { + "epoch": 0.19662487945998072, + "grad_norm": 3.5741395950317383, + "learning_rate": 4.538448148157636e-05, + "loss": 6.6534, + "step": 2039 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 2.9140939712524414, + "learning_rate": 4.538009589497838e-05, + "loss": 6.6298, + "step": 2040 + }, + { + "epoch": 0.19681774349083897, + "grad_norm": 2.582388401031494, + "learning_rate": 4.5375708437920284e-05, + "loss": 6.5443, + "step": 2041 + }, + { + "epoch": 0.19691417550626808, + "grad_norm": 3.2238028049468994, + "learning_rate": 4.537131911080475e-05, + "loss": 6.4485, + "step": 2042 + }, + { + "epoch": 0.1970106075216972, + "grad_norm": 2.423248767852783, + "learning_rate": 4.5366927914034624e-05, + "loss": 6.6733, + "step": 2043 + }, + { + "epoch": 0.19710703953712633, + "grad_norm": 3.4016411304473877, + "learning_rate": 4.5362534848012915e-05, + "loss": 6.6134, + "step": 2044 + }, + { + "epoch": 0.19720347155255544, + "grad_norm": 2.785555124282837, + "learning_rate": 4.535813991314283e-05, + "loss": 6.6403, + "step": 2045 + }, + { + "epoch": 0.19729990356798457, + "grad_norm": 2.203758716583252, + "learning_rate": 4.5353743109827705e-05, + "loss": 6.3836, + "step": 2046 + }, + { + "epoch": 0.19739633558341368, + "grad_norm": 2.867201089859009, + "learning_rate": 4.5349344438471094e-05, + "loss": 6.4544, + "step": 2047 + }, + { + "epoch": 0.19749276759884282, + "grad_norm": 3.0166027545928955, + "learning_rate": 4.534494389947671e-05, + "loss": 6.6013, + "step": 2048 + }, + { + "epoch": 0.19758919961427193, + "grad_norm": 2.2489960193634033, + "learning_rate": 4.53405414932484e-05, + "loss": 6.5817, + "step": 2049 + }, + { + "epoch": 0.19768563162970107, + "grad_norm": 3.2869324684143066, + "learning_rate": 4.5336137220190235e-05, + "loss": 6.3942, + "step": 2050 + }, + { + "epoch": 0.19778206364513018, + "grad_norm": 3.1288156509399414, + "learning_rate": 4.533173108070643e-05, + "loss": 6.5126, + "step": 2051 + }, + { + "epoch": 0.19787849566055932, + "grad_norm": 2.940187931060791, + "learning_rate": 4.532732307520136e-05, + "loss": 6.4911, + "step": 2052 + }, + { + "epoch": 0.19797492767598843, + "grad_norm": 3.152571201324463, + "learning_rate": 4.53229132040796e-05, + "loss": 6.6092, + "step": 2053 + }, + { + "epoch": 0.19807135969141756, + "grad_norm": 2.7970962524414062, + "learning_rate": 4.531850146774588e-05, + "loss": 6.6313, + "step": 2054 + }, + { + "epoch": 0.19816779170684667, + "grad_norm": 3.7319588661193848, + "learning_rate": 4.5314087866605105e-05, + "loss": 6.5126, + "step": 2055 + }, + { + "epoch": 0.19826422372227578, + "grad_norm": 4.643565654754639, + "learning_rate": 4.530967240106234e-05, + "loss": 6.8425, + "step": 2056 + }, + { + "epoch": 0.19836065573770492, + "grad_norm": 3.87626051902771, + "learning_rate": 4.5305255071522844e-05, + "loss": 6.8492, + "step": 2057 + }, + { + "epoch": 0.19845708775313403, + "grad_norm": 4.3806586265563965, + "learning_rate": 4.530083587839203e-05, + "loss": 6.5644, + "step": 2058 + }, + { + "epoch": 0.19855351976856317, + "grad_norm": 4.895103454589844, + "learning_rate": 4.5296414822075495e-05, + "loss": 6.7231, + "step": 2059 + }, + { + "epoch": 0.19864995178399228, + "grad_norm": 2.4937100410461426, + "learning_rate": 4.529199190297898e-05, + "loss": 6.7888, + "step": 2060 + }, + { + "epoch": 0.19874638379942142, + "grad_norm": 2.594719648361206, + "learning_rate": 4.528756712150842e-05, + "loss": 6.7039, + "step": 2061 + }, + { + "epoch": 0.19884281581485053, + "grad_norm": 3.9749372005462646, + "learning_rate": 4.5283140478069925e-05, + "loss": 6.7096, + "step": 2062 + }, + { + "epoch": 0.19893924783027966, + "grad_norm": 2.7939822673797607, + "learning_rate": 4.5278711973069757e-05, + "loss": 6.5651, + "step": 2063 + }, + { + "epoch": 0.19903567984570877, + "grad_norm": 3.4548869132995605, + "learning_rate": 4.527428160691437e-05, + "loss": 6.5307, + "step": 2064 + }, + { + "epoch": 0.1991321118611379, + "grad_norm": 3.073849678039551, + "learning_rate": 4.526984938001037e-05, + "loss": 6.4809, + "step": 2065 + }, + { + "epoch": 0.19922854387656702, + "grad_norm": 3.377626419067383, + "learning_rate": 4.5265415292764534e-05, + "loss": 6.8087, + "step": 2066 + }, + { + "epoch": 0.19932497589199613, + "grad_norm": 4.838515281677246, + "learning_rate": 4.526097934558383e-05, + "loss": 6.8392, + "step": 2067 + }, + { + "epoch": 0.19942140790742527, + "grad_norm": 4.268012523651123, + "learning_rate": 4.5256541538875376e-05, + "loss": 6.571, + "step": 2068 + }, + { + "epoch": 0.19951783992285438, + "grad_norm": 3.374300479888916, + "learning_rate": 4.525210187304647e-05, + "loss": 6.6774, + "step": 2069 + }, + { + "epoch": 0.19961427193828352, + "grad_norm": 2.8668384552001953, + "learning_rate": 4.5247660348504585e-05, + "loss": 6.3371, + "step": 2070 + }, + { + "epoch": 0.19971070395371263, + "grad_norm": 3.827153444290161, + "learning_rate": 4.5243216965657355e-05, + "loss": 6.0833, + "step": 2071 + }, + { + "epoch": 0.19980713596914176, + "grad_norm": 3.606536865234375, + "learning_rate": 4.523877172491259e-05, + "loss": 6.4895, + "step": 2072 + }, + { + "epoch": 0.19990356798457087, + "grad_norm": 2.7865512371063232, + "learning_rate": 4.523432462667826e-05, + "loss": 6.6402, + "step": 2073 + }, + { + "epoch": 0.2, + "grad_norm": 3.5975770950317383, + "learning_rate": 4.522987567136252e-05, + "loss": 6.458, + "step": 2074 + }, + { + "epoch": 0.20009643201542912, + "grad_norm": 4.933679580688477, + "learning_rate": 4.522542485937369e-05, + "loss": 6.5904, + "step": 2075 + }, + { + "epoch": 0.20019286403085826, + "grad_norm": 3.69720458984375, + "learning_rate": 4.522097219112026e-05, + "loss": 6.6966, + "step": 2076 + }, + { + "epoch": 0.20028929604628737, + "grad_norm": 3.0982682704925537, + "learning_rate": 4.52165176670109e-05, + "loss": 6.6845, + "step": 2077 + }, + { + "epoch": 0.20038572806171648, + "grad_norm": 4.26364803314209, + "learning_rate": 4.521206128745441e-05, + "loss": 6.5934, + "step": 2078 + }, + { + "epoch": 0.20048216007714562, + "grad_norm": 4.481204986572266, + "learning_rate": 4.520760305285983e-05, + "loss": 6.7587, + "step": 2079 + }, + { + "epoch": 0.20057859209257473, + "grad_norm": 2.494863748550415, + "learning_rate": 4.5203142963636305e-05, + "loss": 6.7301, + "step": 2080 + }, + { + "epoch": 0.20067502410800386, + "grad_norm": 3.8369510173797607, + "learning_rate": 4.519868102019319e-05, + "loss": 6.2318, + "step": 2081 + }, + { + "epoch": 0.20077145612343297, + "grad_norm": 3.287707805633545, + "learning_rate": 4.519421722293998e-05, + "loss": 5.9975, + "step": 2082 + }, + { + "epoch": 0.2008678881388621, + "grad_norm": 3.8482697010040283, + "learning_rate": 4.5189751572286384e-05, + "loss": 6.7232, + "step": 2083 + }, + { + "epoch": 0.20096432015429122, + "grad_norm": 2.403101682662964, + "learning_rate": 4.518528406864223e-05, + "loss": 6.4786, + "step": 2084 + }, + { + "epoch": 0.20106075216972036, + "grad_norm": 2.4170384407043457, + "learning_rate": 4.518081471241755e-05, + "loss": 6.6495, + "step": 2085 + }, + { + "epoch": 0.20115718418514947, + "grad_norm": 2.6750199794769287, + "learning_rate": 4.517634350402253e-05, + "loss": 6.4094, + "step": 2086 + }, + { + "epoch": 0.2012536162005786, + "grad_norm": 2.404757022857666, + "learning_rate": 4.517187044386755e-05, + "loss": 6.6056, + "step": 2087 + }, + { + "epoch": 0.20135004821600772, + "grad_norm": 2.3447446823120117, + "learning_rate": 4.516739553236311e-05, + "loss": 6.5025, + "step": 2088 + }, + { + "epoch": 0.20144648023143683, + "grad_norm": 3.1715314388275146, + "learning_rate": 4.516291876991994e-05, + "loss": 6.4843, + "step": 2089 + }, + { + "epoch": 0.20154291224686596, + "grad_norm": 2.7318830490112305, + "learning_rate": 4.51584401569489e-05, + "loss": 6.8331, + "step": 2090 + }, + { + "epoch": 0.20163934426229507, + "grad_norm": 2.349592924118042, + "learning_rate": 4.5153959693861035e-05, + "loss": 6.7002, + "step": 2091 + }, + { + "epoch": 0.2017357762777242, + "grad_norm": 3.5696449279785156, + "learning_rate": 4.514947738106755e-05, + "loss": 6.5946, + "step": 2092 + }, + { + "epoch": 0.20183220829315332, + "grad_norm": 3.0649101734161377, + "learning_rate": 4.514499321897984e-05, + "loss": 6.7442, + "step": 2093 + }, + { + "epoch": 0.20192864030858246, + "grad_norm": 3.342607259750366, + "learning_rate": 4.5140507208009444e-05, + "loss": 6.5863, + "step": 2094 + }, + { + "epoch": 0.20202507232401157, + "grad_norm": 2.331026554107666, + "learning_rate": 4.513601934856808e-05, + "loss": 5.8534, + "step": 2095 + }, + { + "epoch": 0.2021215043394407, + "grad_norm": 3.1116373538970947, + "learning_rate": 4.513152964106764e-05, + "loss": 6.2034, + "step": 2096 + }, + { + "epoch": 0.20221793635486981, + "grad_norm": 4.213334083557129, + "learning_rate": 4.51270380859202e-05, + "loss": 6.5313, + "step": 2097 + }, + { + "epoch": 0.20231436837029895, + "grad_norm": 2.8191540241241455, + "learning_rate": 4.512254468353797e-05, + "loss": 6.4646, + "step": 2098 + }, + { + "epoch": 0.20241080038572806, + "grad_norm": 4.324586868286133, + "learning_rate": 4.5118049434333353e-05, + "loss": 6.4979, + "step": 2099 + }, + { + "epoch": 0.20250723240115717, + "grad_norm": 4.026733875274658, + "learning_rate": 4.5113552338718924e-05, + "loss": 6.5271, + "step": 2100 + }, + { + "epoch": 0.2026036644165863, + "grad_norm": 3.263655185699463, + "learning_rate": 4.510905339710741e-05, + "loss": 6.6071, + "step": 2101 + }, + { + "epoch": 0.20270009643201542, + "grad_norm": 3.399322986602783, + "learning_rate": 4.5104552609911736e-05, + "loss": 6.6166, + "step": 2102 + }, + { + "epoch": 0.20279652844744456, + "grad_norm": 3.662987470626831, + "learning_rate": 4.510004997754497e-05, + "loss": 6.6447, + "step": 2103 + }, + { + "epoch": 0.20289296046287367, + "grad_norm": 3.5920557975769043, + "learning_rate": 4.509554550042034e-05, + "loss": 6.7144, + "step": 2104 + }, + { + "epoch": 0.2029893924783028, + "grad_norm": 3.768519878387451, + "learning_rate": 4.50910391789513e-05, + "loss": 6.7441, + "step": 2105 + }, + { + "epoch": 0.20308582449373191, + "grad_norm": 2.874025821685791, + "learning_rate": 4.508653101355139e-05, + "loss": 6.7066, + "step": 2106 + }, + { + "epoch": 0.20318225650916105, + "grad_norm": 3.296978235244751, + "learning_rate": 4.50820210046344e-05, + "loss": 6.4006, + "step": 2107 + }, + { + "epoch": 0.20327868852459016, + "grad_norm": 2.9536099433898926, + "learning_rate": 4.507750915261424e-05, + "loss": 6.513, + "step": 2108 + }, + { + "epoch": 0.2033751205400193, + "grad_norm": 3.087449789047241, + "learning_rate": 4.5072995457904995e-05, + "loss": 6.5156, + "step": 2109 + }, + { + "epoch": 0.2034715525554484, + "grad_norm": 2.6249642372131348, + "learning_rate": 4.506847992092093e-05, + "loss": 6.4129, + "step": 2110 + }, + { + "epoch": 0.20356798457087752, + "grad_norm": 2.7508199214935303, + "learning_rate": 4.5063962542076485e-05, + "loss": 6.1438, + "step": 2111 + }, + { + "epoch": 0.20366441658630666, + "grad_norm": 3.094799280166626, + "learning_rate": 4.505944332178625e-05, + "loss": 6.6583, + "step": 2112 + }, + { + "epoch": 0.20376084860173577, + "grad_norm": 2.699286699295044, + "learning_rate": 4.505492226046499e-05, + "loss": 6.5762, + "step": 2113 + }, + { + "epoch": 0.2038572806171649, + "grad_norm": 2.1447365283966064, + "learning_rate": 4.505039935852766e-05, + "loss": 6.532, + "step": 2114 + }, + { + "epoch": 0.20395371263259401, + "grad_norm": 2.4159865379333496, + "learning_rate": 4.504587461638935e-05, + "loss": 5.9925, + "step": 2115 + }, + { + "epoch": 0.20405014464802315, + "grad_norm": 2.4101197719573975, + "learning_rate": 4.504134803446534e-05, + "loss": 6.4171, + "step": 2116 + }, + { + "epoch": 0.20414657666345226, + "grad_norm": 2.439328908920288, + "learning_rate": 4.5036819613171075e-05, + "loss": 6.479, + "step": 2117 + }, + { + "epoch": 0.2042430086788814, + "grad_norm": 2.7052159309387207, + "learning_rate": 4.503228935292216e-05, + "loss": 6.2176, + "step": 2118 + }, + { + "epoch": 0.2043394406943105, + "grad_norm": 3.0648372173309326, + "learning_rate": 4.502775725413439e-05, + "loss": 6.4692, + "step": 2119 + }, + { + "epoch": 0.20443587270973965, + "grad_norm": 2.4511046409606934, + "learning_rate": 4.5023223317223714e-05, + "loss": 6.5963, + "step": 2120 + }, + { + "epoch": 0.20453230472516876, + "grad_norm": 3.2594761848449707, + "learning_rate": 4.5018687542606244e-05, + "loss": 6.9281, + "step": 2121 + }, + { + "epoch": 0.20462873674059787, + "grad_norm": 2.318676710128784, + "learning_rate": 4.501414993069827e-05, + "loss": 6.6502, + "step": 2122 + }, + { + "epoch": 0.204725168756027, + "grad_norm": 2.323570966720581, + "learning_rate": 4.500961048191625e-05, + "loss": 6.7688, + "step": 2123 + }, + { + "epoch": 0.2048216007714561, + "grad_norm": 1.914433479309082, + "learning_rate": 4.50050691966768e-05, + "loss": 6.6601, + "step": 2124 + }, + { + "epoch": 0.20491803278688525, + "grad_norm": 2.14316463470459, + "learning_rate": 4.500052607539673e-05, + "loss": 6.5058, + "step": 2125 + }, + { + "epoch": 0.20501446480231436, + "grad_norm": 2.664877414703369, + "learning_rate": 4.499598111849299e-05, + "loss": 6.406, + "step": 2126 + }, + { + "epoch": 0.2051108968177435, + "grad_norm": 3.178287982940674, + "learning_rate": 4.499143432638271e-05, + "loss": 6.7767, + "step": 2127 + }, + { + "epoch": 0.2052073288331726, + "grad_norm": 2.7382256984710693, + "learning_rate": 4.4986885699483196e-05, + "loss": 6.5727, + "step": 2128 + }, + { + "epoch": 0.20530376084860175, + "grad_norm": 3.049332857131958, + "learning_rate": 4.4982335238211916e-05, + "loss": 6.5777, + "step": 2129 + }, + { + "epoch": 0.20540019286403086, + "grad_norm": 2.654095411300659, + "learning_rate": 4.49777829429865e-05, + "loss": 6.6822, + "step": 2130 + }, + { + "epoch": 0.20549662487946, + "grad_norm": 2.896799087524414, + "learning_rate": 4.4973228814224754e-05, + "loss": 6.4678, + "step": 2131 + }, + { + "epoch": 0.2055930568948891, + "grad_norm": 3.1110734939575195, + "learning_rate": 4.496867285234464e-05, + "loss": 6.0846, + "step": 2132 + }, + { + "epoch": 0.2056894889103182, + "grad_norm": 4.314020156860352, + "learning_rate": 4.496411505776432e-05, + "loss": 6.2377, + "step": 2133 + }, + { + "epoch": 0.20578592092574735, + "grad_norm": 4.42302942276001, + "learning_rate": 4.49595554309021e-05, + "loss": 6.4553, + "step": 2134 + }, + { + "epoch": 0.20588235294117646, + "grad_norm": 3.3546712398529053, + "learning_rate": 4.495499397217644e-05, + "loss": 6.1906, + "step": 2135 + }, + { + "epoch": 0.2059787849566056, + "grad_norm": 5.0673017501831055, + "learning_rate": 4.4950430682006e-05, + "loss": 6.4003, + "step": 2136 + }, + { + "epoch": 0.2060752169720347, + "grad_norm": 4.724764347076416, + "learning_rate": 4.494586556080957e-05, + "loss": 6.5823, + "step": 2137 + }, + { + "epoch": 0.20617164898746385, + "grad_norm": 3.2950446605682373, + "learning_rate": 4.494129860900616e-05, + "loss": 6.4873, + "step": 2138 + }, + { + "epoch": 0.20626808100289296, + "grad_norm": 6.244236469268799, + "learning_rate": 4.493672982701491e-05, + "loss": 6.5275, + "step": 2139 + }, + { + "epoch": 0.2063645130183221, + "grad_norm": 7.9417595863342285, + "learning_rate": 4.4932159215255125e-05, + "loss": 6.559, + "step": 2140 + }, + { + "epoch": 0.2064609450337512, + "grad_norm": 6.701818943023682, + "learning_rate": 4.49275867741463e-05, + "loss": 6.4208, + "step": 2141 + }, + { + "epoch": 0.20655737704918034, + "grad_norm": 3.543086051940918, + "learning_rate": 4.49230125041081e-05, + "loss": 6.4625, + "step": 2142 + }, + { + "epoch": 0.20665380906460945, + "grad_norm": 4.964417934417725, + "learning_rate": 4.491843640556033e-05, + "loss": 6.7517, + "step": 2143 + }, + { + "epoch": 0.20675024108003856, + "grad_norm": 7.586320400238037, + "learning_rate": 4.4913858478922974e-05, + "loss": 6.42, + "step": 2144 + }, + { + "epoch": 0.2068466730954677, + "grad_norm": 6.736458778381348, + "learning_rate": 4.49092787246162e-05, + "loss": 6.752, + "step": 2145 + }, + { + "epoch": 0.2069431051108968, + "grad_norm": 3.8283371925354004, + "learning_rate": 4.490469714306033e-05, + "loss": 6.6511, + "step": 2146 + }, + { + "epoch": 0.20703953712632595, + "grad_norm": 4.072104454040527, + "learning_rate": 4.490011373467586e-05, + "loss": 6.3418, + "step": 2147 + }, + { + "epoch": 0.20713596914175506, + "grad_norm": 2.281553030014038, + "learning_rate": 4.489552849988344e-05, + "loss": 6.4535, + "step": 2148 + }, + { + "epoch": 0.2072324011571842, + "grad_norm": 2.8023548126220703, + "learning_rate": 4.48909414391039e-05, + "loss": 6.4889, + "step": 2149 + }, + { + "epoch": 0.2073288331726133, + "grad_norm": 4.58903169631958, + "learning_rate": 4.488635255275824e-05, + "loss": 6.6121, + "step": 2150 + }, + { + "epoch": 0.20742526518804244, + "grad_norm": 4.961113452911377, + "learning_rate": 4.488176184126761e-05, + "loss": 6.248, + "step": 2151 + }, + { + "epoch": 0.20752169720347155, + "grad_norm": 6.594388484954834, + "learning_rate": 4.487716930505336e-05, + "loss": 6.2623, + "step": 2152 + }, + { + "epoch": 0.2076181292189007, + "grad_norm": 3.5130996704101562, + "learning_rate": 4.487257494453697e-05, + "loss": 6.5511, + "step": 2153 + }, + { + "epoch": 0.2077145612343298, + "grad_norm": 3.1714110374450684, + "learning_rate": 4.486797876014012e-05, + "loss": 6.5716, + "step": 2154 + }, + { + "epoch": 0.2078109932497589, + "grad_norm": 3.1285438537597656, + "learning_rate": 4.486338075228462e-05, + "loss": 6.4092, + "step": 2155 + }, + { + "epoch": 0.20790742526518805, + "grad_norm": 4.625370502471924, + "learning_rate": 4.485878092139249e-05, + "loss": 6.501, + "step": 2156 + }, + { + "epoch": 0.20800385728061715, + "grad_norm": 3.7050068378448486, + "learning_rate": 4.485417926788589e-05, + "loss": 6.4798, + "step": 2157 + }, + { + "epoch": 0.2081002892960463, + "grad_norm": 2.8934216499328613, + "learning_rate": 4.484957579218715e-05, + "loss": 6.4004, + "step": 2158 + }, + { + "epoch": 0.2081967213114754, + "grad_norm": 3.7094790935516357, + "learning_rate": 4.4844970494718774e-05, + "loss": 6.3886, + "step": 2159 + }, + { + "epoch": 0.20829315332690454, + "grad_norm": 3.958561658859253, + "learning_rate": 4.484036337590343e-05, + "loss": 6.4459, + "step": 2160 + }, + { + "epoch": 0.20838958534233365, + "grad_norm": 3.4779932498931885, + "learning_rate": 4.483575443616396e-05, + "loss": 6.4553, + "step": 2161 + }, + { + "epoch": 0.2084860173577628, + "grad_norm": 4.004011154174805, + "learning_rate": 4.483114367592336e-05, + "loss": 6.504, + "step": 2162 + }, + { + "epoch": 0.2085824493731919, + "grad_norm": 3.2009835243225098, + "learning_rate": 4.48265310956048e-05, + "loss": 6.4705, + "step": 2163 + }, + { + "epoch": 0.20867888138862103, + "grad_norm": 2.4645557403564453, + "learning_rate": 4.482191669563162e-05, + "loss": 6.4769, + "step": 2164 + }, + { + "epoch": 0.20877531340405014, + "grad_norm": 4.428523063659668, + "learning_rate": 4.481730047642732e-05, + "loss": 6.6629, + "step": 2165 + }, + { + "epoch": 0.20887174541947925, + "grad_norm": 2.7363533973693848, + "learning_rate": 4.481268243841557e-05, + "loss": 6.6612, + "step": 2166 + }, + { + "epoch": 0.2089681774349084, + "grad_norm": 2.7093288898468018, + "learning_rate": 4.4808062582020216e-05, + "loss": 6.5762, + "step": 2167 + }, + { + "epoch": 0.2090646094503375, + "grad_norm": 4.037708282470703, + "learning_rate": 4.480344090766526e-05, + "loss": 6.7028, + "step": 2168 + }, + { + "epoch": 0.20916104146576664, + "grad_norm": 2.9549689292907715, + "learning_rate": 4.4798817415774865e-05, + "loss": 6.5832, + "step": 2169 + }, + { + "epoch": 0.20925747348119575, + "grad_norm": 3.3252522945404053, + "learning_rate": 4.479419210677338e-05, + "loss": 6.4351, + "step": 2170 + }, + { + "epoch": 0.2093539054966249, + "grad_norm": 3.2014384269714355, + "learning_rate": 4.478956498108531e-05, + "loss": 6.3888, + "step": 2171 + }, + { + "epoch": 0.209450337512054, + "grad_norm": 5.328127384185791, + "learning_rate": 4.478493603913532e-05, + "loss": 6.5229, + "step": 2172 + }, + { + "epoch": 0.20954676952748313, + "grad_norm": 4.388777732849121, + "learning_rate": 4.478030528134825e-05, + "loss": 6.4548, + "step": 2173 + }, + { + "epoch": 0.20964320154291224, + "grad_norm": 3.2948849201202393, + "learning_rate": 4.477567270814911e-05, + "loss": 6.4526, + "step": 2174 + }, + { + "epoch": 0.20973963355834138, + "grad_norm": 3.274418354034424, + "learning_rate": 4.477103831996306e-05, + "loss": 6.4904, + "step": 2175 + }, + { + "epoch": 0.2098360655737705, + "grad_norm": 4.241164207458496, + "learning_rate": 4.476640211721545e-05, + "loss": 6.4627, + "step": 2176 + }, + { + "epoch": 0.2099324975891996, + "grad_norm": 2.618497848510742, + "learning_rate": 4.4761764100331795e-05, + "loss": 6.4521, + "step": 2177 + }, + { + "epoch": 0.21002892960462874, + "grad_norm": 3.1193947792053223, + "learning_rate": 4.475712426973775e-05, + "loss": 6.407, + "step": 2178 + }, + { + "epoch": 0.21012536162005785, + "grad_norm": 3.3123257160186768, + "learning_rate": 4.4752482625859146e-05, + "loss": 6.4425, + "step": 2179 + }, + { + "epoch": 0.210221793635487, + "grad_norm": 2.4604201316833496, + "learning_rate": 4.4747839169122005e-05, + "loss": 6.3673, + "step": 2180 + }, + { + "epoch": 0.2103182256509161, + "grad_norm": 3.012957811355591, + "learning_rate": 4.474319389995249e-05, + "loss": 6.3277, + "step": 2181 + }, + { + "epoch": 0.21041465766634523, + "grad_norm": 5.974411964416504, + "learning_rate": 4.4738546818776936e-05, + "loss": 6.3073, + "step": 2182 + }, + { + "epoch": 0.21051108968177434, + "grad_norm": 3.7672417163848877, + "learning_rate": 4.473389792602185e-05, + "loss": 6.4692, + "step": 2183 + }, + { + "epoch": 0.21060752169720348, + "grad_norm": 3.7732841968536377, + "learning_rate": 4.47292472221139e-05, + "loss": 6.5, + "step": 2184 + }, + { + "epoch": 0.2107039537126326, + "grad_norm": 4.064417839050293, + "learning_rate": 4.4724594707479925e-05, + "loss": 6.3761, + "step": 2185 + }, + { + "epoch": 0.21080038572806173, + "grad_norm": 3.834129810333252, + "learning_rate": 4.4719940382546935e-05, + "loss": 6.6753, + "step": 2186 + }, + { + "epoch": 0.21089681774349084, + "grad_norm": 2.817426919937134, + "learning_rate": 4.471528424774207e-05, + "loss": 6.92, + "step": 2187 + }, + { + "epoch": 0.21099324975891995, + "grad_norm": 4.752101421356201, + "learning_rate": 4.4710626303492694e-05, + "loss": 6.6946, + "step": 2188 + }, + { + "epoch": 0.21108968177434909, + "grad_norm": 5.609659671783447, + "learning_rate": 4.470596655022628e-05, + "loss": 6.7145, + "step": 2189 + }, + { + "epoch": 0.2111861137897782, + "grad_norm": 4.46241569519043, + "learning_rate": 4.470130498837053e-05, + "loss": 6.7286, + "step": 2190 + }, + { + "epoch": 0.21128254580520733, + "grad_norm": 4.030667781829834, + "learning_rate": 4.469664161835325e-05, + "loss": 6.7165, + "step": 2191 + }, + { + "epoch": 0.21137897782063644, + "grad_norm": 5.715841770172119, + "learning_rate": 4.469197644060244e-05, + "loss": 6.5121, + "step": 2192 + }, + { + "epoch": 0.21147540983606558, + "grad_norm": 5.435705184936523, + "learning_rate": 4.468730945554627e-05, + "loss": 6.5509, + "step": 2193 + }, + { + "epoch": 0.2115718418514947, + "grad_norm": 2.8621392250061035, + "learning_rate": 4.468264066361307e-05, + "loss": 6.4583, + "step": 2194 + }, + { + "epoch": 0.21166827386692383, + "grad_norm": 4.725160598754883, + "learning_rate": 4.467797006523134e-05, + "loss": 6.4577, + "step": 2195 + }, + { + "epoch": 0.21176470588235294, + "grad_norm": 7.736532211303711, + "learning_rate": 4.467329766082974e-05, + "loss": 6.5432, + "step": 2196 + }, + { + "epoch": 0.21186113789778208, + "grad_norm": 3.608785629272461, + "learning_rate": 4.4668623450837085e-05, + "loss": 6.6358, + "step": 2197 + }, + { + "epoch": 0.21195756991321119, + "grad_norm": 3.440523862838745, + "learning_rate": 4.466394743568238e-05, + "loss": 6.6784, + "step": 2198 + }, + { + "epoch": 0.2120540019286403, + "grad_norm": 2.3828771114349365, + "learning_rate": 4.4659269615794785e-05, + "loss": 6.5956, + "step": 2199 + }, + { + "epoch": 0.21215043394406943, + "grad_norm": 3.4563653469085693, + "learning_rate": 4.465458999160362e-05, + "loss": 6.5259, + "step": 2200 + }, + { + "epoch": 0.21224686595949854, + "grad_norm": 4.362351417541504, + "learning_rate": 4.4649908563538375e-05, + "loss": 6.6043, + "step": 2201 + }, + { + "epoch": 0.21234329797492768, + "grad_norm": 4.720169544219971, + "learning_rate": 4.464522533202871e-05, + "loss": 6.6043, + "step": 2202 + }, + { + "epoch": 0.2124397299903568, + "grad_norm": 5.957839488983154, + "learning_rate": 4.464054029750444e-05, + "loss": 6.2643, + "step": 2203 + }, + { + "epoch": 0.21253616200578593, + "grad_norm": 5.625751495361328, + "learning_rate": 4.463585346039555e-05, + "loss": 6.1458, + "step": 2204 + }, + { + "epoch": 0.21263259402121504, + "grad_norm": 4.033935070037842, + "learning_rate": 4.4631164821132206e-05, + "loss": 6.2674, + "step": 2205 + }, + { + "epoch": 0.21272902603664418, + "grad_norm": 5.144409656524658, + "learning_rate": 4.462647438014471e-05, + "loss": 6.3733, + "step": 2206 + }, + { + "epoch": 0.21282545805207329, + "grad_norm": 5.208834648132324, + "learning_rate": 4.462178213786356e-05, + "loss": 6.2602, + "step": 2207 + }, + { + "epoch": 0.21292189006750242, + "grad_norm": 3.8015406131744385, + "learning_rate": 4.4617088094719396e-05, + "loss": 6.3736, + "step": 2208 + }, + { + "epoch": 0.21301832208293153, + "grad_norm": 7.694065570831299, + "learning_rate": 4.461239225114302e-05, + "loss": 6.821, + "step": 2209 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 6.3599395751953125, + "learning_rate": 4.460769460756543e-05, + "loss": 6.5834, + "step": 2210 + }, + { + "epoch": 0.21321118611378978, + "grad_norm": 5.009999752044678, + "learning_rate": 4.460299516441777e-05, + "loss": 6.5403, + "step": 2211 + }, + { + "epoch": 0.2133076181292189, + "grad_norm": 5.098796844482422, + "learning_rate": 4.4598293922131326e-05, + "loss": 6.4168, + "step": 2212 + }, + { + "epoch": 0.21340405014464803, + "grad_norm": 4.7475266456604, + "learning_rate": 4.459359088113759e-05, + "loss": 6.3492, + "step": 2213 + }, + { + "epoch": 0.21350048216007714, + "grad_norm": 6.141800403594971, + "learning_rate": 4.4588886041868206e-05, + "loss": 6.7292, + "step": 2214 + }, + { + "epoch": 0.21359691417550628, + "grad_norm": 7.512917518615723, + "learning_rate": 4.458417940475497e-05, + "loss": 6.3711, + "step": 2215 + }, + { + "epoch": 0.21369334619093538, + "grad_norm": 5.439903736114502, + "learning_rate": 4.4579470970229844e-05, + "loss": 6.5643, + "step": 2216 + }, + { + "epoch": 0.21378977820636452, + "grad_norm": 6.307306289672852, + "learning_rate": 4.457476073872498e-05, + "loss": 6.4964, + "step": 2217 + }, + { + "epoch": 0.21388621022179363, + "grad_norm": 4.973323822021484, + "learning_rate": 4.4570048710672664e-05, + "loss": 6.3349, + "step": 2218 + }, + { + "epoch": 0.21398264223722277, + "grad_norm": 4.639034748077393, + "learning_rate": 4.4565334886505365e-05, + "loss": 6.4839, + "step": 2219 + }, + { + "epoch": 0.21407907425265188, + "grad_norm": 3.508241653442383, + "learning_rate": 4.456061926665571e-05, + "loss": 6.4598, + "step": 2220 + }, + { + "epoch": 0.214175506268081, + "grad_norm": 6.199587821960449, + "learning_rate": 4.45559018515565e-05, + "loss": 6.6385, + "step": 2221 + }, + { + "epoch": 0.21427193828351013, + "grad_norm": 6.0287628173828125, + "learning_rate": 4.4551182641640684e-05, + "loss": 5.8427, + "step": 2222 + }, + { + "epoch": 0.21436837029893924, + "grad_norm": 5.2718915939331055, + "learning_rate": 4.454646163734139e-05, + "loss": 6.3415, + "step": 2223 + }, + { + "epoch": 0.21446480231436837, + "grad_norm": 3.260964870452881, + "learning_rate": 4.4541738839091896e-05, + "loss": 6.2901, + "step": 2224 + }, + { + "epoch": 0.21456123432979748, + "grad_norm": 6.139833927154541, + "learning_rate": 4.453701424732567e-05, + "loss": 6.2366, + "step": 2225 + }, + { + "epoch": 0.21465766634522662, + "grad_norm": 10.250446319580078, + "learning_rate": 4.453228786247632e-05, + "loss": 6.2388, + "step": 2226 + }, + { + "epoch": 0.21475409836065573, + "grad_norm": 4.6312055587768555, + "learning_rate": 4.452755968497763e-05, + "loss": 6.4959, + "step": 2227 + }, + { + "epoch": 0.21485053037608487, + "grad_norm": 3.0164382457733154, + "learning_rate": 4.452282971526355e-05, + "loss": 6.6061, + "step": 2228 + }, + { + "epoch": 0.21494696239151398, + "grad_norm": 4.352289199829102, + "learning_rate": 4.451809795376819e-05, + "loss": 6.5349, + "step": 2229 + }, + { + "epoch": 0.21504339440694312, + "grad_norm": 5.258518218994141, + "learning_rate": 4.4513364400925815e-05, + "loss": 6.4519, + "step": 2230 + }, + { + "epoch": 0.21513982642237223, + "grad_norm": 4.1770548820495605, + "learning_rate": 4.450862905717088e-05, + "loss": 6.3164, + "step": 2231 + }, + { + "epoch": 0.21523625843780134, + "grad_norm": 3.2214925289154053, + "learning_rate": 4.4503891922937977e-05, + "loss": 6.5422, + "step": 2232 + }, + { + "epoch": 0.21533269045323047, + "grad_norm": 4.38609504699707, + "learning_rate": 4.449915299866188e-05, + "loss": 6.5013, + "step": 2233 + }, + { + "epoch": 0.21542912246865958, + "grad_norm": 4.0170698165893555, + "learning_rate": 4.4494412284777523e-05, + "loss": 6.5236, + "step": 2234 + }, + { + "epoch": 0.21552555448408872, + "grad_norm": 7.69564962387085, + "learning_rate": 4.448966978171999e-05, + "loss": 6.3717, + "step": 2235 + }, + { + "epoch": 0.21562198649951783, + "grad_norm": 3.974987268447876, + "learning_rate": 4.4484925489924555e-05, + "loss": 6.8036, + "step": 2236 + }, + { + "epoch": 0.21571841851494697, + "grad_norm": 4.676928520202637, + "learning_rate": 4.448017940982665e-05, + "loss": 6.7219, + "step": 2237 + }, + { + "epoch": 0.21581485053037608, + "grad_norm": 3.9676411151885986, + "learning_rate": 4.4475431541861846e-05, + "loss": 6.6431, + "step": 2238 + }, + { + "epoch": 0.21591128254580522, + "grad_norm": 3.836535692214966, + "learning_rate": 4.447068188646592e-05, + "loss": 6.9291, + "step": 2239 + }, + { + "epoch": 0.21600771456123433, + "grad_norm": 3.082110643386841, + "learning_rate": 4.4465930444074755e-05, + "loss": 6.766, + "step": 2240 + }, + { + "epoch": 0.21610414657666346, + "grad_norm": 2.782043933868408, + "learning_rate": 4.446117721512446e-05, + "loss": 6.6761, + "step": 2241 + }, + { + "epoch": 0.21620057859209257, + "grad_norm": 4.397092819213867, + "learning_rate": 4.4456422200051276e-05, + "loss": 6.3429, + "step": 2242 + }, + { + "epoch": 0.21629701060752168, + "grad_norm": 4.382070541381836, + "learning_rate": 4.445166539929161e-05, + "loss": 6.4, + "step": 2243 + }, + { + "epoch": 0.21639344262295082, + "grad_norm": 4.349655628204346, + "learning_rate": 4.444690681328203e-05, + "loss": 6.5881, + "step": 2244 + }, + { + "epoch": 0.21648987463837993, + "grad_norm": 3.6705164909362793, + "learning_rate": 4.444214644245928e-05, + "loss": 6.3766, + "step": 2245 + }, + { + "epoch": 0.21658630665380907, + "grad_norm": 3.0058844089508057, + "learning_rate": 4.4437384287260266e-05, + "loss": 6.5565, + "step": 2246 + }, + { + "epoch": 0.21668273866923818, + "grad_norm": 2.851795196533203, + "learning_rate": 4.443262034812203e-05, + "loss": 6.5796, + "step": 2247 + }, + { + "epoch": 0.21677917068466732, + "grad_norm": 3.199951648712158, + "learning_rate": 4.4427854625481827e-05, + "loss": 6.6421, + "step": 2248 + }, + { + "epoch": 0.21687560270009643, + "grad_norm": 3.7511210441589355, + "learning_rate": 4.442308711977704e-05, + "loss": 6.4696, + "step": 2249 + }, + { + "epoch": 0.21697203471552556, + "grad_norm": 4.094382286071777, + "learning_rate": 4.4418317831445225e-05, + "loss": 6.1723, + "step": 2250 + }, + { + "epoch": 0.21706846673095467, + "grad_norm": 2.8423967361450195, + "learning_rate": 4.441354676092408e-05, + "loss": 6.5106, + "step": 2251 + }, + { + "epoch": 0.2171648987463838, + "grad_norm": 2.7541773319244385, + "learning_rate": 4.440877390865153e-05, + "loss": 6.5723, + "step": 2252 + }, + { + "epoch": 0.21726133076181292, + "grad_norm": 3.421779155731201, + "learning_rate": 4.440399927506559e-05, + "loss": 6.4911, + "step": 2253 + }, + { + "epoch": 0.21735776277724203, + "grad_norm": 2.6177003383636475, + "learning_rate": 4.439922286060448e-05, + "loss": 6.5017, + "step": 2254 + }, + { + "epoch": 0.21745419479267117, + "grad_norm": 2.3824172019958496, + "learning_rate": 4.4394444665706575e-05, + "loss": 6.4791, + "step": 2255 + }, + { + "epoch": 0.21755062680810028, + "grad_norm": 2.4562182426452637, + "learning_rate": 4.4389664690810406e-05, + "loss": 6.6236, + "step": 2256 + }, + { + "epoch": 0.21764705882352942, + "grad_norm": 2.341773748397827, + "learning_rate": 4.438488293635468e-05, + "loss": 6.5813, + "step": 2257 + }, + { + "epoch": 0.21774349083895853, + "grad_norm": 2.3301117420196533, + "learning_rate": 4.4380099402778244e-05, + "loss": 6.5128, + "step": 2258 + }, + { + "epoch": 0.21783992285438766, + "grad_norm": 2.883662223815918, + "learning_rate": 4.437531409052015e-05, + "loss": 6.56, + "step": 2259 + }, + { + "epoch": 0.21793635486981677, + "grad_norm": 3.476595401763916, + "learning_rate": 4.4370527000019575e-05, + "loss": 6.5526, + "step": 2260 + }, + { + "epoch": 0.2180327868852459, + "grad_norm": 2.2758896350860596, + "learning_rate": 4.436573813171587e-05, + "loss": 6.6482, + "step": 2261 + }, + { + "epoch": 0.21812921890067502, + "grad_norm": 2.6074416637420654, + "learning_rate": 4.436094748604855e-05, + "loss": 6.5058, + "step": 2262 + }, + { + "epoch": 0.21822565091610416, + "grad_norm": 3.3601183891296387, + "learning_rate": 4.4356155063457314e-05, + "loss": 6.5206, + "step": 2263 + }, + { + "epoch": 0.21832208293153327, + "grad_norm": 2.884690523147583, + "learning_rate": 4.435136086438198e-05, + "loss": 6.3906, + "step": 2264 + }, + { + "epoch": 0.21841851494696238, + "grad_norm": 2.8478434085845947, + "learning_rate": 4.434656488926256e-05, + "loss": 6.4916, + "step": 2265 + }, + { + "epoch": 0.21851494696239152, + "grad_norm": 5.088033676147461, + "learning_rate": 4.434176713853924e-05, + "loss": 6.4537, + "step": 2266 + }, + { + "epoch": 0.21861137897782063, + "grad_norm": 3.2982895374298096, + "learning_rate": 4.4336967612652326e-05, + "loss": 5.8461, + "step": 2267 + }, + { + "epoch": 0.21870781099324976, + "grad_norm": 3.478607177734375, + "learning_rate": 4.4332166312042326e-05, + "loss": 6.0832, + "step": 2268 + }, + { + "epoch": 0.21880424300867887, + "grad_norm": 2.6243178844451904, + "learning_rate": 4.4327363237149897e-05, + "loss": 6.0272, + "step": 2269 + }, + { + "epoch": 0.218900675024108, + "grad_norm": 2.480304718017578, + "learning_rate": 4.4322558388415855e-05, + "loss": 6.6292, + "step": 2270 + }, + { + "epoch": 0.21899710703953712, + "grad_norm": 4.383986473083496, + "learning_rate": 4.431775176628119e-05, + "loss": 6.7211, + "step": 2271 + }, + { + "epoch": 0.21909353905496626, + "grad_norm": 3.6277668476104736, + "learning_rate": 4.431294337118705e-05, + "loss": 6.4508, + "step": 2272 + }, + { + "epoch": 0.21918997107039537, + "grad_norm": 2.618450880050659, + "learning_rate": 4.4308133203574733e-05, + "loss": 6.5637, + "step": 2273 + }, + { + "epoch": 0.2192864030858245, + "grad_norm": 3.400524854660034, + "learning_rate": 4.430332126388571e-05, + "loss": 6.6586, + "step": 2274 + }, + { + "epoch": 0.21938283510125361, + "grad_norm": 3.5319981575012207, + "learning_rate": 4.429850755256163e-05, + "loss": 6.6137, + "step": 2275 + }, + { + "epoch": 0.21947926711668275, + "grad_norm": 4.130249977111816, + "learning_rate": 4.429369207004427e-05, + "loss": 6.2644, + "step": 2276 + }, + { + "epoch": 0.21957569913211186, + "grad_norm": 2.8592097759246826, + "learning_rate": 4.42888748167756e-05, + "loss": 6.1533, + "step": 2277 + }, + { + "epoch": 0.21967213114754097, + "grad_norm": 4.8405842781066895, + "learning_rate": 4.4284055793197744e-05, + "loss": 6.5673, + "step": 2278 + }, + { + "epoch": 0.2197685631629701, + "grad_norm": 4.119534492492676, + "learning_rate": 4.4279234999752975e-05, + "loss": 6.5404, + "step": 2279 + }, + { + "epoch": 0.21986499517839922, + "grad_norm": 3.386483907699585, + "learning_rate": 4.427441243688375e-05, + "loss": 6.1677, + "step": 2280 + }, + { + "epoch": 0.21996142719382836, + "grad_norm": 3.6889076232910156, + "learning_rate": 4.4269588105032676e-05, + "loss": 6.2658, + "step": 2281 + }, + { + "epoch": 0.22005785920925747, + "grad_norm": 4.368974685668945, + "learning_rate": 4.4264762004642526e-05, + "loss": 6.3488, + "step": 2282 + }, + { + "epoch": 0.2201542912246866, + "grad_norm": 2.3270821571350098, + "learning_rate": 4.425993413615622e-05, + "loss": 6.6385, + "step": 2283 + }, + { + "epoch": 0.22025072324011571, + "grad_norm": 4.223208904266357, + "learning_rate": 4.425510450001688e-05, + "loss": 6.3711, + "step": 2284 + }, + { + "epoch": 0.22034715525554485, + "grad_norm": 3.818150520324707, + "learning_rate": 4.4250273096667735e-05, + "loss": 6.3951, + "step": 2285 + }, + { + "epoch": 0.22044358727097396, + "grad_norm": 2.1934762001037598, + "learning_rate": 4.4245439926552226e-05, + "loss": 6.0493, + "step": 2286 + }, + { + "epoch": 0.2205400192864031, + "grad_norm": 2.016266107559204, + "learning_rate": 4.424060499011392e-05, + "loss": 5.8509, + "step": 2287 + }, + { + "epoch": 0.2206364513018322, + "grad_norm": 3.431302070617676, + "learning_rate": 4.4235768287796574e-05, + "loss": 6.4621, + "step": 2288 + }, + { + "epoch": 0.22073288331726132, + "grad_norm": 3.6130683422088623, + "learning_rate": 4.4230929820044095e-05, + "loss": 6.6623, + "step": 2289 + }, + { + "epoch": 0.22082931533269046, + "grad_norm": 3.8338701725006104, + "learning_rate": 4.4226089587300536e-05, + "loss": 6.4391, + "step": 2290 + }, + { + "epoch": 0.22092574734811957, + "grad_norm": 2.6365537643432617, + "learning_rate": 4.422124759001015e-05, + "loss": 6.4799, + "step": 2291 + }, + { + "epoch": 0.2210221793635487, + "grad_norm": 3.1774258613586426, + "learning_rate": 4.4216403828617306e-05, + "loss": 6.5742, + "step": 2292 + }, + { + "epoch": 0.22111861137897781, + "grad_norm": 3.0779950618743896, + "learning_rate": 4.421155830356657e-05, + "loss": 6.3642, + "step": 2293 + }, + { + "epoch": 0.22121504339440695, + "grad_norm": 3.2191765308380127, + "learning_rate": 4.420671101530267e-05, + "loss": 6.4715, + "step": 2294 + }, + { + "epoch": 0.22131147540983606, + "grad_norm": 3.283883571624756, + "learning_rate": 4.420186196427046e-05, + "loss": 6.4521, + "step": 2295 + }, + { + "epoch": 0.2214079074252652, + "grad_norm": 3.7789323329925537, + "learning_rate": 4.4197011150915e-05, + "loss": 6.2189, + "step": 2296 + }, + { + "epoch": 0.2215043394406943, + "grad_norm": 3.235183000564575, + "learning_rate": 4.419215857568149e-05, + "loss": 5.5768, + "step": 2297 + }, + { + "epoch": 0.22160077145612345, + "grad_norm": 2.931995391845703, + "learning_rate": 4.418730423901528e-05, + "loss": 5.918, + "step": 2298 + }, + { + "epoch": 0.22169720347155256, + "grad_norm": 3.496951103210449, + "learning_rate": 4.41824481413619e-05, + "loss": 6.2847, + "step": 2299 + }, + { + "epoch": 0.22179363548698167, + "grad_norm": 3.9070165157318115, + "learning_rate": 4.4177590283167046e-05, + "loss": 6.4996, + "step": 2300 + }, + { + "epoch": 0.2218900675024108, + "grad_norm": 4.916171073913574, + "learning_rate": 4.4172730664876554e-05, + "loss": 6.3327, + "step": 2301 + }, + { + "epoch": 0.2219864995178399, + "grad_norm": 3.129147529602051, + "learning_rate": 4.416786928693644e-05, + "loss": 6.5115, + "step": 2302 + }, + { + "epoch": 0.22208293153326905, + "grad_norm": 3.036780834197998, + "learning_rate": 4.416300614979288e-05, + "loss": 6.6861, + "step": 2303 + }, + { + "epoch": 0.22217936354869816, + "grad_norm": 4.0152082443237305, + "learning_rate": 4.4158141253892195e-05, + "loss": 6.01, + "step": 2304 + }, + { + "epoch": 0.2222757955641273, + "grad_norm": 4.66019868850708, + "learning_rate": 4.415327459968089e-05, + "loss": 6.4563, + "step": 2305 + }, + { + "epoch": 0.2223722275795564, + "grad_norm": 3.599527359008789, + "learning_rate": 4.414840618760561e-05, + "loss": 6.4097, + "step": 2306 + }, + { + "epoch": 0.22246865959498555, + "grad_norm": 4.173127174377441, + "learning_rate": 4.414353601811318e-05, + "loss": 6.4293, + "step": 2307 + }, + { + "epoch": 0.22256509161041466, + "grad_norm": 2.829179525375366, + "learning_rate": 4.4138664091650584e-05, + "loss": 6.6219, + "step": 2308 + }, + { + "epoch": 0.2226615236258438, + "grad_norm": 3.1152377128601074, + "learning_rate": 4.413379040866495e-05, + "loss": 6.6585, + "step": 2309 + }, + { + "epoch": 0.2227579556412729, + "grad_norm": 3.3987984657287598, + "learning_rate": 4.412891496960358e-05, + "loss": 6.6509, + "step": 2310 + }, + { + "epoch": 0.222854387656702, + "grad_norm": 2.6921749114990234, + "learning_rate": 4.4124037774913934e-05, + "loss": 6.6767, + "step": 2311 + }, + { + "epoch": 0.22295081967213115, + "grad_norm": 2.1577603816986084, + "learning_rate": 4.411915882504365e-05, + "loss": 6.7029, + "step": 2312 + }, + { + "epoch": 0.22304725168756026, + "grad_norm": 2.748617172241211, + "learning_rate": 4.411427812044049e-05, + "loss": 6.3206, + "step": 2313 + }, + { + "epoch": 0.2231436837029894, + "grad_norm": 2.3996832370758057, + "learning_rate": 4.410939566155241e-05, + "loss": 6.4081, + "step": 2314 + }, + { + "epoch": 0.2232401157184185, + "grad_norm": 3.2171294689178467, + "learning_rate": 4.410451144882753e-05, + "loss": 6.1444, + "step": 2315 + }, + { + "epoch": 0.22333654773384765, + "grad_norm": 2.9111413955688477, + "learning_rate": 4.4099625482714084e-05, + "loss": 6.2742, + "step": 2316 + }, + { + "epoch": 0.22343297974927676, + "grad_norm": 3.1235573291778564, + "learning_rate": 4.409473776366053e-05, + "loss": 6.4657, + "step": 2317 + }, + { + "epoch": 0.2235294117647059, + "grad_norm": 8.391315460205078, + "learning_rate": 4.408984829211544e-05, + "loss": 6.3915, + "step": 2318 + }, + { + "epoch": 0.223625843780135, + "grad_norm": 4.718044757843018, + "learning_rate": 4.408495706852758e-05, + "loss": 6.4265, + "step": 2319 + }, + { + "epoch": 0.22372227579556414, + "grad_norm": 5.4033427238464355, + "learning_rate": 4.408006409334584e-05, + "loss": 6.5458, + "step": 2320 + }, + { + "epoch": 0.22381870781099325, + "grad_norm": 3.007391929626465, + "learning_rate": 4.4075169367019305e-05, + "loss": 6.4689, + "step": 2321 + }, + { + "epoch": 0.22391513982642236, + "grad_norm": 2.7784223556518555, + "learning_rate": 4.4070272889997197e-05, + "loss": 6.4759, + "step": 2322 + }, + { + "epoch": 0.2240115718418515, + "grad_norm": 3.8627867698669434, + "learning_rate": 4.406537466272893e-05, + "loss": 6.5233, + "step": 2323 + }, + { + "epoch": 0.2241080038572806, + "grad_norm": 3.4027717113494873, + "learning_rate": 4.406047468566403e-05, + "loss": 6.4915, + "step": 2324 + }, + { + "epoch": 0.22420443587270975, + "grad_norm": 3.262023448944092, + "learning_rate": 4.4055572959252225e-05, + "loss": 6.2452, + "step": 2325 + }, + { + "epoch": 0.22430086788813886, + "grad_norm": 3.439919948577881, + "learning_rate": 4.405066948394339e-05, + "loss": 6.4958, + "step": 2326 + }, + { + "epoch": 0.224397299903568, + "grad_norm": 3.4627785682678223, + "learning_rate": 4.404576426018755e-05, + "loss": 6.2406, + "step": 2327 + }, + { + "epoch": 0.2244937319189971, + "grad_norm": 3.7868239879608154, + "learning_rate": 4.4040857288434915e-05, + "loss": 6.1983, + "step": 2328 + }, + { + "epoch": 0.22459016393442624, + "grad_norm": 3.480984687805176, + "learning_rate": 4.403594856913583e-05, + "loss": 6.6558, + "step": 2329 + }, + { + "epoch": 0.22468659594985535, + "grad_norm": 2.905184745788574, + "learning_rate": 4.403103810274082e-05, + "loss": 6.463, + "step": 2330 + }, + { + "epoch": 0.2247830279652845, + "grad_norm": 2.7796993255615234, + "learning_rate": 4.4026125889700555e-05, + "loss": 6.6714, + "step": 2331 + }, + { + "epoch": 0.2248794599807136, + "grad_norm": 3.171651840209961, + "learning_rate": 4.402121193046586e-05, + "loss": 6.4854, + "step": 2332 + }, + { + "epoch": 0.2249758919961427, + "grad_norm": 4.057249546051025, + "learning_rate": 4.401629622548776e-05, + "loss": 5.6721, + "step": 2333 + }, + { + "epoch": 0.22507232401157184, + "grad_norm": 2.679816722869873, + "learning_rate": 4.40113787752174e-05, + "loss": 5.7711, + "step": 2334 + }, + { + "epoch": 0.22516875602700095, + "grad_norm": 3.1174824237823486, + "learning_rate": 4.4006459580106087e-05, + "loss": 6.3033, + "step": 2335 + }, + { + "epoch": 0.2252651880424301, + "grad_norm": 3.1693384647369385, + "learning_rate": 4.400153864060531e-05, + "loss": 6.3012, + "step": 2336 + }, + { + "epoch": 0.2253616200578592, + "grad_norm": 4.215686321258545, + "learning_rate": 4.39966159571667e-05, + "loss": 6.5059, + "step": 2337 + }, + { + "epoch": 0.22545805207328834, + "grad_norm": 3.1258997917175293, + "learning_rate": 4.3991691530242066e-05, + "loss": 6.3972, + "step": 2338 + }, + { + "epoch": 0.22555448408871745, + "grad_norm": 2.9048819541931152, + "learning_rate": 4.398676536028335e-05, + "loss": 6.2382, + "step": 2339 + }, + { + "epoch": 0.2256509161041466, + "grad_norm": 3.3641769886016846, + "learning_rate": 4.398183744774268e-05, + "loss": 6.1684, + "step": 2340 + }, + { + "epoch": 0.2257473481195757, + "grad_norm": 4.156621932983398, + "learning_rate": 4.3976907793072335e-05, + "loss": 6.255, + "step": 2341 + }, + { + "epoch": 0.22584378013500483, + "grad_norm": 3.923737049102783, + "learning_rate": 4.397197639672475e-05, + "loss": 6.323, + "step": 2342 + }, + { + "epoch": 0.22594021215043394, + "grad_norm": 3.3872005939483643, + "learning_rate": 4.396704325915252e-05, + "loss": 6.4542, + "step": 2343 + }, + { + "epoch": 0.22603664416586305, + "grad_norm": 3.570361852645874, + "learning_rate": 4.39621083808084e-05, + "loss": 6.5224, + "step": 2344 + }, + { + "epoch": 0.2261330761812922, + "grad_norm": 3.8613929748535156, + "learning_rate": 4.395717176214532e-05, + "loss": 6.4143, + "step": 2345 + }, + { + "epoch": 0.2262295081967213, + "grad_norm": 4.086845874786377, + "learning_rate": 4.395223340361634e-05, + "loss": 6.2939, + "step": 2346 + }, + { + "epoch": 0.22632594021215044, + "grad_norm": 3.0086896419525146, + "learning_rate": 4.3947293305674715e-05, + "loss": 6.2369, + "step": 2347 + }, + { + "epoch": 0.22642237222757955, + "grad_norm": 3.282146453857422, + "learning_rate": 4.3942351468773824e-05, + "loss": 6.2135, + "step": 2348 + }, + { + "epoch": 0.2265188042430087, + "grad_norm": 3.4081432819366455, + "learning_rate": 4.3937407893367225e-05, + "loss": 6.7718, + "step": 2349 + }, + { + "epoch": 0.2266152362584378, + "grad_norm": 3.1670162677764893, + "learning_rate": 4.393246257990865e-05, + "loss": 6.7913, + "step": 2350 + }, + { + "epoch": 0.22671166827386693, + "grad_norm": 3.815126419067383, + "learning_rate": 4.392751552885195e-05, + "loss": 6.8563, + "step": 2351 + }, + { + "epoch": 0.22680810028929604, + "grad_norm": 3.4052586555480957, + "learning_rate": 4.392256674065117e-05, + "loss": 6.2718, + "step": 2352 + }, + { + "epoch": 0.22690453230472518, + "grad_norm": 3.2826709747314453, + "learning_rate": 4.391761621576051e-05, + "loss": 6.5805, + "step": 2353 + }, + { + "epoch": 0.2270009643201543, + "grad_norm": 2.5842981338500977, + "learning_rate": 4.391266395463432e-05, + "loss": 6.6807, + "step": 2354 + }, + { + "epoch": 0.2270973963355834, + "grad_norm": 2.8261008262634277, + "learning_rate": 4.390770995772711e-05, + "loss": 6.4917, + "step": 2355 + }, + { + "epoch": 0.22719382835101254, + "grad_norm": 4.399174213409424, + "learning_rate": 4.390275422549355e-05, + "loss": 6.4245, + "step": 2356 + }, + { + "epoch": 0.22729026036644165, + "grad_norm": 3.8602066040039062, + "learning_rate": 4.389779675838846e-05, + "loss": 6.4793, + "step": 2357 + }, + { + "epoch": 0.2273866923818708, + "grad_norm": 3.7786824703216553, + "learning_rate": 4.389283755686686e-05, + "loss": 6.5567, + "step": 2358 + }, + { + "epoch": 0.2274831243972999, + "grad_norm": 5.2893242835998535, + "learning_rate": 4.388787662138387e-05, + "loss": 6.5763, + "step": 2359 + }, + { + "epoch": 0.22757955641272903, + "grad_norm": 7.8165459632873535, + "learning_rate": 4.388291395239482e-05, + "loss": 6.4492, + "step": 2360 + }, + { + "epoch": 0.22767598842815814, + "grad_norm": 5.760965824127197, + "learning_rate": 4.387794955035517e-05, + "loss": 6.5174, + "step": 2361 + }, + { + "epoch": 0.22777242044358728, + "grad_norm": 3.413902759552002, + "learning_rate": 4.3872983415720537e-05, + "loss": 6.058, + "step": 2362 + }, + { + "epoch": 0.2278688524590164, + "grad_norm": 2.4083638191223145, + "learning_rate": 4.386801554894672e-05, + "loss": 6.4286, + "step": 2363 + }, + { + "epoch": 0.22796528447444553, + "grad_norm": 4.006086349487305, + "learning_rate": 4.386304595048966e-05, + "loss": 6.4459, + "step": 2364 + }, + { + "epoch": 0.22806171648987464, + "grad_norm": 4.226266860961914, + "learning_rate": 4.385807462080546e-05, + "loss": 6.1811, + "step": 2365 + }, + { + "epoch": 0.22815814850530375, + "grad_norm": 3.112678289413452, + "learning_rate": 4.3853101560350375e-05, + "loss": 6.6184, + "step": 2366 + }, + { + "epoch": 0.22825458052073289, + "grad_norm": 4.221651077270508, + "learning_rate": 4.384812676958083e-05, + "loss": 5.8283, + "step": 2367 + }, + { + "epoch": 0.228351012536162, + "grad_norm": 3.7902541160583496, + "learning_rate": 4.384315024895342e-05, + "loss": 6.3453, + "step": 2368 + }, + { + "epoch": 0.22844744455159113, + "grad_norm": 3.7785608768463135, + "learning_rate": 4.383817199892487e-05, + "loss": 6.1778, + "step": 2369 + }, + { + "epoch": 0.22854387656702024, + "grad_norm": 3.354947566986084, + "learning_rate": 4.3833192019952075e-05, + "loss": 6.2194, + "step": 2370 + }, + { + "epoch": 0.22864030858244938, + "grad_norm": 3.7581019401550293, + "learning_rate": 4.38282103124921e-05, + "loss": 6.0914, + "step": 2371 + }, + { + "epoch": 0.2287367405978785, + "grad_norm": 4.816232681274414, + "learning_rate": 4.3823226877002154e-05, + "loss": 6.6162, + "step": 2372 + }, + { + "epoch": 0.22883317261330763, + "grad_norm": 3.377927541732788, + "learning_rate": 4.3818241713939615e-05, + "loss": 6.1992, + "step": 2373 + }, + { + "epoch": 0.22892960462873674, + "grad_norm": 2.291630983352661, + "learning_rate": 4.381325482376201e-05, + "loss": 6.3053, + "step": 2374 + }, + { + "epoch": 0.22902603664416588, + "grad_norm": 3.7891273498535156, + "learning_rate": 4.380826620692704e-05, + "loss": 6.1612, + "step": 2375 + }, + { + "epoch": 0.22912246865959499, + "grad_norm": 3.9082462787628174, + "learning_rate": 4.3803275863892534e-05, + "loss": 6.1446, + "step": 2376 + }, + { + "epoch": 0.2292189006750241, + "grad_norm": 3.6934618949890137, + "learning_rate": 4.3798283795116527e-05, + "loss": 6.4565, + "step": 2377 + }, + { + "epoch": 0.22931533269045323, + "grad_norm": 3.4459922313690186, + "learning_rate": 4.379329000105716e-05, + "loss": 6.2049, + "step": 2378 + }, + { + "epoch": 0.22941176470588234, + "grad_norm": 2.4776570796966553, + "learning_rate": 4.3788294482172766e-05, + "loss": 6.2954, + "step": 2379 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 4.036329746246338, + "learning_rate": 4.378329723892184e-05, + "loss": 6.0793, + "step": 2380 + }, + { + "epoch": 0.2296046287367406, + "grad_norm": 4.187005996704102, + "learning_rate": 4.3778298271762995e-05, + "loss": 6.1935, + "step": 2381 + }, + { + "epoch": 0.22970106075216973, + "grad_norm": 2.5582141876220703, + "learning_rate": 4.377329758115506e-05, + "loss": 5.779, + "step": 2382 + }, + { + "epoch": 0.22979749276759884, + "grad_norm": 3.692258358001709, + "learning_rate": 4.376829516755698e-05, + "loss": 6.7197, + "step": 2383 + }, + { + "epoch": 0.22989392478302798, + "grad_norm": 3.4048027992248535, + "learning_rate": 4.376329103142787e-05, + "loss": 6.3748, + "step": 2384 + }, + { + "epoch": 0.22999035679845709, + "grad_norm": 4.138377666473389, + "learning_rate": 4.3758285173226996e-05, + "loss": 6.2235, + "step": 2385 + }, + { + "epoch": 0.23008678881388622, + "grad_norm": 3.549295425415039, + "learning_rate": 4.37532775934138e-05, + "loss": 6.4579, + "step": 2386 + }, + { + "epoch": 0.23018322082931533, + "grad_norm": 3.453984260559082, + "learning_rate": 4.374826829244786e-05, + "loss": 6.2052, + "step": 2387 + }, + { + "epoch": 0.23027965284474444, + "grad_norm": 2.541659116744995, + "learning_rate": 4.374325727078894e-05, + "loss": 6.0389, + "step": 2388 + }, + { + "epoch": 0.23037608486017358, + "grad_norm": 3.7628278732299805, + "learning_rate": 4.3738244528896935e-05, + "loss": 6.6013, + "step": 2389 + }, + { + "epoch": 0.2304725168756027, + "grad_norm": 4.102034091949463, + "learning_rate": 4.373323006723191e-05, + "loss": 6.5917, + "step": 2390 + }, + { + "epoch": 0.23056894889103183, + "grad_norm": 3.9945781230926514, + "learning_rate": 4.372821388625409e-05, + "loss": 6.422, + "step": 2391 + }, + { + "epoch": 0.23066538090646094, + "grad_norm": 2.740091562271118, + "learning_rate": 4.372319598642385e-05, + "loss": 6.4541, + "step": 2392 + }, + { + "epoch": 0.23076181292189007, + "grad_norm": 2.8329131603240967, + "learning_rate": 4.3718176368201724e-05, + "loss": 6.383, + "step": 2393 + }, + { + "epoch": 0.23085824493731918, + "grad_norm": 3.4842958450317383, + "learning_rate": 4.371315503204841e-05, + "loss": 6.6944, + "step": 2394 + }, + { + "epoch": 0.23095467695274832, + "grad_norm": 3.063483476638794, + "learning_rate": 4.370813197842476e-05, + "loss": 6.4496, + "step": 2395 + }, + { + "epoch": 0.23105110896817743, + "grad_norm": 2.9642302989959717, + "learning_rate": 4.3703107207791776e-05, + "loss": 6.6172, + "step": 2396 + }, + { + "epoch": 0.23114754098360657, + "grad_norm": 2.911972761154175, + "learning_rate": 4.3698080720610643e-05, + "loss": 6.6497, + "step": 2397 + }, + { + "epoch": 0.23124397299903568, + "grad_norm": 3.38628888130188, + "learning_rate": 4.3693052517342664e-05, + "loss": 6.1331, + "step": 2398 + }, + { + "epoch": 0.2313404050144648, + "grad_norm": 4.108190059661865, + "learning_rate": 4.368802259844934e-05, + "loss": 6.5506, + "step": 2399 + }, + { + "epoch": 0.23143683702989393, + "grad_norm": 3.473762273788452, + "learning_rate": 4.3682990964392304e-05, + "loss": 6.3181, + "step": 2400 + }, + { + "epoch": 0.23153326904532304, + "grad_norm": 2.8229315280914307, + "learning_rate": 4.3677957615633344e-05, + "loss": 6.0998, + "step": 2401 + }, + { + "epoch": 0.23162970106075217, + "grad_norm": 3.072849988937378, + "learning_rate": 4.367292255263443e-05, + "loss": 6.2026, + "step": 2402 + }, + { + "epoch": 0.23172613307618128, + "grad_norm": 2.479041337966919, + "learning_rate": 4.3667885775857665e-05, + "loss": 6.2299, + "step": 2403 + }, + { + "epoch": 0.23182256509161042, + "grad_norm": 2.547006368637085, + "learning_rate": 4.366284728576532e-05, + "loss": 6.2572, + "step": 2404 + }, + { + "epoch": 0.23191899710703953, + "grad_norm": 2.944714069366455, + "learning_rate": 4.3657807082819825e-05, + "loss": 6.3799, + "step": 2405 + }, + { + "epoch": 0.23201542912246867, + "grad_norm": 3.008943796157837, + "learning_rate": 4.365276516748376e-05, + "loss": 6.2775, + "step": 2406 + }, + { + "epoch": 0.23211186113789778, + "grad_norm": 3.2817063331604004, + "learning_rate": 4.364772154021986e-05, + "loss": 6.239, + "step": 2407 + }, + { + "epoch": 0.23220829315332692, + "grad_norm": 4.856213092803955, + "learning_rate": 4.364267620149103e-05, + "loss": 6.2405, + "step": 2408 + }, + { + "epoch": 0.23230472516875603, + "grad_norm": 3.6438937187194824, + "learning_rate": 4.363762915176032e-05, + "loss": 6.3811, + "step": 2409 + }, + { + "epoch": 0.23240115718418514, + "grad_norm": 3.4227046966552734, + "learning_rate": 4.363258039149095e-05, + "loss": 6.4846, + "step": 2410 + }, + { + "epoch": 0.23249758919961427, + "grad_norm": 4.258471965789795, + "learning_rate": 4.362752992114629e-05, + "loss": 6.4839, + "step": 2411 + }, + { + "epoch": 0.23259402121504338, + "grad_norm": 3.3375813961029053, + "learning_rate": 4.362247774118985e-05, + "loss": 6.4177, + "step": 2412 + }, + { + "epoch": 0.23269045323047252, + "grad_norm": 3.339277505874634, + "learning_rate": 4.361742385208534e-05, + "loss": 6.5153, + "step": 2413 + }, + { + "epoch": 0.23278688524590163, + "grad_norm": 2.526686191558838, + "learning_rate": 4.3612368254296565e-05, + "loss": 6.4233, + "step": 2414 + }, + { + "epoch": 0.23288331726133077, + "grad_norm": 2.667335033416748, + "learning_rate": 4.360731094828755e-05, + "loss": 6.4497, + "step": 2415 + }, + { + "epoch": 0.23297974927675988, + "grad_norm": 2.9063587188720703, + "learning_rate": 4.360225193452243e-05, + "loss": 6.5016, + "step": 2416 + }, + { + "epoch": 0.23307618129218902, + "grad_norm": 2.472663402557373, + "learning_rate": 4.3597191213465535e-05, + "loss": 6.5441, + "step": 2417 + }, + { + "epoch": 0.23317261330761813, + "grad_norm": 2.6598684787750244, + "learning_rate": 4.359212878558131e-05, + "loss": 6.2343, + "step": 2418 + }, + { + "epoch": 0.23326904532304726, + "grad_norm": 2.9159133434295654, + "learning_rate": 4.358706465133439e-05, + "loss": 6.2509, + "step": 2419 + }, + { + "epoch": 0.23336547733847637, + "grad_norm": 3.052659511566162, + "learning_rate": 4.3581998811189554e-05, + "loss": 6.1495, + "step": 2420 + }, + { + "epoch": 0.23346190935390548, + "grad_norm": 2.2152912616729736, + "learning_rate": 4.357693126561174e-05, + "loss": 5.8954, + "step": 2421 + }, + { + "epoch": 0.23355834136933462, + "grad_norm": 2.2952966690063477, + "learning_rate": 4.357186201506603e-05, + "loss": 6.2572, + "step": 2422 + }, + { + "epoch": 0.23365477338476373, + "grad_norm": 2.483711004257202, + "learning_rate": 4.35667910600177e-05, + "loss": 6.2838, + "step": 2423 + }, + { + "epoch": 0.23375120540019287, + "grad_norm": 2.9915294647216797, + "learning_rate": 4.3561718400932125e-05, + "loss": 6.257, + "step": 2424 + }, + { + "epoch": 0.23384763741562198, + "grad_norm": 2.842257022857666, + "learning_rate": 4.355664403827489e-05, + "loss": 6.4396, + "step": 2425 + }, + { + "epoch": 0.23394406943105112, + "grad_norm": 2.441336154937744, + "learning_rate": 4.355156797251169e-05, + "loss": 6.5517, + "step": 2426 + }, + { + "epoch": 0.23404050144648023, + "grad_norm": 1.9322004318237305, + "learning_rate": 4.3546490204108425e-05, + "loss": 6.5189, + "step": 2427 + }, + { + "epoch": 0.23413693346190936, + "grad_norm": 3.8704230785369873, + "learning_rate": 4.354141073353112e-05, + "loss": 6.5217, + "step": 2428 + }, + { + "epoch": 0.23423336547733847, + "grad_norm": 3.050701856613159, + "learning_rate": 4.3536329561245946e-05, + "loss": 6.6491, + "step": 2429 + }, + { + "epoch": 0.2343297974927676, + "grad_norm": 2.7428410053253174, + "learning_rate": 4.353124668771927e-05, + "loss": 6.5129, + "step": 2430 + }, + { + "epoch": 0.23442622950819672, + "grad_norm": 3.405979871749878, + "learning_rate": 4.352616211341758e-05, + "loss": 6.4269, + "step": 2431 + }, + { + "epoch": 0.23452266152362583, + "grad_norm": 2.6658451557159424, + "learning_rate": 4.352107583880753e-05, + "loss": 6.3542, + "step": 2432 + }, + { + "epoch": 0.23461909353905497, + "grad_norm": 4.325198173522949, + "learning_rate": 4.3515987864355936e-05, + "loss": 6.4606, + "step": 2433 + }, + { + "epoch": 0.23471552555448408, + "grad_norm": 2.7958297729492188, + "learning_rate": 4.3510898190529767e-05, + "loss": 6.4841, + "step": 2434 + }, + { + "epoch": 0.23481195756991322, + "grad_norm": 2.9648568630218506, + "learning_rate": 4.3505806817796144e-05, + "loss": 6.4324, + "step": 2435 + }, + { + "epoch": 0.23490838958534233, + "grad_norm": 3.56257700920105, + "learning_rate": 4.3500713746622345e-05, + "loss": 6.3722, + "step": 2436 + }, + { + "epoch": 0.23500482160077146, + "grad_norm": 3.5455169677734375, + "learning_rate": 4.349561897747582e-05, + "loss": 6.1272, + "step": 2437 + }, + { + "epoch": 0.23510125361620057, + "grad_norm": 2.826754331588745, + "learning_rate": 4.349052251082413e-05, + "loss": 6.0057, + "step": 2438 + }, + { + "epoch": 0.2351976856316297, + "grad_norm": 3.339639186859131, + "learning_rate": 4.3485424347135055e-05, + "loss": 5.9203, + "step": 2439 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 3.5092380046844482, + "learning_rate": 4.3480324486876486e-05, + "loss": 6.2929, + "step": 2440 + }, + { + "epoch": 0.23539054966248796, + "grad_norm": 2.946491241455078, + "learning_rate": 4.347522293051648e-05, + "loss": 6.3184, + "step": 2441 + }, + { + "epoch": 0.23548698167791707, + "grad_norm": 4.113331317901611, + "learning_rate": 4.347011967852325e-05, + "loss": 6.6174, + "step": 2442 + }, + { + "epoch": 0.23558341369334618, + "grad_norm": 3.715235948562622, + "learning_rate": 4.3465014731365174e-05, + "loss": 6.5823, + "step": 2443 + }, + { + "epoch": 0.23567984570877532, + "grad_norm": 3.8785240650177, + "learning_rate": 4.345990808951077e-05, + "loss": 6.1747, + "step": 2444 + }, + { + "epoch": 0.23577627772420442, + "grad_norm": 5.473963737487793, + "learning_rate": 4.345479975342872e-05, + "loss": 6.3636, + "step": 2445 + }, + { + "epoch": 0.23587270973963356, + "grad_norm": 4.136098861694336, + "learning_rate": 4.3449689723587865e-05, + "loss": 6.4293, + "step": 2446 + }, + { + "epoch": 0.23596914175506267, + "grad_norm": 2.9653573036193848, + "learning_rate": 4.3444578000457195e-05, + "loss": 6.5391, + "step": 2447 + }, + { + "epoch": 0.2360655737704918, + "grad_norm": 1.9727054834365845, + "learning_rate": 4.3439464584505864e-05, + "loss": 6.5262, + "step": 2448 + }, + { + "epoch": 0.23616200578592092, + "grad_norm": 4.5308637619018555, + "learning_rate": 4.3434349476203155e-05, + "loss": 5.8588, + "step": 2449 + }, + { + "epoch": 0.23625843780135006, + "grad_norm": 3.331206798553467, + "learning_rate": 4.342923267601855e-05, + "loss": 6.2365, + "step": 2450 + }, + { + "epoch": 0.23635486981677917, + "grad_norm": 2.7342827320098877, + "learning_rate": 4.342411418442165e-05, + "loss": 6.6409, + "step": 2451 + }, + { + "epoch": 0.2364513018322083, + "grad_norm": 3.5463271141052246, + "learning_rate": 4.3418994001882225e-05, + "loss": 6.5354, + "step": 2452 + }, + { + "epoch": 0.23654773384763741, + "grad_norm": 4.789860725402832, + "learning_rate": 4.34138721288702e-05, + "loss": 6.5236, + "step": 2453 + }, + { + "epoch": 0.23664416586306652, + "grad_norm": 3.5300822257995605, + "learning_rate": 4.340874856585566e-05, + "loss": 6.365, + "step": 2454 + }, + { + "epoch": 0.23674059787849566, + "grad_norm": 3.340303659439087, + "learning_rate": 4.340362331330883e-05, + "loss": 6.4261, + "step": 2455 + }, + { + "epoch": 0.23683702989392477, + "grad_norm": 2.6061015129089355, + "learning_rate": 4.3398496371700104e-05, + "loss": 6.4261, + "step": 2456 + }, + { + "epoch": 0.2369334619093539, + "grad_norm": 6.427543640136719, + "learning_rate": 4.339336774150002e-05, + "loss": 6.3942, + "step": 2457 + }, + { + "epoch": 0.23702989392478302, + "grad_norm": 4.979696273803711, + "learning_rate": 4.338823742317929e-05, + "loss": 6.2708, + "step": 2458 + }, + { + "epoch": 0.23712632594021216, + "grad_norm": 3.024454355239868, + "learning_rate": 4.338310541720877e-05, + "loss": 6.4342, + "step": 2459 + }, + { + "epoch": 0.23722275795564127, + "grad_norm": 2.2102088928222656, + "learning_rate": 4.337797172405945e-05, + "loss": 6.3234, + "step": 2460 + }, + { + "epoch": 0.2373191899710704, + "grad_norm": 6.378385543823242, + "learning_rate": 4.337283634420251e-05, + "loss": 6.5837, + "step": 2461 + }, + { + "epoch": 0.23741562198649951, + "grad_norm": 4.741101264953613, + "learning_rate": 4.3367699278109254e-05, + "loss": 6.5333, + "step": 2462 + }, + { + "epoch": 0.23751205400192865, + "grad_norm": 5.128906726837158, + "learning_rate": 4.336256052625117e-05, + "loss": 6.4262, + "step": 2463 + }, + { + "epoch": 0.23760848601735776, + "grad_norm": 4.032852649688721, + "learning_rate": 4.3357420089099884e-05, + "loss": 6.4107, + "step": 2464 + }, + { + "epoch": 0.23770491803278687, + "grad_norm": 4.3456621170043945, + "learning_rate": 4.3352277967127175e-05, + "loss": 6.3123, + "step": 2465 + }, + { + "epoch": 0.237801350048216, + "grad_norm": 3.082078695297241, + "learning_rate": 4.334713416080498e-05, + "loss": 6.3326, + "step": 2466 + }, + { + "epoch": 0.23789778206364512, + "grad_norm": 5.558025360107422, + "learning_rate": 4.3341988670605396e-05, + "loss": 6.2694, + "step": 2467 + }, + { + "epoch": 0.23799421407907426, + "grad_norm": 5.0994343757629395, + "learning_rate": 4.333684149700067e-05, + "loss": 6.2516, + "step": 2468 + }, + { + "epoch": 0.23809064609450337, + "grad_norm": 8.611335754394531, + "learning_rate": 4.3331692640463196e-05, + "loss": 6.5158, + "step": 2469 + }, + { + "epoch": 0.2381870781099325, + "grad_norm": 4.60468053817749, + "learning_rate": 4.332654210146554e-05, + "loss": 6.168, + "step": 2470 + }, + { + "epoch": 0.23828351012536161, + "grad_norm": 8.996051788330078, + "learning_rate": 4.332138988048039e-05, + "loss": 6.3008, + "step": 2471 + }, + { + "epoch": 0.23837994214079075, + "grad_norm": 8.7940092086792, + "learning_rate": 4.331623597798065e-05, + "loss": 6.3706, + "step": 2472 + }, + { + "epoch": 0.23847637415621986, + "grad_norm": 5.431901454925537, + "learning_rate": 4.3311080394439294e-05, + "loss": 6.3414, + "step": 2473 + }, + { + "epoch": 0.238572806171649, + "grad_norm": 3.6391496658325195, + "learning_rate": 4.330592313032953e-05, + "loss": 6.3817, + "step": 2474 + }, + { + "epoch": 0.2386692381870781, + "grad_norm": 5.076580047607422, + "learning_rate": 4.330076418612467e-05, + "loss": 6.4371, + "step": 2475 + }, + { + "epoch": 0.23876567020250722, + "grad_norm": 8.466988563537598, + "learning_rate": 4.3295603562298195e-05, + "loss": 6.3667, + "step": 2476 + }, + { + "epoch": 0.23886210221793636, + "grad_norm": 4.468249320983887, + "learning_rate": 4.3290441259323745e-05, + "loss": 6.1262, + "step": 2477 + }, + { + "epoch": 0.23895853423336547, + "grad_norm": 2.9675941467285156, + "learning_rate": 4.328527727767511e-05, + "loss": 6.2458, + "step": 2478 + }, + { + "epoch": 0.2390549662487946, + "grad_norm": 4.361372470855713, + "learning_rate": 4.328011161782623e-05, + "loss": 6.468, + "step": 2479 + }, + { + "epoch": 0.2391513982642237, + "grad_norm": 4.541140556335449, + "learning_rate": 4.3274944280251206e-05, + "loss": 6.4199, + "step": 2480 + }, + { + "epoch": 0.23924783027965285, + "grad_norm": 5.535056114196777, + "learning_rate": 4.326977526542428e-05, + "loss": 6.3219, + "step": 2481 + }, + { + "epoch": 0.23934426229508196, + "grad_norm": 6.617415904998779, + "learning_rate": 4.326460457381988e-05, + "loss": 6.4103, + "step": 2482 + }, + { + "epoch": 0.2394406943105111, + "grad_norm": 3.635340929031372, + "learning_rate": 4.325943220591254e-05, + "loss": 6.2572, + "step": 2483 + }, + { + "epoch": 0.2395371263259402, + "grad_norm": 6.6261091232299805, + "learning_rate": 4.3254258162176996e-05, + "loss": 5.9338, + "step": 2484 + }, + { + "epoch": 0.23963355834136935, + "grad_norm": 4.610671520233154, + "learning_rate": 4.3249082443088095e-05, + "loss": 6.3227, + "step": 2485 + }, + { + "epoch": 0.23972999035679846, + "grad_norm": 3.7062325477600098, + "learning_rate": 4.324390504912088e-05, + "loss": 6.2214, + "step": 2486 + }, + { + "epoch": 0.2398264223722276, + "grad_norm": 2.848259925842285, + "learning_rate": 4.3238725980750506e-05, + "loss": 6.3567, + "step": 2487 + }, + { + "epoch": 0.2399228543876567, + "grad_norm": 3.6504673957824707, + "learning_rate": 4.323354523845231e-05, + "loss": 6.3323, + "step": 2488 + }, + { + "epoch": 0.2400192864030858, + "grad_norm": 3.6859524250030518, + "learning_rate": 4.322836282270179e-05, + "loss": 6.5801, + "step": 2489 + }, + { + "epoch": 0.24011571841851495, + "grad_norm": 3.247249126434326, + "learning_rate": 4.322317873397454e-05, + "loss": 6.5922, + "step": 2490 + }, + { + "epoch": 0.24021215043394406, + "grad_norm": 2.995450019836426, + "learning_rate": 4.321799297274639e-05, + "loss": 6.5511, + "step": 2491 + }, + { + "epoch": 0.2403085824493732, + "grad_norm": 3.1514153480529785, + "learning_rate": 4.321280553949326e-05, + "loss": 6.4137, + "step": 2492 + }, + { + "epoch": 0.2404050144648023, + "grad_norm": 3.7935502529144287, + "learning_rate": 4.3207616434691264e-05, + "loss": 6.6692, + "step": 2493 + }, + { + "epoch": 0.24050144648023145, + "grad_norm": 3.4839694499969482, + "learning_rate": 4.3202425658816636e-05, + "loss": 6.3383, + "step": 2494 + }, + { + "epoch": 0.24059787849566056, + "grad_norm": 5.823157787322998, + "learning_rate": 4.319723321234578e-05, + "loss": 6.1101, + "step": 2495 + }, + { + "epoch": 0.2406943105110897, + "grad_norm": 4.542881965637207, + "learning_rate": 4.319203909575527e-05, + "loss": 6.515, + "step": 2496 + }, + { + "epoch": 0.2407907425265188, + "grad_norm": 5.521268844604492, + "learning_rate": 4.3186843309521786e-05, + "loss": 6.6105, + "step": 2497 + }, + { + "epoch": 0.24088717454194794, + "grad_norm": 4.674287796020508, + "learning_rate": 4.3181645854122216e-05, + "loss": 6.6488, + "step": 2498 + }, + { + "epoch": 0.24098360655737705, + "grad_norm": 3.5909717082977295, + "learning_rate": 4.317644673003357e-05, + "loss": 6.4542, + "step": 2499 + }, + { + "epoch": 0.24108003857280616, + "grad_norm": 4.462187767028809, + "learning_rate": 4.3171245937733006e-05, + "loss": 6.4688, + "step": 2500 + }, + { + "epoch": 0.2411764705882353, + "grad_norm": 4.410665512084961, + "learning_rate": 4.316604347769786e-05, + "loss": 6.2044, + "step": 2501 + }, + { + "epoch": 0.2412729026036644, + "grad_norm": 3.980656862258911, + "learning_rate": 4.3160839350405606e-05, + "loss": 6.0717, + "step": 2502 + }, + { + "epoch": 0.24136933461909355, + "grad_norm": 3.8383185863494873, + "learning_rate": 4.315563355633386e-05, + "loss": 6.0557, + "step": 2503 + }, + { + "epoch": 0.24146576663452265, + "grad_norm": 2.4644100666046143, + "learning_rate": 4.315042609596042e-05, + "loss": 6.3023, + "step": 2504 + }, + { + "epoch": 0.2415621986499518, + "grad_norm": 3.183685541152954, + "learning_rate": 4.3145216969763216e-05, + "loss": 6.3616, + "step": 2505 + }, + { + "epoch": 0.2416586306653809, + "grad_norm": 4.512371063232422, + "learning_rate": 4.314000617822032e-05, + "loss": 6.0912, + "step": 2506 + }, + { + "epoch": 0.24175506268081004, + "grad_norm": 2.5915544033050537, + "learning_rate": 4.3134793721809995e-05, + "loss": 6.2117, + "step": 2507 + }, + { + "epoch": 0.24185149469623915, + "grad_norm": 3.2424986362457275, + "learning_rate": 4.312957960101062e-05, + "loss": 6.3687, + "step": 2508 + }, + { + "epoch": 0.2419479267116683, + "grad_norm": 3.2766482830047607, + "learning_rate": 4.312436381630074e-05, + "loss": 6.2802, + "step": 2509 + }, + { + "epoch": 0.2420443587270974, + "grad_norm": 3.1253128051757812, + "learning_rate": 4.3119146368159046e-05, + "loss": 6.323, + "step": 2510 + }, + { + "epoch": 0.2421407907425265, + "grad_norm": 3.3623976707458496, + "learning_rate": 4.311392725706441e-05, + "loss": 6.4781, + "step": 2511 + }, + { + "epoch": 0.24223722275795564, + "grad_norm": 3.2844133377075195, + "learning_rate": 4.310870648349583e-05, + "loss": 6.458, + "step": 2512 + }, + { + "epoch": 0.24233365477338475, + "grad_norm": 3.6644866466522217, + "learning_rate": 4.310348404793245e-05, + "loss": 6.3576, + "step": 2513 + }, + { + "epoch": 0.2424300867888139, + "grad_norm": 2.204587697982788, + "learning_rate": 4.3098259950853594e-05, + "loss": 6.002, + "step": 2514 + }, + { + "epoch": 0.242526518804243, + "grad_norm": 3.1084935665130615, + "learning_rate": 4.3093034192738703e-05, + "loss": 6.2952, + "step": 2515 + }, + { + "epoch": 0.24262295081967214, + "grad_norm": 3.868126392364502, + "learning_rate": 4.308780677406741e-05, + "loss": 6.447, + "step": 2516 + }, + { + "epoch": 0.24271938283510125, + "grad_norm": 2.9150471687316895, + "learning_rate": 4.308257769531947e-05, + "loss": 6.3233, + "step": 2517 + }, + { + "epoch": 0.2428158148505304, + "grad_norm": 2.295461654663086, + "learning_rate": 4.307734695697481e-05, + "loss": 6.589, + "step": 2518 + }, + { + "epoch": 0.2429122468659595, + "grad_norm": 2.3408279418945312, + "learning_rate": 4.307211455951349e-05, + "loss": 6.5371, + "step": 2519 + }, + { + "epoch": 0.24300867888138863, + "grad_norm": 7.011969089508057, + "learning_rate": 4.306688050341575e-05, + "loss": 6.1645, + "step": 2520 + }, + { + "epoch": 0.24310511089681774, + "grad_norm": 4.772875785827637, + "learning_rate": 4.3061644789161943e-05, + "loss": 5.8875, + "step": 2521 + }, + { + "epoch": 0.24320154291224685, + "grad_norm": 3.0984246730804443, + "learning_rate": 4.3056407417232617e-05, + "loss": 5.534, + "step": 2522 + }, + { + "epoch": 0.243297974927676, + "grad_norm": 3.434644937515259, + "learning_rate": 4.3051168388108434e-05, + "loss": 6.0363, + "step": 2523 + }, + { + "epoch": 0.2433944069431051, + "grad_norm": 4.058459281921387, + "learning_rate": 4.304592770227023e-05, + "loss": 6.2337, + "step": 2524 + }, + { + "epoch": 0.24349083895853424, + "grad_norm": 3.682527542114258, + "learning_rate": 4.3040685360198994e-05, + "loss": 6.3035, + "step": 2525 + }, + { + "epoch": 0.24358727097396335, + "grad_norm": 4.116654396057129, + "learning_rate": 4.303544136237587e-05, + "loss": 6.39, + "step": 2526 + }, + { + "epoch": 0.2436837029893925, + "grad_norm": 3.528627872467041, + "learning_rate": 4.303019570928213e-05, + "loss": 6.1765, + "step": 2527 + }, + { + "epoch": 0.2437801350048216, + "grad_norm": 4.167971611022949, + "learning_rate": 4.302494840139922e-05, + "loss": 6.2349, + "step": 2528 + }, + { + "epoch": 0.24387656702025073, + "grad_norm": 5.036700248718262, + "learning_rate": 4.301969943920873e-05, + "loss": 6.2879, + "step": 2529 + }, + { + "epoch": 0.24397299903567984, + "grad_norm": 4.0204620361328125, + "learning_rate": 4.3014448823192396e-05, + "loss": 6.4285, + "step": 2530 + }, + { + "epoch": 0.24406943105110898, + "grad_norm": 2.684178352355957, + "learning_rate": 4.300919655383214e-05, + "loss": 6.065, + "step": 2531 + }, + { + "epoch": 0.2441658630665381, + "grad_norm": 2.6779541969299316, + "learning_rate": 4.300394263160997e-05, + "loss": 6.0849, + "step": 2532 + }, + { + "epoch": 0.2442622950819672, + "grad_norm": 3.2387588024139404, + "learning_rate": 4.2998687057008126e-05, + "loss": 6.1268, + "step": 2533 + }, + { + "epoch": 0.24435872709739634, + "grad_norm": 2.6934468746185303, + "learning_rate": 4.299342983050892e-05, + "loss": 6.2196, + "step": 2534 + }, + { + "epoch": 0.24445515911282545, + "grad_norm": 2.4953784942626953, + "learning_rate": 4.2988170952594876e-05, + "loss": 6.4347, + "step": 2535 + }, + { + "epoch": 0.2445515911282546, + "grad_norm": 3.869065999984741, + "learning_rate": 4.2982910423748634e-05, + "loss": 6.7261, + "step": 2536 + }, + { + "epoch": 0.2446480231436837, + "grad_norm": 3.2702133655548096, + "learning_rate": 4.2977648244453015e-05, + "loss": 6.5411, + "step": 2537 + }, + { + "epoch": 0.24474445515911283, + "grad_norm": 3.026217460632324, + "learning_rate": 4.297238441519097e-05, + "loss": 6.5322, + "step": 2538 + }, + { + "epoch": 0.24484088717454194, + "grad_norm": 3.9936439990997314, + "learning_rate": 4.2967118936445596e-05, + "loss": 6.2595, + "step": 2539 + }, + { + "epoch": 0.24493731918997108, + "grad_norm": 8.055558204650879, + "learning_rate": 4.2961851808700165e-05, + "loss": 6.3958, + "step": 2540 + }, + { + "epoch": 0.2450337512054002, + "grad_norm": 6.547336101531982, + "learning_rate": 4.295658303243808e-05, + "loss": 6.1186, + "step": 2541 + }, + { + "epoch": 0.24513018322082933, + "grad_norm": 3.8323307037353516, + "learning_rate": 4.2951312608142906e-05, + "loss": 6.0925, + "step": 2542 + }, + { + "epoch": 0.24522661523625844, + "grad_norm": 4.677115440368652, + "learning_rate": 4.294604053629836e-05, + "loss": 6.3106, + "step": 2543 + }, + { + "epoch": 0.24532304725168755, + "grad_norm": 7.698746681213379, + "learning_rate": 4.294076681738829e-05, + "loss": 6.32, + "step": 2544 + }, + { + "epoch": 0.24541947926711669, + "grad_norm": 7.921199321746826, + "learning_rate": 4.293549145189674e-05, + "loss": 6.2125, + "step": 2545 + }, + { + "epoch": 0.2455159112825458, + "grad_norm": 3.9489197731018066, + "learning_rate": 4.293021444030784e-05, + "loss": 6.3075, + "step": 2546 + }, + { + "epoch": 0.24561234329797493, + "grad_norm": 3.854572057723999, + "learning_rate": 4.292493578310594e-05, + "loss": 6.3662, + "step": 2547 + }, + { + "epoch": 0.24570877531340404, + "grad_norm": 5.498396873474121, + "learning_rate": 4.2919655480775486e-05, + "loss": 6.3265, + "step": 2548 + }, + { + "epoch": 0.24580520732883318, + "grad_norm": 5.614107131958008, + "learning_rate": 4.291437353380112e-05, + "loss": 6.2773, + "step": 2549 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 3.2648026943206787, + "learning_rate": 4.290908994266759e-05, + "loss": 6.3155, + "step": 2550 + }, + { + "epoch": 0.24599807135969143, + "grad_norm": 2.907696485519409, + "learning_rate": 4.2903804707859835e-05, + "loss": 6.3243, + "step": 2551 + }, + { + "epoch": 0.24609450337512054, + "grad_norm": 2.2572805881500244, + "learning_rate": 4.2898517829862914e-05, + "loss": 6.1637, + "step": 2552 + }, + { + "epoch": 0.24619093539054968, + "grad_norm": 3.769501209259033, + "learning_rate": 4.289322930916206e-05, + "loss": 6.2673, + "step": 2553 + }, + { + "epoch": 0.24628736740597879, + "grad_norm": 2.8473873138427734, + "learning_rate": 4.288793914624264e-05, + "loss": 6.0999, + "step": 2554 + }, + { + "epoch": 0.2463837994214079, + "grad_norm": 2.3658008575439453, + "learning_rate": 4.288264734159019e-05, + "loss": 6.1638, + "step": 2555 + }, + { + "epoch": 0.24648023143683703, + "grad_norm": 3.0956852436065674, + "learning_rate": 4.2877353895690364e-05, + "loss": 6.0462, + "step": 2556 + }, + { + "epoch": 0.24657666345226614, + "grad_norm": 3.8975870609283447, + "learning_rate": 4.287205880902901e-05, + "loss": 6.2513, + "step": 2557 + }, + { + "epoch": 0.24667309546769528, + "grad_norm": 2.9388461112976074, + "learning_rate": 4.286676208209209e-05, + "loss": 6.1129, + "step": 2558 + }, + { + "epoch": 0.2467695274831244, + "grad_norm": 2.7319040298461914, + "learning_rate": 4.286146371536574e-05, + "loss": 5.404, + "step": 2559 + }, + { + "epoch": 0.24686595949855353, + "grad_norm": 3.0193307399749756, + "learning_rate": 4.285616370933623e-05, + "loss": 5.7386, + "step": 2560 + }, + { + "epoch": 0.24696239151398264, + "grad_norm": 2.817427396774292, + "learning_rate": 4.285086206449e-05, + "loss": 6.0571, + "step": 2561 + }, + { + "epoch": 0.24705882352941178, + "grad_norm": 1.980094075202942, + "learning_rate": 4.284555878131361e-05, + "loss": 6.1359, + "step": 2562 + }, + { + "epoch": 0.24715525554484088, + "grad_norm": 3.1679840087890625, + "learning_rate": 4.284025386029381e-05, + "loss": 6.117, + "step": 2563 + }, + { + "epoch": 0.24725168756027002, + "grad_norm": 2.9146134853363037, + "learning_rate": 4.283494730191746e-05, + "loss": 6.0891, + "step": 2564 + }, + { + "epoch": 0.24734811957569913, + "grad_norm": 2.3531837463378906, + "learning_rate": 4.28296391066716e-05, + "loss": 6.3754, + "step": 2565 + }, + { + "epoch": 0.24744455159112824, + "grad_norm": 2.0899553298950195, + "learning_rate": 4.282432927504342e-05, + "loss": 6.1417, + "step": 2566 + }, + { + "epoch": 0.24754098360655738, + "grad_norm": 2.7552669048309326, + "learning_rate": 4.281901780752022e-05, + "loss": 6.1334, + "step": 2567 + }, + { + "epoch": 0.2476374156219865, + "grad_norm": 3.234947443008423, + "learning_rate": 4.28137047045895e-05, + "loss": 6.4015, + "step": 2568 + }, + { + "epoch": 0.24773384763741563, + "grad_norm": 3.2856497764587402, + "learning_rate": 4.2808389966738894e-05, + "loss": 6.4544, + "step": 2569 + }, + { + "epoch": 0.24783027965284474, + "grad_norm": 2.3960251808166504, + "learning_rate": 4.2803073594456175e-05, + "loss": 6.2568, + "step": 2570 + }, + { + "epoch": 0.24792671166827387, + "grad_norm": 3.2969980239868164, + "learning_rate": 4.279775558822927e-05, + "loss": 6.4303, + "step": 2571 + }, + { + "epoch": 0.24802314368370298, + "grad_norm": 2.9245707988739014, + "learning_rate": 4.2792435948546264e-05, + "loss": 6.247, + "step": 2572 + }, + { + "epoch": 0.24811957569913212, + "grad_norm": 2.6668002605438232, + "learning_rate": 4.278711467589538e-05, + "loss": 6.3252, + "step": 2573 + }, + { + "epoch": 0.24821600771456123, + "grad_norm": 2.483442544937134, + "learning_rate": 4.278179177076501e-05, + "loss": 6.4904, + "step": 2574 + }, + { + "epoch": 0.24831243972999037, + "grad_norm": 5.069437026977539, + "learning_rate": 4.277646723364368e-05, + "loss": 6.6486, + "step": 2575 + }, + { + "epoch": 0.24840887174541948, + "grad_norm": 4.192676067352295, + "learning_rate": 4.277114106502006e-05, + "loss": 6.2185, + "step": 2576 + }, + { + "epoch": 0.2485053037608486, + "grad_norm": 3.0957252979278564, + "learning_rate": 4.276581326538299e-05, + "loss": 6.4595, + "step": 2577 + }, + { + "epoch": 0.24860173577627773, + "grad_norm": 4.53892183303833, + "learning_rate": 4.2760483835221444e-05, + "loss": 6.4829, + "step": 2578 + }, + { + "epoch": 0.24869816779170684, + "grad_norm": 2.916905164718628, + "learning_rate": 4.2755152775024544e-05, + "loss": 6.0839, + "step": 2579 + }, + { + "epoch": 0.24879459980713597, + "grad_norm": 4.4698991775512695, + "learning_rate": 4.2749820085281584e-05, + "loss": 6.099, + "step": 2580 + }, + { + "epoch": 0.24889103182256508, + "grad_norm": 5.046087265014648, + "learning_rate": 4.274448576648198e-05, + "loss": 5.9061, + "step": 2581 + }, + { + "epoch": 0.24898746383799422, + "grad_norm": 3.1112897396087646, + "learning_rate": 4.273914981911531e-05, + "loss": 6.2759, + "step": 2582 + }, + { + "epoch": 0.24908389585342333, + "grad_norm": 3.4088315963745117, + "learning_rate": 4.273381224367131e-05, + "loss": 6.2086, + "step": 2583 + }, + { + "epoch": 0.24918032786885247, + "grad_norm": 3.8974761962890625, + "learning_rate": 4.2728473040639837e-05, + "loss": 6.2651, + "step": 2584 + }, + { + "epoch": 0.24927675988428158, + "grad_norm": 3.1163723468780518, + "learning_rate": 4.272313221051094e-05, + "loss": 6.3437, + "step": 2585 + }, + { + "epoch": 0.24937319189971072, + "grad_norm": 2.316321611404419, + "learning_rate": 4.271778975377478e-05, + "loss": 6.1711, + "step": 2586 + }, + { + "epoch": 0.24946962391513983, + "grad_norm": 3.1361160278320312, + "learning_rate": 4.2712445670921676e-05, + "loss": 6.1028, + "step": 2587 + }, + { + "epoch": 0.24956605593056894, + "grad_norm": 2.7497572898864746, + "learning_rate": 4.270709996244212e-05, + "loss": 6.2386, + "step": 2588 + }, + { + "epoch": 0.24966248794599807, + "grad_norm": 1.8555560111999512, + "learning_rate": 4.2701752628826717e-05, + "loss": 6.2555, + "step": 2589 + }, + { + "epoch": 0.24975891996142718, + "grad_norm": 2.2397756576538086, + "learning_rate": 4.2696403670566245e-05, + "loss": 6.1187, + "step": 2590 + }, + { + "epoch": 0.24985535197685632, + "grad_norm": 2.8257827758789062, + "learning_rate": 4.269105308815162e-05, + "loss": 6.378, + "step": 2591 + }, + { + "epoch": 0.24995178399228543, + "grad_norm": 2.8321378231048584, + "learning_rate": 4.2685700882073925e-05, + "loss": 6.3352, + "step": 2592 + }, + { + "epoch": 0.25004821600771454, + "grad_norm": 2.3920438289642334, + "learning_rate": 4.268034705282437e-05, + "loss": 6.3257, + "step": 2593 + }, + { + "epoch": 0.2501446480231437, + "grad_norm": 1.9947044849395752, + "learning_rate": 4.267499160089432e-05, + "loss": 6.249, + "step": 2594 + }, + { + "epoch": 0.2502410800385728, + "grad_norm": 2.6631059646606445, + "learning_rate": 4.26696345267753e-05, + "loss": 6.3894, + "step": 2595 + }, + { + "epoch": 0.25033751205400195, + "grad_norm": 3.800062894821167, + "learning_rate": 4.266427583095897e-05, + "loss": 6.285, + "step": 2596 + }, + { + "epoch": 0.25043394406943104, + "grad_norm": 3.0871880054473877, + "learning_rate": 4.265891551393714e-05, + "loss": 6.0574, + "step": 2597 + }, + { + "epoch": 0.2505303760848602, + "grad_norm": 3.3304595947265625, + "learning_rate": 4.265355357620178e-05, + "loss": 5.979, + "step": 2598 + }, + { + "epoch": 0.2506268081002893, + "grad_norm": 2.4138550758361816, + "learning_rate": 4.2648190018245004e-05, + "loss": 5.8523, + "step": 2599 + }, + { + "epoch": 0.2507232401157184, + "grad_norm": 3.1784863471984863, + "learning_rate": 4.264282484055907e-05, + "loss": 5.7945, + "step": 2600 + }, + { + "epoch": 0.25081967213114753, + "grad_norm": 3.1307246685028076, + "learning_rate": 4.263745804363638e-05, + "loss": 6.0253, + "step": 2601 + }, + { + "epoch": 0.25091610414657667, + "grad_norm": 2.6479997634887695, + "learning_rate": 4.263208962796951e-05, + "loss": 6.3535, + "step": 2602 + }, + { + "epoch": 0.2510125361620058, + "grad_norm": 3.616344451904297, + "learning_rate": 4.262671959405114e-05, + "loss": 6.1095, + "step": 2603 + }, + { + "epoch": 0.2511089681774349, + "grad_norm": 3.9931294918060303, + "learning_rate": 4.2621347942374156e-05, + "loss": 6.1538, + "step": 2604 + }, + { + "epoch": 0.251205400192864, + "grad_norm": 3.5847373008728027, + "learning_rate": 4.261597467343154e-05, + "loss": 6.2225, + "step": 2605 + }, + { + "epoch": 0.25130183220829316, + "grad_norm": 2.829317569732666, + "learning_rate": 4.2610599787716455e-05, + "loss": 6.3284, + "step": 2606 + }, + { + "epoch": 0.2513982642237223, + "grad_norm": 3.3168649673461914, + "learning_rate": 4.260522328572218e-05, + "loss": 6.1079, + "step": 2607 + }, + { + "epoch": 0.2514946962391514, + "grad_norm": 3.8287715911865234, + "learning_rate": 4.25998451679422e-05, + "loss": 6.2271, + "step": 2608 + }, + { + "epoch": 0.2515911282545805, + "grad_norm": 2.57678484916687, + "learning_rate": 4.259446543487009e-05, + "loss": 6.1643, + "step": 2609 + }, + { + "epoch": 0.25168756027000966, + "grad_norm": 2.1102592945098877, + "learning_rate": 4.2589084086999586e-05, + "loss": 6.2874, + "step": 2610 + }, + { + "epoch": 0.25178399228543874, + "grad_norm": 2.791104555130005, + "learning_rate": 4.2583701124824596e-05, + "loss": 6.0248, + "step": 2611 + }, + { + "epoch": 0.2518804243008679, + "grad_norm": 2.3070778846740723, + "learning_rate": 4.257831654883917e-05, + "loss": 6.0045, + "step": 2612 + }, + { + "epoch": 0.251976856316297, + "grad_norm": 2.039466142654419, + "learning_rate": 4.2572930359537475e-05, + "loss": 6.2365, + "step": 2613 + }, + { + "epoch": 0.25207328833172615, + "grad_norm": 2.2642529010772705, + "learning_rate": 4.256754255741387e-05, + "loss": 6.4118, + "step": 2614 + }, + { + "epoch": 0.25216972034715524, + "grad_norm": 3.3164021968841553, + "learning_rate": 4.256215314296282e-05, + "loss": 6.5213, + "step": 2615 + }, + { + "epoch": 0.2522661523625844, + "grad_norm": 3.421574354171753, + "learning_rate": 4.255676211667898e-05, + "loss": 6.4928, + "step": 2616 + }, + { + "epoch": 0.2523625843780135, + "grad_norm": 3.897257089614868, + "learning_rate": 4.255136947905711e-05, + "loss": 6.4801, + "step": 2617 + }, + { + "epoch": 0.25245901639344265, + "grad_norm": 4.194745063781738, + "learning_rate": 4.2545975230592164e-05, + "loss": 6.4381, + "step": 2618 + }, + { + "epoch": 0.25255544840887173, + "grad_norm": 3.4859063625335693, + "learning_rate": 4.25405793717792e-05, + "loss": 6.3792, + "step": 2619 + }, + { + "epoch": 0.25265188042430087, + "grad_norm": 3.8383214473724365, + "learning_rate": 4.2535181903113454e-05, + "loss": 6.0758, + "step": 2620 + }, + { + "epoch": 0.25274831243973, + "grad_norm": 3.3463032245635986, + "learning_rate": 4.252978282509029e-05, + "loss": 5.8458, + "step": 2621 + }, + { + "epoch": 0.2528447444551591, + "grad_norm": 4.978990077972412, + "learning_rate": 4.252438213820523e-05, + "loss": 6.3349, + "step": 2622 + }, + { + "epoch": 0.2529411764705882, + "grad_norm": 3.7660839557647705, + "learning_rate": 4.251897984295397e-05, + "loss": 6.3438, + "step": 2623 + }, + { + "epoch": 0.25303760848601736, + "grad_norm": 3.266092538833618, + "learning_rate": 4.2513575939832275e-05, + "loss": 6.3807, + "step": 2624 + }, + { + "epoch": 0.2531340405014465, + "grad_norm": 3.1613640785217285, + "learning_rate": 4.2508170429336144e-05, + "loss": 6.2571, + "step": 2625 + }, + { + "epoch": 0.2532304725168756, + "grad_norm": 3.668449878692627, + "learning_rate": 4.250276331196169e-05, + "loss": 6.1051, + "step": 2626 + }, + { + "epoch": 0.2533269045323047, + "grad_norm": 5.5806169509887695, + "learning_rate": 4.249735458820515e-05, + "loss": 5.8513, + "step": 2627 + }, + { + "epoch": 0.25342333654773386, + "grad_norm": 5.101857662200928, + "learning_rate": 4.2491944258562946e-05, + "loss": 6.2132, + "step": 2628 + }, + { + "epoch": 0.253519768563163, + "grad_norm": 3.800794839859009, + "learning_rate": 4.248653232353162e-05, + "loss": 6.409, + "step": 2629 + }, + { + "epoch": 0.2536162005785921, + "grad_norm": 3.6170082092285156, + "learning_rate": 4.248111878360789e-05, + "loss": 6.397, + "step": 2630 + }, + { + "epoch": 0.2537126325940212, + "grad_norm": 4.575811386108398, + "learning_rate": 4.2475703639288595e-05, + "loss": 5.591, + "step": 2631 + }, + { + "epoch": 0.25380906460945035, + "grad_norm": 4.24979305267334, + "learning_rate": 4.247028689107072e-05, + "loss": 6.1799, + "step": 2632 + }, + { + "epoch": 0.25390549662487943, + "grad_norm": 3.7969889640808105, + "learning_rate": 4.2464868539451425e-05, + "loss": 6.2597, + "step": 2633 + }, + { + "epoch": 0.25400192864030857, + "grad_norm": 5.035332679748535, + "learning_rate": 4.2459448584927994e-05, + "loss": 5.8986, + "step": 2634 + }, + { + "epoch": 0.2540983606557377, + "grad_norm": 4.767864227294922, + "learning_rate": 4.245402702799785e-05, + "loss": 6.2406, + "step": 2635 + }, + { + "epoch": 0.25419479267116685, + "grad_norm": 3.9456636905670166, + "learning_rate": 4.2448603869158587e-05, + "loss": 6.4261, + "step": 2636 + }, + { + "epoch": 0.25429122468659593, + "grad_norm": 4.573615550994873, + "learning_rate": 4.244317910890795e-05, + "loss": 6.1051, + "step": 2637 + }, + { + "epoch": 0.25438765670202507, + "grad_norm": 3.7944023609161377, + "learning_rate": 4.243775274774379e-05, + "loss": 6.3443, + "step": 2638 + }, + { + "epoch": 0.2544840887174542, + "grad_norm": 3.1435484886169434, + "learning_rate": 4.243232478616415e-05, + "loss": 6.3791, + "step": 2639 + }, + { + "epoch": 0.25458052073288334, + "grad_norm": 4.7915849685668945, + "learning_rate": 4.242689522466719e-05, + "loss": 6.4, + "step": 2640 + }, + { + "epoch": 0.2546769527483124, + "grad_norm": 2.9969096183776855, + "learning_rate": 4.2421464063751245e-05, + "loss": 6.2608, + "step": 2641 + }, + { + "epoch": 0.25477338476374156, + "grad_norm": 3.416996717453003, + "learning_rate": 4.241603130391477e-05, + "loss": 6.0756, + "step": 2642 + }, + { + "epoch": 0.2548698167791707, + "grad_norm": 3.8557159900665283, + "learning_rate": 4.241059694565638e-05, + "loss": 5.6314, + "step": 2643 + }, + { + "epoch": 0.2549662487945998, + "grad_norm": 4.262914180755615, + "learning_rate": 4.240516098947482e-05, + "loss": 5.7695, + "step": 2644 + }, + { + "epoch": 0.2550626808100289, + "grad_norm": 5.618409633636475, + "learning_rate": 4.239972343586902e-05, + "loss": 6.455, + "step": 2645 + }, + { + "epoch": 0.25515911282545806, + "grad_norm": 3.5418972969055176, + "learning_rate": 4.2394284285338015e-05, + "loss": 6.4708, + "step": 2646 + }, + { + "epoch": 0.2552555448408872, + "grad_norm": 4.3428826332092285, + "learning_rate": 4.2388843538381006e-05, + "loss": 6.3279, + "step": 2647 + }, + { + "epoch": 0.2553519768563163, + "grad_norm": 7.32571268081665, + "learning_rate": 4.238340119549733e-05, + "loss": 5.8729, + "step": 2648 + }, + { + "epoch": 0.2554484088717454, + "grad_norm": 4.56308126449585, + "learning_rate": 4.23779572571865e-05, + "loss": 6.1107, + "step": 2649 + }, + { + "epoch": 0.25554484088717455, + "grad_norm": 3.8891353607177734, + "learning_rate": 4.237251172394814e-05, + "loss": 5.6918, + "step": 2650 + }, + { + "epoch": 0.2556412729026037, + "grad_norm": 4.670422077178955, + "learning_rate": 4.236706459628203e-05, + "loss": 5.982, + "step": 2651 + }, + { + "epoch": 0.25573770491803277, + "grad_norm": 5.14467716217041, + "learning_rate": 4.236161587468811e-05, + "loss": 6.1295, + "step": 2652 + }, + { + "epoch": 0.2558341369334619, + "grad_norm": 7.6999592781066895, + "learning_rate": 4.235616555966645e-05, + "loss": 6.1926, + "step": 2653 + }, + { + "epoch": 0.25593056894889105, + "grad_norm": 5.869077682495117, + "learning_rate": 4.235071365171728e-05, + "loss": 6.1607, + "step": 2654 + }, + { + "epoch": 0.25602700096432013, + "grad_norm": 6.2603888511657715, + "learning_rate": 4.234526015134097e-05, + "loss": 6.3366, + "step": 2655 + }, + { + "epoch": 0.25612343297974927, + "grad_norm": 8.295364379882812, + "learning_rate": 4.233980505903803e-05, + "loss": 6.2381, + "step": 2656 + }, + { + "epoch": 0.2562198649951784, + "grad_norm": 6.257622718811035, + "learning_rate": 4.233434837530912e-05, + "loss": 6.3958, + "step": 2657 + }, + { + "epoch": 0.25631629701060754, + "grad_norm": 6.711035251617432, + "learning_rate": 4.232889010065505e-05, + "loss": 6.5224, + "step": 2658 + }, + { + "epoch": 0.2564127290260366, + "grad_norm": 5.385359764099121, + "learning_rate": 4.232343023557678e-05, + "loss": 6.525, + "step": 2659 + }, + { + "epoch": 0.25650916104146576, + "grad_norm": 6.348348140716553, + "learning_rate": 4.231796878057541e-05, + "loss": 6.3265, + "step": 2660 + }, + { + "epoch": 0.2566055930568949, + "grad_norm": 5.206521034240723, + "learning_rate": 4.231250573615217e-05, + "loss": 6.054, + "step": 2661 + }, + { + "epoch": 0.25670202507232404, + "grad_norm": 5.283237934112549, + "learning_rate": 4.230704110280847e-05, + "loss": 6.0453, + "step": 2662 + }, + { + "epoch": 0.2567984570877531, + "grad_norm": 7.028003692626953, + "learning_rate": 4.230157488104583e-05, + "loss": 6.0868, + "step": 2663 + }, + { + "epoch": 0.25689488910318226, + "grad_norm": 6.638134479522705, + "learning_rate": 4.229610707136595e-05, + "loss": 6.2236, + "step": 2664 + }, + { + "epoch": 0.2569913211186114, + "grad_norm": 4.003936767578125, + "learning_rate": 4.229063767427064e-05, + "loss": 6.0594, + "step": 2665 + }, + { + "epoch": 0.2570877531340405, + "grad_norm": 4.130794048309326, + "learning_rate": 4.2285166690261894e-05, + "loss": 6.2806, + "step": 2666 + }, + { + "epoch": 0.2571841851494696, + "grad_norm": 4.44627571105957, + "learning_rate": 4.227969411984183e-05, + "loss": 6.1267, + "step": 2667 + }, + { + "epoch": 0.25728061716489875, + "grad_norm": 3.9592349529266357, + "learning_rate": 4.22742199635127e-05, + "loss": 6.2619, + "step": 2668 + }, + { + "epoch": 0.2573770491803279, + "grad_norm": 2.3213484287261963, + "learning_rate": 4.226874422177692e-05, + "loss": 6.1929, + "step": 2669 + }, + { + "epoch": 0.25747348119575697, + "grad_norm": 4.133720397949219, + "learning_rate": 4.226326689513705e-05, + "loss": 6.1609, + "step": 2670 + }, + { + "epoch": 0.2575699132111861, + "grad_norm": 3.863013744354248, + "learning_rate": 4.22577879840958e-05, + "loss": 6.2643, + "step": 2671 + }, + { + "epoch": 0.25766634522661525, + "grad_norm": 2.8901400566101074, + "learning_rate": 4.2252307489156006e-05, + "loss": 6.2914, + "step": 2672 + }, + { + "epoch": 0.2577627772420444, + "grad_norm": 3.153256893157959, + "learning_rate": 4.2246825410820654e-05, + "loss": 6.0384, + "step": 2673 + }, + { + "epoch": 0.25785920925747347, + "grad_norm": 2.875628709793091, + "learning_rate": 4.224134174959291e-05, + "loss": 6.0638, + "step": 2674 + }, + { + "epoch": 0.2579556412729026, + "grad_norm": 4.34543514251709, + "learning_rate": 4.223585650597603e-05, + "loss": 6.3429, + "step": 2675 + }, + { + "epoch": 0.25805207328833174, + "grad_norm": 3.7597951889038086, + "learning_rate": 4.2230369680473456e-05, + "loss": 6.3718, + "step": 2676 + }, + { + "epoch": 0.2581485053037608, + "grad_norm": 2.7526326179504395, + "learning_rate": 4.222488127358876e-05, + "loss": 6.3296, + "step": 2677 + }, + { + "epoch": 0.25824493731918996, + "grad_norm": 2.9276134967803955, + "learning_rate": 4.2219391285825664e-05, + "loss": 6.2904, + "step": 2678 + }, + { + "epoch": 0.2583413693346191, + "grad_norm": 2.416642189025879, + "learning_rate": 4.221389971768803e-05, + "loss": 6.463, + "step": 2679 + }, + { + "epoch": 0.25843780135004824, + "grad_norm": 2.3717706203460693, + "learning_rate": 4.220840656967986e-05, + "loss": 6.22, + "step": 2680 + }, + { + "epoch": 0.2585342333654773, + "grad_norm": 2.860429048538208, + "learning_rate": 4.220291184230532e-05, + "loss": 6.2057, + "step": 2681 + }, + { + "epoch": 0.25863066538090645, + "grad_norm": 1.9467512369155884, + "learning_rate": 4.2197415536068705e-05, + "loss": 5.8346, + "step": 2682 + }, + { + "epoch": 0.2587270973963356, + "grad_norm": 3.7727866172790527, + "learning_rate": 4.219191765147446e-05, + "loss": 5.7953, + "step": 2683 + }, + { + "epoch": 0.25882352941176473, + "grad_norm": 5.114905834197998, + "learning_rate": 4.218641818902717e-05, + "loss": 6.1289, + "step": 2684 + }, + { + "epoch": 0.2589199614271938, + "grad_norm": 4.408492088317871, + "learning_rate": 4.218091714923157e-05, + "loss": 5.9115, + "step": 2685 + }, + { + "epoch": 0.25901639344262295, + "grad_norm": 3.7936058044433594, + "learning_rate": 4.2175414532592546e-05, + "loss": 5.953, + "step": 2686 + }, + { + "epoch": 0.2591128254580521, + "grad_norm": 2.586876392364502, + "learning_rate": 4.216991033961511e-05, + "loss": 5.9541, + "step": 2687 + }, + { + "epoch": 0.25920925747348117, + "grad_norm": 3.596069097518921, + "learning_rate": 4.216440457080444e-05, + "loss": 6.0411, + "step": 2688 + }, + { + "epoch": 0.2593056894889103, + "grad_norm": 4.6695451736450195, + "learning_rate": 4.215889722666584e-05, + "loss": 6.1282, + "step": 2689 + }, + { + "epoch": 0.25940212150433944, + "grad_norm": 3.6417012214660645, + "learning_rate": 4.215338830770477e-05, + "loss": 6.166, + "step": 2690 + }, + { + "epoch": 0.2594985535197686, + "grad_norm": 2.5034282207489014, + "learning_rate": 4.214787781442684e-05, + "loss": 6.2856, + "step": 2691 + }, + { + "epoch": 0.25959498553519766, + "grad_norm": 4.0593390464782715, + "learning_rate": 4.214236574733779e-05, + "loss": 6.2301, + "step": 2692 + }, + { + "epoch": 0.2596914175506268, + "grad_norm": 3.493927001953125, + "learning_rate": 4.21368521069435e-05, + "loss": 6.1297, + "step": 2693 + }, + { + "epoch": 0.25978784956605594, + "grad_norm": 2.2333805561065674, + "learning_rate": 4.2131336893750026e-05, + "loss": 6.2592, + "step": 2694 + }, + { + "epoch": 0.2598842815814851, + "grad_norm": 3.4195337295532227, + "learning_rate": 4.2125820108263534e-05, + "loss": 6.4449, + "step": 2695 + }, + { + "epoch": 0.25998071359691416, + "grad_norm": 4.093400478363037, + "learning_rate": 4.2120301750990355e-05, + "loss": 6.5865, + "step": 2696 + }, + { + "epoch": 0.2600771456123433, + "grad_norm": 2.3434927463531494, + "learning_rate": 4.211478182243695e-05, + "loss": 6.4211, + "step": 2697 + }, + { + "epoch": 0.26017357762777243, + "grad_norm": 2.796804666519165, + "learning_rate": 4.210926032310993e-05, + "loss": 6.4622, + "step": 2698 + }, + { + "epoch": 0.2602700096432015, + "grad_norm": 4.198972225189209, + "learning_rate": 4.210373725351606e-05, + "loss": 6.2723, + "step": 2699 + }, + { + "epoch": 0.26036644165863065, + "grad_norm": 4.191365718841553, + "learning_rate": 4.2098212614162234e-05, + "loss": 6.2032, + "step": 2700 + }, + { + "epoch": 0.2604628736740598, + "grad_norm": 4.732014179229736, + "learning_rate": 4.2092686405555505e-05, + "loss": 6.104, + "step": 2701 + }, + { + "epoch": 0.26055930568948893, + "grad_norm": 3.5281951427459717, + "learning_rate": 4.208715862820305e-05, + "loss": 6.0059, + "step": 2702 + }, + { + "epoch": 0.260655737704918, + "grad_norm": 4.0133867263793945, + "learning_rate": 4.2081629282612204e-05, + "loss": 6.2388, + "step": 2703 + }, + { + "epoch": 0.26075216972034715, + "grad_norm": 3.2230167388916016, + "learning_rate": 4.207609836929045e-05, + "loss": 6.2036, + "step": 2704 + }, + { + "epoch": 0.2608486017357763, + "grad_norm": 3.9064438343048096, + "learning_rate": 4.2070565888745404e-05, + "loss": 6.2155, + "step": 2705 + }, + { + "epoch": 0.2609450337512054, + "grad_norm": 4.106882095336914, + "learning_rate": 4.206503184148484e-05, + "loss": 6.133, + "step": 2706 + }, + { + "epoch": 0.2610414657666345, + "grad_norm": 2.705230474472046, + "learning_rate": 4.205949622801664e-05, + "loss": 6.3068, + "step": 2707 + }, + { + "epoch": 0.26113789778206364, + "grad_norm": 4.307644367218018, + "learning_rate": 4.205395904884889e-05, + "loss": 6.2232, + "step": 2708 + }, + { + "epoch": 0.2612343297974928, + "grad_norm": 3.49135684967041, + "learning_rate": 4.2048420304489754e-05, + "loss": 5.8593, + "step": 2709 + }, + { + "epoch": 0.26133076181292186, + "grad_norm": 3.212977409362793, + "learning_rate": 4.204287999544759e-05, + "loss": 6.0647, + "step": 2710 + }, + { + "epoch": 0.261427193828351, + "grad_norm": 2.962817668914795, + "learning_rate": 4.203733812223088e-05, + "loss": 5.7101, + "step": 2711 + }, + { + "epoch": 0.26152362584378014, + "grad_norm": 3.9159433841705322, + "learning_rate": 4.2031794685348245e-05, + "loss": 6.3343, + "step": 2712 + }, + { + "epoch": 0.2616200578592093, + "grad_norm": 5.022986888885498, + "learning_rate": 4.202624968530846e-05, + "loss": 6.2024, + "step": 2713 + }, + { + "epoch": 0.26171648987463836, + "grad_norm": 3.995145559310913, + "learning_rate": 4.202070312262043e-05, + "loss": 6.1878, + "step": 2714 + }, + { + "epoch": 0.2618129218900675, + "grad_norm": 4.44966459274292, + "learning_rate": 4.201515499779322e-05, + "loss": 6.1459, + "step": 2715 + }, + { + "epoch": 0.26190935390549663, + "grad_norm": 4.748875141143799, + "learning_rate": 4.2009605311336025e-05, + "loss": 6.1888, + "step": 2716 + }, + { + "epoch": 0.26200578592092577, + "grad_norm": 3.124241352081299, + "learning_rate": 4.200405406375819e-05, + "loss": 6.2204, + "step": 2717 + }, + { + "epoch": 0.26210221793635485, + "grad_norm": 1.9426532983779907, + "learning_rate": 4.19985012555692e-05, + "loss": 6.0937, + "step": 2718 + }, + { + "epoch": 0.262198649951784, + "grad_norm": 3.925856828689575, + "learning_rate": 4.199294688727869e-05, + "loss": 6.1263, + "step": 2719 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 3.061558961868286, + "learning_rate": 4.1987390959396425e-05, + "loss": 6.2824, + "step": 2720 + }, + { + "epoch": 0.2623915139826422, + "grad_norm": 2.860872268676758, + "learning_rate": 4.198183347243233e-05, + "loss": 5.4982, + "step": 2721 + }, + { + "epoch": 0.26248794599807135, + "grad_norm": 2.889092206954956, + "learning_rate": 4.197627442689646e-05, + "loss": 5.7571, + "step": 2722 + }, + { + "epoch": 0.2625843780135005, + "grad_norm": 2.375014066696167, + "learning_rate": 4.197071382329901e-05, + "loss": 5.6351, + "step": 2723 + }, + { + "epoch": 0.2626808100289296, + "grad_norm": 2.061694383621216, + "learning_rate": 4.196515166215034e-05, + "loss": 5.4942, + "step": 2724 + }, + { + "epoch": 0.2627772420443587, + "grad_norm": 2.176910400390625, + "learning_rate": 4.1959587943960935e-05, + "loss": 5.697, + "step": 2725 + }, + { + "epoch": 0.26287367405978784, + "grad_norm": 2.77319598197937, + "learning_rate": 4.1954022669241415e-05, + "loss": 6.2204, + "step": 2726 + }, + { + "epoch": 0.262970106075217, + "grad_norm": 2.623314380645752, + "learning_rate": 4.194845583850256e-05, + "loss": 6.094, + "step": 2727 + }, + { + "epoch": 0.2630665380906461, + "grad_norm": 3.6177120208740234, + "learning_rate": 4.19428874522553e-05, + "loss": 6.1228, + "step": 2728 + }, + { + "epoch": 0.2631629701060752, + "grad_norm": 3.581498384475708, + "learning_rate": 4.193731751101067e-05, + "loss": 6.0475, + "step": 2729 + }, + { + "epoch": 0.26325940212150434, + "grad_norm": 5.377962112426758, + "learning_rate": 4.1931746015279895e-05, + "loss": 6.0917, + "step": 2730 + }, + { + "epoch": 0.2633558341369335, + "grad_norm": 2.8611855506896973, + "learning_rate": 4.192617296557431e-05, + "loss": 6.1622, + "step": 2731 + }, + { + "epoch": 0.26345226615236256, + "grad_norm": 3.339935779571533, + "learning_rate": 4.192059836240539e-05, + "loss": 6.0957, + "step": 2732 + }, + { + "epoch": 0.2635486981677917, + "grad_norm": 3.574680805206299, + "learning_rate": 4.191502220628479e-05, + "loss": 6.1141, + "step": 2733 + }, + { + "epoch": 0.26364513018322083, + "grad_norm": 3.8642566204071045, + "learning_rate": 4.1909444497724274e-05, + "loss": 6.3677, + "step": 2734 + }, + { + "epoch": 0.26374156219864997, + "grad_norm": 5.4310455322265625, + "learning_rate": 4.190386523723574e-05, + "loss": 6.2731, + "step": 2735 + }, + { + "epoch": 0.26383799421407905, + "grad_norm": 3.5841891765594482, + "learning_rate": 4.189828442533127e-05, + "loss": 5.9703, + "step": 2736 + }, + { + "epoch": 0.2639344262295082, + "grad_norm": 3.314037799835205, + "learning_rate": 4.189270206252305e-05, + "loss": 5.9334, + "step": 2737 + }, + { + "epoch": 0.2640308582449373, + "grad_norm": 4.575343608856201, + "learning_rate": 4.188711814932342e-05, + "loss": 5.6641, + "step": 2738 + }, + { + "epoch": 0.26412729026036647, + "grad_norm": 3.9690513610839844, + "learning_rate": 4.188153268624489e-05, + "loss": 6.009, + "step": 2739 + }, + { + "epoch": 0.26422372227579555, + "grad_norm": 4.795828819274902, + "learning_rate": 4.1875945673800045e-05, + "loss": 6.1683, + "step": 2740 + }, + { + "epoch": 0.2643201542912247, + "grad_norm": 3.866668939590454, + "learning_rate": 4.18703571125017e-05, + "loss": 6.3062, + "step": 2741 + }, + { + "epoch": 0.2644165863066538, + "grad_norm": 2.9760215282440186, + "learning_rate": 4.186476700286272e-05, + "loss": 6.1173, + "step": 2742 + }, + { + "epoch": 0.2645130183220829, + "grad_norm": 3.222210645675659, + "learning_rate": 4.1859175345396196e-05, + "loss": 6.2149, + "step": 2743 + }, + { + "epoch": 0.26460945033751204, + "grad_norm": 3.856815814971924, + "learning_rate": 4.185358214061531e-05, + "loss": 6.1998, + "step": 2744 + }, + { + "epoch": 0.2647058823529412, + "grad_norm": 3.0701427459716797, + "learning_rate": 4.18479873890334e-05, + "loss": 6.2911, + "step": 2745 + }, + { + "epoch": 0.2648023143683703, + "grad_norm": 3.298832654953003, + "learning_rate": 4.184239109116393e-05, + "loss": 6.4482, + "step": 2746 + }, + { + "epoch": 0.2648987463837994, + "grad_norm": 2.853731870651245, + "learning_rate": 4.1836793247520544e-05, + "loss": 6.2473, + "step": 2747 + }, + { + "epoch": 0.26499517839922854, + "grad_norm": 3.1054155826568604, + "learning_rate": 4.1831193858617005e-05, + "loss": 6.2826, + "step": 2748 + }, + { + "epoch": 0.2650916104146577, + "grad_norm": 4.170516490936279, + "learning_rate": 4.1825592924967195e-05, + "loss": 6.1983, + "step": 2749 + }, + { + "epoch": 0.2651880424300868, + "grad_norm": 2.195793867111206, + "learning_rate": 4.181999044708518e-05, + "loss": 6.1389, + "step": 2750 + }, + { + "epoch": 0.2652844744455159, + "grad_norm": 2.5043342113494873, + "learning_rate": 4.181438642548514e-05, + "loss": 6.1188, + "step": 2751 + }, + { + "epoch": 0.26538090646094503, + "grad_norm": 2.5982666015625, + "learning_rate": 4.180878086068142e-05, + "loss": 5.9195, + "step": 2752 + }, + { + "epoch": 0.26547733847637417, + "grad_norm": 4.454935550689697, + "learning_rate": 4.180317375318847e-05, + "loss": 6.2261, + "step": 2753 + }, + { + "epoch": 0.26557377049180325, + "grad_norm": 2.819617986679077, + "learning_rate": 4.179756510352092e-05, + "loss": 5.9278, + "step": 2754 + }, + { + "epoch": 0.2656702025072324, + "grad_norm": 3.732116222381592, + "learning_rate": 4.179195491219353e-05, + "loss": 6.1841, + "step": 2755 + }, + { + "epoch": 0.2657666345226615, + "grad_norm": 3.5175533294677734, + "learning_rate": 4.178634317972117e-05, + "loss": 6.167, + "step": 2756 + }, + { + "epoch": 0.26586306653809066, + "grad_norm": 3.198301076889038, + "learning_rate": 4.17807299066189e-05, + "loss": 6.2054, + "step": 2757 + }, + { + "epoch": 0.26595949855351975, + "grad_norm": 4.276394844055176, + "learning_rate": 4.177511509340188e-05, + "loss": 6.274, + "step": 2758 + }, + { + "epoch": 0.2660559305689489, + "grad_norm": 2.3221898078918457, + "learning_rate": 4.176949874058546e-05, + "loss": 6.303, + "step": 2759 + }, + { + "epoch": 0.266152362584378, + "grad_norm": 3.832326889038086, + "learning_rate": 4.1763880848685075e-05, + "loss": 6.4121, + "step": 2760 + }, + { + "epoch": 0.26624879459980716, + "grad_norm": 4.161049842834473, + "learning_rate": 4.175826141821635e-05, + "loss": 6.3443, + "step": 2761 + }, + { + "epoch": 0.26634522661523624, + "grad_norm": 3.099820375442505, + "learning_rate": 4.175264044969501e-05, + "loss": 6.3703, + "step": 2762 + }, + { + "epoch": 0.2664416586306654, + "grad_norm": 3.5498809814453125, + "learning_rate": 4.174701794363695e-05, + "loss": 6.2423, + "step": 2763 + }, + { + "epoch": 0.2665380906460945, + "grad_norm": 3.773298978805542, + "learning_rate": 4.174139390055819e-05, + "loss": 6.0246, + "step": 2764 + }, + { + "epoch": 0.2666345226615236, + "grad_norm": 3.4046525955200195, + "learning_rate": 4.173576832097491e-05, + "loss": 5.5613, + "step": 2765 + }, + { + "epoch": 0.26673095467695274, + "grad_norm": 2.488389015197754, + "learning_rate": 4.173014120540342e-05, + "loss": 5.8765, + "step": 2766 + }, + { + "epoch": 0.2668273866923819, + "grad_norm": 2.923875331878662, + "learning_rate": 4.172451255436015e-05, + "loss": 6.1439, + "step": 2767 + }, + { + "epoch": 0.266923818707811, + "grad_norm": 2.867135524749756, + "learning_rate": 4.17188823683617e-05, + "loss": 6.128, + "step": 2768 + }, + { + "epoch": 0.2670202507232401, + "grad_norm": 3.4048469066619873, + "learning_rate": 4.1713250647924804e-05, + "loss": 5.9183, + "step": 2769 + }, + { + "epoch": 0.26711668273866923, + "grad_norm": 2.4777207374572754, + "learning_rate": 4.1707617393566344e-05, + "loss": 5.8595, + "step": 2770 + }, + { + "epoch": 0.26721311475409837, + "grad_norm": 2.864938735961914, + "learning_rate": 4.1701982605803316e-05, + "loss": 5.6731, + "step": 2771 + }, + { + "epoch": 0.2673095467695275, + "grad_norm": 3.7255196571350098, + "learning_rate": 4.1696346285152885e-05, + "loss": 5.7208, + "step": 2772 + }, + { + "epoch": 0.2674059787849566, + "grad_norm": 2.2072484493255615, + "learning_rate": 4.1690708432132336e-05, + "loss": 5.5978, + "step": 2773 + }, + { + "epoch": 0.2675024108003857, + "grad_norm": 3.9270102977752686, + "learning_rate": 4.168506904725912e-05, + "loss": 5.9671, + "step": 2774 + }, + { + "epoch": 0.26759884281581486, + "grad_norm": 4.620362758636475, + "learning_rate": 4.167942813105079e-05, + "loss": 6.3118, + "step": 2775 + }, + { + "epoch": 0.26769527483124395, + "grad_norm": 4.572042465209961, + "learning_rate": 4.167378568402509e-05, + "loss": 6.4201, + "step": 2776 + }, + { + "epoch": 0.2677917068466731, + "grad_norm": 2.8413443565368652, + "learning_rate": 4.166814170669985e-05, + "loss": 6.2459, + "step": 2777 + }, + { + "epoch": 0.2678881388621022, + "grad_norm": 3.648625135421753, + "learning_rate": 4.166249619959308e-05, + "loss": 6.2328, + "step": 2778 + }, + { + "epoch": 0.26798457087753136, + "grad_norm": 3.724036693572998, + "learning_rate": 4.165684916322292e-05, + "loss": 6.3306, + "step": 2779 + }, + { + "epoch": 0.26808100289296044, + "grad_norm": 3.641306161880493, + "learning_rate": 4.165120059810765e-05, + "loss": 6.2296, + "step": 2780 + }, + { + "epoch": 0.2681774349083896, + "grad_norm": 4.330116271972656, + "learning_rate": 4.164555050476567e-05, + "loss": 5.8944, + "step": 2781 + }, + { + "epoch": 0.2682738669238187, + "grad_norm": 3.9918105602264404, + "learning_rate": 4.163989888371557e-05, + "loss": 5.9942, + "step": 2782 + }, + { + "epoch": 0.26837029893924785, + "grad_norm": 3.5615642070770264, + "learning_rate": 4.1634245735476016e-05, + "loss": 6.3377, + "step": 2783 + }, + { + "epoch": 0.26846673095467694, + "grad_norm": 2.9189107418060303, + "learning_rate": 4.162859106056587e-05, + "loss": 6.4661, + "step": 2784 + }, + { + "epoch": 0.2685631629701061, + "grad_norm": 2.415229320526123, + "learning_rate": 4.16229348595041e-05, + "loss": 6.2018, + "step": 2785 + }, + { + "epoch": 0.2686595949855352, + "grad_norm": 3.4503934383392334, + "learning_rate": 4.161727713280983e-05, + "loss": 6.0952, + "step": 2786 + }, + { + "epoch": 0.2687560270009643, + "grad_norm": 3.8431661128997803, + "learning_rate": 4.1611617881002316e-05, + "loss": 6.5183, + "step": 2787 + }, + { + "epoch": 0.26885245901639343, + "grad_norm": 4.285297393798828, + "learning_rate": 4.1605957104600956e-05, + "loss": 6.5706, + "step": 2788 + }, + { + "epoch": 0.26894889103182257, + "grad_norm": 4.435140609741211, + "learning_rate": 4.160029480412529e-05, + "loss": 6.128, + "step": 2789 + }, + { + "epoch": 0.2690453230472517, + "grad_norm": 3.601447820663452, + "learning_rate": 4.1594630980095005e-05, + "loss": 6.0128, + "step": 2790 + }, + { + "epoch": 0.2691417550626808, + "grad_norm": 2.7654964923858643, + "learning_rate": 4.1588965633029906e-05, + "loss": 6.2677, + "step": 2791 + }, + { + "epoch": 0.2692381870781099, + "grad_norm": 2.424598455429077, + "learning_rate": 4.1583298763449965e-05, + "loss": 6.109, + "step": 2792 + }, + { + "epoch": 0.26933461909353906, + "grad_norm": 2.748394012451172, + "learning_rate": 4.1577630371875276e-05, + "loss": 6.1739, + "step": 2793 + }, + { + "epoch": 0.2694310511089682, + "grad_norm": 3.6049606800079346, + "learning_rate": 4.157196045882607e-05, + "loss": 6.065, + "step": 2794 + }, + { + "epoch": 0.2695274831243973, + "grad_norm": 4.951334476470947, + "learning_rate": 4.156628902482273e-05, + "loss": 6.2238, + "step": 2795 + }, + { + "epoch": 0.2696239151398264, + "grad_norm": 4.4064860343933105, + "learning_rate": 4.1560616070385775e-05, + "loss": 6.2158, + "step": 2796 + }, + { + "epoch": 0.26972034715525556, + "grad_norm": 3.100754976272583, + "learning_rate": 4.1554941596035865e-05, + "loss": 5.9676, + "step": 2797 + }, + { + "epoch": 0.26981677917068464, + "grad_norm": 3.8435750007629395, + "learning_rate": 4.1549265602293785e-05, + "loss": 6.4235, + "step": 2798 + }, + { + "epoch": 0.2699132111861138, + "grad_norm": 4.436659336090088, + "learning_rate": 4.1543588089680474e-05, + "loss": 6.233, + "step": 2799 + }, + { + "epoch": 0.2700096432015429, + "grad_norm": 4.202788352966309, + "learning_rate": 4.1537909058717014e-05, + "loss": 6.1126, + "step": 2800 + }, + { + "epoch": 0.27010607521697205, + "grad_norm": 4.636790752410889, + "learning_rate": 4.1532228509924614e-05, + "loss": 5.8038, + "step": 2801 + }, + { + "epoch": 0.27020250723240113, + "grad_norm": 3.7743735313415527, + "learning_rate": 4.152654644382463e-05, + "loss": 6.2335, + "step": 2802 + }, + { + "epoch": 0.27029893924783027, + "grad_norm": 4.994185447692871, + "learning_rate": 4.152086286093856e-05, + "loss": 6.3364, + "step": 2803 + }, + { + "epoch": 0.2703953712632594, + "grad_norm": 3.578732967376709, + "learning_rate": 4.151517776178802e-05, + "loss": 6.2754, + "step": 2804 + }, + { + "epoch": 0.27049180327868855, + "grad_norm": 3.529860019683838, + "learning_rate": 4.15094911468948e-05, + "loss": 6.2425, + "step": 2805 + }, + { + "epoch": 0.27058823529411763, + "grad_norm": 3.6563501358032227, + "learning_rate": 4.1503803016780796e-05, + "loss": 6.1715, + "step": 2806 + }, + { + "epoch": 0.27068466730954677, + "grad_norm": 4.170017242431641, + "learning_rate": 4.149811337196807e-05, + "loss": 6.1646, + "step": 2807 + }, + { + "epoch": 0.2707810993249759, + "grad_norm": 2.9180428981781006, + "learning_rate": 4.14924222129788e-05, + "loss": 6.1728, + "step": 2808 + }, + { + "epoch": 0.27087753134040504, + "grad_norm": 3.430891752243042, + "learning_rate": 4.1486729540335324e-05, + "loss": 6.2315, + "step": 2809 + }, + { + "epoch": 0.2709739633558341, + "grad_norm": 2.9381601810455322, + "learning_rate": 4.148103535456009e-05, + "loss": 6.086, + "step": 2810 + }, + { + "epoch": 0.27107039537126326, + "grad_norm": 3.8950045108795166, + "learning_rate": 4.147533965617573e-05, + "loss": 6.0186, + "step": 2811 + }, + { + "epoch": 0.2711668273866924, + "grad_norm": 4.045165061950684, + "learning_rate": 4.146964244570497e-05, + "loss": 5.894, + "step": 2812 + }, + { + "epoch": 0.2712632594021215, + "grad_norm": 3.019461154937744, + "learning_rate": 4.1463943723670695e-05, + "loss": 6.1357, + "step": 2813 + }, + { + "epoch": 0.2713596914175506, + "grad_norm": 2.210073947906494, + "learning_rate": 4.1458243490595926e-05, + "loss": 6.0529, + "step": 2814 + }, + { + "epoch": 0.27145612343297976, + "grad_norm": 2.090120315551758, + "learning_rate": 4.1452541747003834e-05, + "loss": 5.9553, + "step": 2815 + }, + { + "epoch": 0.2715525554484089, + "grad_norm": 4.492236614227295, + "learning_rate": 4.1446838493417707e-05, + "loss": 6.3702, + "step": 2816 + }, + { + "epoch": 0.271648987463838, + "grad_norm": 3.5500147342681885, + "learning_rate": 4.1441133730360985e-05, + "loss": 6.1066, + "step": 2817 + }, + { + "epoch": 0.2717454194792671, + "grad_norm": 3.8034188747406006, + "learning_rate": 4.143542745835724e-05, + "loss": 5.987, + "step": 2818 + }, + { + "epoch": 0.27184185149469625, + "grad_norm": 2.949716091156006, + "learning_rate": 4.14297196779302e-05, + "loss": 5.8694, + "step": 2819 + }, + { + "epoch": 0.2719382835101254, + "grad_norm": 3.4565813541412354, + "learning_rate": 4.142401038960371e-05, + "loss": 6.1832, + "step": 2820 + }, + { + "epoch": 0.27203471552555447, + "grad_norm": 4.002534866333008, + "learning_rate": 4.141829959390175e-05, + "loss": 6.0641, + "step": 2821 + }, + { + "epoch": 0.2721311475409836, + "grad_norm": 4.91417121887207, + "learning_rate": 4.141258729134847e-05, + "loss": 6.2462, + "step": 2822 + }, + { + "epoch": 0.27222757955641275, + "grad_norm": 3.1631181240081787, + "learning_rate": 4.1406873482468135e-05, + "loss": 6.1743, + "step": 2823 + }, + { + "epoch": 0.27232401157184183, + "grad_norm": 5.465005874633789, + "learning_rate": 4.1401158167785136e-05, + "loss": 5.8868, + "step": 2824 + }, + { + "epoch": 0.27242044358727097, + "grad_norm": 2.8168036937713623, + "learning_rate": 4.139544134782403e-05, + "loss": 5.9834, + "step": 2825 + }, + { + "epoch": 0.2725168756027001, + "grad_norm": 4.215944290161133, + "learning_rate": 4.1389723023109496e-05, + "loss": 6.135, + "step": 2826 + }, + { + "epoch": 0.27261330761812924, + "grad_norm": 3.8766565322875977, + "learning_rate": 4.138400319416635e-05, + "loss": 6.1595, + "step": 2827 + }, + { + "epoch": 0.2727097396335583, + "grad_norm": 3.355452299118042, + "learning_rate": 4.137828186151956e-05, + "loss": 5.6914, + "step": 2828 + }, + { + "epoch": 0.27280617164898746, + "grad_norm": 3.9695546627044678, + "learning_rate": 4.137255902569422e-05, + "loss": 5.832, + "step": 2829 + }, + { + "epoch": 0.2729026036644166, + "grad_norm": 3.053496837615967, + "learning_rate": 4.136683468721557e-05, + "loss": 5.6345, + "step": 2830 + }, + { + "epoch": 0.27299903567984574, + "grad_norm": 2.8513476848602295, + "learning_rate": 4.1361108846608965e-05, + "loss": 6.1437, + "step": 2831 + }, + { + "epoch": 0.2730954676952748, + "grad_norm": 3.927556276321411, + "learning_rate": 4.135538150439993e-05, + "loss": 6.0919, + "step": 2832 + }, + { + "epoch": 0.27319189971070396, + "grad_norm": 2.8892669677734375, + "learning_rate": 4.134965266111411e-05, + "loss": 5.9883, + "step": 2833 + }, + { + "epoch": 0.2732883317261331, + "grad_norm": 2.7641263008117676, + "learning_rate": 4.134392231727731e-05, + "loss": 6.133, + "step": 2834 + }, + { + "epoch": 0.2733847637415622, + "grad_norm": 1.9348996877670288, + "learning_rate": 4.133819047341541e-05, + "loss": 5.961, + "step": 2835 + }, + { + "epoch": 0.2734811957569913, + "grad_norm": 1.9756377935409546, + "learning_rate": 4.1332457130054516e-05, + "loss": 5.8347, + "step": 2836 + }, + { + "epoch": 0.27357762777242045, + "grad_norm": 3.325430154800415, + "learning_rate": 4.13267222877208e-05, + "loss": 5.9416, + "step": 2837 + }, + { + "epoch": 0.2736740597878496, + "grad_norm": 3.198493480682373, + "learning_rate": 4.1320985946940604e-05, + "loss": 6.1815, + "step": 2838 + }, + { + "epoch": 0.27377049180327867, + "grad_norm": 2.3732197284698486, + "learning_rate": 4.1315248108240414e-05, + "loss": 6.028, + "step": 2839 + }, + { + "epoch": 0.2738669238187078, + "grad_norm": 3.239501714706421, + "learning_rate": 4.130950877214683e-05, + "loss": 5.8491, + "step": 2840 + }, + { + "epoch": 0.27396335583413695, + "grad_norm": 2.6887011528015137, + "learning_rate": 4.130376793918661e-05, + "loss": 5.7947, + "step": 2841 + }, + { + "epoch": 0.2740597878495661, + "grad_norm": 3.036472797393799, + "learning_rate": 4.1298025609886624e-05, + "loss": 5.7868, + "step": 2842 + }, + { + "epoch": 0.27415621986499517, + "grad_norm": 2.8093161582946777, + "learning_rate": 4.129228178477391e-05, + "loss": 6.0973, + "step": 2843 + }, + { + "epoch": 0.2742526518804243, + "grad_norm": 4.150613784790039, + "learning_rate": 4.128653646437564e-05, + "loss": 6.1582, + "step": 2844 + }, + { + "epoch": 0.27434908389585344, + "grad_norm": 2.6185052394866943, + "learning_rate": 4.1280789649219085e-05, + "loss": 6.0842, + "step": 2845 + }, + { + "epoch": 0.2744455159112825, + "grad_norm": 3.4805150032043457, + "learning_rate": 4.127504133983169e-05, + "loss": 6.2725, + "step": 2846 + }, + { + "epoch": 0.27454194792671166, + "grad_norm": 3.0672619342803955, + "learning_rate": 4.1269291536741045e-05, + "loss": 6.2601, + "step": 2847 + }, + { + "epoch": 0.2746383799421408, + "grad_norm": 6.129732131958008, + "learning_rate": 4.126354024047484e-05, + "loss": 6.0623, + "step": 2848 + }, + { + "epoch": 0.27473481195756994, + "grad_norm": 5.605488300323486, + "learning_rate": 4.125778745156093e-05, + "loss": 6.2392, + "step": 2849 + }, + { + "epoch": 0.274831243972999, + "grad_norm": 5.605937480926514, + "learning_rate": 4.1252033170527296e-05, + "loss": 6.0879, + "step": 2850 + }, + { + "epoch": 0.27492767598842816, + "grad_norm": 2.889587163925171, + "learning_rate": 4.1246277397902064e-05, + "loss": 6.1171, + "step": 2851 + }, + { + "epoch": 0.2750241080038573, + "grad_norm": 3.6581108570098877, + "learning_rate": 4.124052013421349e-05, + "loss": 5.2391, + "step": 2852 + }, + { + "epoch": 0.27512054001928643, + "grad_norm": 7.151916980743408, + "learning_rate": 4.123476137998996e-05, + "loss": 6.3435, + "step": 2853 + }, + { + "epoch": 0.2752169720347155, + "grad_norm": 7.703052997589111, + "learning_rate": 4.122900113576001e-05, + "loss": 6.1445, + "step": 2854 + }, + { + "epoch": 0.27531340405014465, + "grad_norm": 5.561044692993164, + "learning_rate": 4.122323940205233e-05, + "loss": 6.2171, + "step": 2855 + }, + { + "epoch": 0.2754098360655738, + "grad_norm": 2.912191152572632, + "learning_rate": 4.121747617939569e-05, + "loss": 5.9938, + "step": 2856 + }, + { + "epoch": 0.27550626808100287, + "grad_norm": 5.266725540161133, + "learning_rate": 4.121171146831905e-05, + "loss": 6.2326, + "step": 2857 + }, + { + "epoch": 0.275602700096432, + "grad_norm": 7.032985210418701, + "learning_rate": 4.1205945269351484e-05, + "loss": 6.3559, + "step": 2858 + }, + { + "epoch": 0.27569913211186114, + "grad_norm": 5.070994853973389, + "learning_rate": 4.1200177583022206e-05, + "loss": 6.3444, + "step": 2859 + }, + { + "epoch": 0.2757955641272903, + "grad_norm": 3.6135196685791016, + "learning_rate": 4.119440840986056e-05, + "loss": 5.9992, + "step": 2860 + }, + { + "epoch": 0.27589199614271936, + "grad_norm": 3.024874210357666, + "learning_rate": 4.118863775039606e-05, + "loss": 5.5347, + "step": 2861 + }, + { + "epoch": 0.2759884281581485, + "grad_norm": 4.0282111167907715, + "learning_rate": 4.1182865605158304e-05, + "loss": 6.0648, + "step": 2862 + }, + { + "epoch": 0.27608486017357764, + "grad_norm": 4.382232666015625, + "learning_rate": 4.117709197467706e-05, + "loss": 6.2114, + "step": 2863 + }, + { + "epoch": 0.2761812921890068, + "grad_norm": 3.6341822147369385, + "learning_rate": 4.1171316859482225e-05, + "loss": 6.089, + "step": 2864 + }, + { + "epoch": 0.27627772420443586, + "grad_norm": 2.940389633178711, + "learning_rate": 4.116554026010383e-05, + "loss": 6.2542, + "step": 2865 + }, + { + "epoch": 0.276374156219865, + "grad_norm": 3.2856900691986084, + "learning_rate": 4.1159762177072056e-05, + "loss": 6.1828, + "step": 2866 + }, + { + "epoch": 0.27647058823529413, + "grad_norm": 2.6638646125793457, + "learning_rate": 4.1153982610917184e-05, + "loss": 6.118, + "step": 2867 + }, + { + "epoch": 0.2765670202507232, + "grad_norm": 3.6109440326690674, + "learning_rate": 4.1148201562169685e-05, + "loss": 6.0818, + "step": 2868 + }, + { + "epoch": 0.27666345226615235, + "grad_norm": 3.4430642127990723, + "learning_rate": 4.114241903136012e-05, + "loss": 5.6292, + "step": 2869 + }, + { + "epoch": 0.2767598842815815, + "grad_norm": 5.043337821960449, + "learning_rate": 4.11366350190192e-05, + "loss": 5.9214, + "step": 2870 + }, + { + "epoch": 0.27685631629701063, + "grad_norm": 5.715664863586426, + "learning_rate": 4.113084952567777e-05, + "loss": 5.3928, + "step": 2871 + }, + { + "epoch": 0.2769527483124397, + "grad_norm": 4.983113765716553, + "learning_rate": 4.112506255186684e-05, + "loss": 5.9488, + "step": 2872 + }, + { + "epoch": 0.27704918032786885, + "grad_norm": 2.3944878578186035, + "learning_rate": 4.111927409811751e-05, + "loss": 5.9754, + "step": 2873 + }, + { + "epoch": 0.277145612343298, + "grad_norm": 3.219247817993164, + "learning_rate": 4.1113484164961044e-05, + "loss": 6.0279, + "step": 2874 + }, + { + "epoch": 0.2772420443587271, + "grad_norm": 2.9468624591827393, + "learning_rate": 4.110769275292883e-05, + "loss": 6.0106, + "step": 2875 + }, + { + "epoch": 0.2773384763741562, + "grad_norm": 4.732926845550537, + "learning_rate": 4.11018998625524e-05, + "loss": 5.9406, + "step": 2876 + }, + { + "epoch": 0.27743490838958534, + "grad_norm": 3.3450582027435303, + "learning_rate": 4.109610549436342e-05, + "loss": 5.8262, + "step": 2877 + }, + { + "epoch": 0.2775313404050145, + "grad_norm": 3.601963758468628, + "learning_rate": 4.1090309648893686e-05, + "loss": 5.4359, + "step": 2878 + }, + { + "epoch": 0.27762777242044356, + "grad_norm": 2.5859947204589844, + "learning_rate": 4.108451232667514e-05, + "loss": 5.3534, + "step": 2879 + }, + { + "epoch": 0.2777242044358727, + "grad_norm": 3.6944522857666016, + "learning_rate": 4.107871352823985e-05, + "loss": 5.8544, + "step": 2880 + }, + { + "epoch": 0.27782063645130184, + "grad_norm": 4.077560901641846, + "learning_rate": 4.107291325412003e-05, + "loss": 6.1869, + "step": 2881 + }, + { + "epoch": 0.277917068466731, + "grad_norm": 3.030552625656128, + "learning_rate": 4.1067111504847996e-05, + "loss": 6.0732, + "step": 2882 + }, + { + "epoch": 0.27801350048216006, + "grad_norm": 5.072360992431641, + "learning_rate": 4.1061308280956256e-05, + "loss": 6.2271, + "step": 2883 + }, + { + "epoch": 0.2781099324975892, + "grad_norm": 5.801494598388672, + "learning_rate": 4.1055503582977414e-05, + "loss": 6.0868, + "step": 2884 + }, + { + "epoch": 0.27820636451301833, + "grad_norm": 4.12561559677124, + "learning_rate": 4.104969741144421e-05, + "loss": 5.9386, + "step": 2885 + }, + { + "epoch": 0.27830279652844747, + "grad_norm": 3.2038402557373047, + "learning_rate": 4.104388976688953e-05, + "loss": 6.097, + "step": 2886 + }, + { + "epoch": 0.27839922854387655, + "grad_norm": 2.9108917713165283, + "learning_rate": 4.103808064984639e-05, + "loss": 6.1745, + "step": 2887 + }, + { + "epoch": 0.2784956605593057, + "grad_norm": 2.8603804111480713, + "learning_rate": 4.103227006084795e-05, + "loss": 6.2178, + "step": 2888 + }, + { + "epoch": 0.27859209257473483, + "grad_norm": 2.910346746444702, + "learning_rate": 4.10264580004275e-05, + "loss": 6.1889, + "step": 2889 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 3.4570152759552, + "learning_rate": 4.102064446911845e-05, + "loss": 6.2257, + "step": 2890 + }, + { + "epoch": 0.27878495660559305, + "grad_norm": 2.429029703140259, + "learning_rate": 4.101482946745439e-05, + "loss": 6.2255, + "step": 2891 + }, + { + "epoch": 0.2788813886210222, + "grad_norm": 2.5487143993377686, + "learning_rate": 4.100901299596897e-05, + "loss": 6.2146, + "step": 2892 + }, + { + "epoch": 0.2789778206364513, + "grad_norm": 2.481706380844116, + "learning_rate": 4.1003195055196055e-05, + "loss": 6.2344, + "step": 2893 + }, + { + "epoch": 0.2790742526518804, + "grad_norm": 3.931403875350952, + "learning_rate": 4.099737564566959e-05, + "loss": 5.9555, + "step": 2894 + }, + { + "epoch": 0.27917068466730954, + "grad_norm": 2.480861186981201, + "learning_rate": 4.099155476792367e-05, + "loss": 5.982, + "step": 2895 + }, + { + "epoch": 0.2792671166827387, + "grad_norm": 2.7326595783233643, + "learning_rate": 4.0985732422492543e-05, + "loss": 5.8451, + "step": 2896 + }, + { + "epoch": 0.2793635486981678, + "grad_norm": 3.570857048034668, + "learning_rate": 4.097990860991057e-05, + "loss": 5.7046, + "step": 2897 + }, + { + "epoch": 0.2794599807135969, + "grad_norm": 3.078108549118042, + "learning_rate": 4.0974083330712255e-05, + "loss": 5.9974, + "step": 2898 + }, + { + "epoch": 0.27955641272902604, + "grad_norm": 3.172625780105591, + "learning_rate": 4.096825658543223e-05, + "loss": 5.4413, + "step": 2899 + }, + { + "epoch": 0.2796528447444552, + "grad_norm": 2.1150283813476562, + "learning_rate": 4.0962428374605263e-05, + "loss": 5.753, + "step": 2900 + }, + { + "epoch": 0.27974927675988426, + "grad_norm": 2.7980422973632812, + "learning_rate": 4.095659869876628e-05, + "loss": 6.0408, + "step": 2901 + }, + { + "epoch": 0.2798457087753134, + "grad_norm": 3.7878458499908447, + "learning_rate": 4.09507675584503e-05, + "loss": 6.1164, + "step": 2902 + }, + { + "epoch": 0.27994214079074253, + "grad_norm": 4.971096992492676, + "learning_rate": 4.09449349541925e-05, + "loss": 5.9868, + "step": 2903 + }, + { + "epoch": 0.28003857280617167, + "grad_norm": 3.1327404975891113, + "learning_rate": 4.0939100886528205e-05, + "loss": 6.1906, + "step": 2904 + }, + { + "epoch": 0.28013500482160075, + "grad_norm": 3.6484427452087402, + "learning_rate": 4.0933265355992844e-05, + "loss": 6.0876, + "step": 2905 + }, + { + "epoch": 0.2802314368370299, + "grad_norm": 4.650661945343018, + "learning_rate": 4.092742836312201e-05, + "loss": 6.0954, + "step": 2906 + }, + { + "epoch": 0.28032786885245903, + "grad_norm": 2.701357126235962, + "learning_rate": 4.092158990845139e-05, + "loss": 6.0714, + "step": 2907 + }, + { + "epoch": 0.28042430086788817, + "grad_norm": 2.2899701595306396, + "learning_rate": 4.091574999251685e-05, + "loss": 6.3049, + "step": 2908 + }, + { + "epoch": 0.28052073288331725, + "grad_norm": 2.8341856002807617, + "learning_rate": 4.090990861585437e-05, + "loss": 5.9471, + "step": 2909 + }, + { + "epoch": 0.2806171648987464, + "grad_norm": 3.224904775619507, + "learning_rate": 4.0904065779000045e-05, + "loss": 5.9785, + "step": 2910 + }, + { + "epoch": 0.2807135969141755, + "grad_norm": 2.7321724891662598, + "learning_rate": 4.089822148249014e-05, + "loss": 6.2354, + "step": 2911 + }, + { + "epoch": 0.2808100289296046, + "grad_norm": 2.5391337871551514, + "learning_rate": 4.089237572686104e-05, + "loss": 6.2693, + "step": 2912 + }, + { + "epoch": 0.28090646094503374, + "grad_norm": 3.611737012863159, + "learning_rate": 4.088652851264926e-05, + "loss": 5.9864, + "step": 2913 + }, + { + "epoch": 0.2810028929604629, + "grad_norm": 2.817239284515381, + "learning_rate": 4.088067984039144e-05, + "loss": 5.9388, + "step": 2914 + }, + { + "epoch": 0.281099324975892, + "grad_norm": 1.889172911643982, + "learning_rate": 4.087482971062437e-05, + "loss": 6.1718, + "step": 2915 + }, + { + "epoch": 0.2811957569913211, + "grad_norm": 3.504213571548462, + "learning_rate": 4.0868978123884977e-05, + "loss": 6.1738, + "step": 2916 + }, + { + "epoch": 0.28129218900675024, + "grad_norm": 4.733470439910889, + "learning_rate": 4.0863125080710294e-05, + "loss": 5.924, + "step": 2917 + }, + { + "epoch": 0.2813886210221794, + "grad_norm": 2.966437578201294, + "learning_rate": 4.085727058163752e-05, + "loss": 5.8812, + "step": 2918 + }, + { + "epoch": 0.2814850530376085, + "grad_norm": 2.9167330265045166, + "learning_rate": 4.085141462720397e-05, + "loss": 6.0863, + "step": 2919 + }, + { + "epoch": 0.2815814850530376, + "grad_norm": 3.768296241760254, + "learning_rate": 4.0845557217947096e-05, + "loss": 6.2879, + "step": 2920 + }, + { + "epoch": 0.28167791706846673, + "grad_norm": 3.147967576980591, + "learning_rate": 4.0839698354404476e-05, + "loss": 6.301, + "step": 2921 + }, + { + "epoch": 0.28177434908389587, + "grad_norm": 2.9780256748199463, + "learning_rate": 4.083383803711384e-05, + "loss": 6.0882, + "step": 2922 + }, + { + "epoch": 0.28187078109932495, + "grad_norm": 3.179678440093994, + "learning_rate": 4.0827976266613036e-05, + "loss": 6.1606, + "step": 2923 + }, + { + "epoch": 0.2819672131147541, + "grad_norm": 3.3174521923065186, + "learning_rate": 4.0822113043440056e-05, + "loss": 6.2843, + "step": 2924 + }, + { + "epoch": 0.2820636451301832, + "grad_norm": 4.284568786621094, + "learning_rate": 4.0816248368133016e-05, + "loss": 5.8556, + "step": 2925 + }, + { + "epoch": 0.28216007714561236, + "grad_norm": 2.7894465923309326, + "learning_rate": 4.081038224123016e-05, + "loss": 6.0443, + "step": 2926 + }, + { + "epoch": 0.28225650916104145, + "grad_norm": 2.1447300910949707, + "learning_rate": 4.0804514663269885e-05, + "loss": 6.0536, + "step": 2927 + }, + { + "epoch": 0.2823529411764706, + "grad_norm": 3.56453537940979, + "learning_rate": 4.079864563479071e-05, + "loss": 6.1671, + "step": 2928 + }, + { + "epoch": 0.2824493731918997, + "grad_norm": 2.3810715675354004, + "learning_rate": 4.079277515633127e-05, + "loss": 5.9964, + "step": 2929 + }, + { + "epoch": 0.28254580520732886, + "grad_norm": 2.1313934326171875, + "learning_rate": 4.078690322843038e-05, + "loss": 5.5941, + "step": 2930 + }, + { + "epoch": 0.28264223722275794, + "grad_norm": 2.4851534366607666, + "learning_rate": 4.0781029851626936e-05, + "loss": 6.1757, + "step": 2931 + }, + { + "epoch": 0.2827386692381871, + "grad_norm": 2.031181573867798, + "learning_rate": 4.077515502646e-05, + "loss": 6.1793, + "step": 2932 + }, + { + "epoch": 0.2828351012536162, + "grad_norm": 3.224290609359741, + "learning_rate": 4.076927875346874e-05, + "loss": 6.1919, + "step": 2933 + }, + { + "epoch": 0.2829315332690453, + "grad_norm": 3.1684000492095947, + "learning_rate": 4.0763401033192497e-05, + "loss": 6.1667, + "step": 2934 + }, + { + "epoch": 0.28302796528447444, + "grad_norm": 4.041899681091309, + "learning_rate": 4.07575218661707e-05, + "loss": 6.0049, + "step": 2935 + }, + { + "epoch": 0.2831243972999036, + "grad_norm": 2.580751419067383, + "learning_rate": 4.075164125294295e-05, + "loss": 6.1684, + "step": 2936 + }, + { + "epoch": 0.2832208293153327, + "grad_norm": 3.0645153522491455, + "learning_rate": 4.0745759194048946e-05, + "loss": 6.0717, + "step": 2937 + }, + { + "epoch": 0.2833172613307618, + "grad_norm": 2.5822880268096924, + "learning_rate": 4.073987569002854e-05, + "loss": 6.0831, + "step": 2938 + }, + { + "epoch": 0.28341369334619093, + "grad_norm": 3.7166643142700195, + "learning_rate": 4.073399074142172e-05, + "loss": 5.847, + "step": 2939 + }, + { + "epoch": 0.28351012536162007, + "grad_norm": 2.6529648303985596, + "learning_rate": 4.072810434876859e-05, + "loss": 5.7677, + "step": 2940 + }, + { + "epoch": 0.2836065573770492, + "grad_norm": 4.281891822814941, + "learning_rate": 4.072221651260941e-05, + "loss": 5.725, + "step": 2941 + }, + { + "epoch": 0.2837029893924783, + "grad_norm": 3.123727321624756, + "learning_rate": 4.0716327233484546e-05, + "loss": 5.6074, + "step": 2942 + }, + { + "epoch": 0.2837994214079074, + "grad_norm": 3.442678213119507, + "learning_rate": 4.071043651193451e-05, + "loss": 5.9772, + "step": 2943 + }, + { + "epoch": 0.28389585342333656, + "grad_norm": 3.513364791870117, + "learning_rate": 4.070454434849995e-05, + "loss": 6.0654, + "step": 2944 + }, + { + "epoch": 0.28399228543876565, + "grad_norm": 3.4257118701934814, + "learning_rate": 4.069865074372164e-05, + "loss": 6.1807, + "step": 2945 + }, + { + "epoch": 0.2840887174541948, + "grad_norm": 3.74124813079834, + "learning_rate": 4.069275569814048e-05, + "loss": 6.0818, + "step": 2946 + }, + { + "epoch": 0.2841851494696239, + "grad_norm": 4.520610809326172, + "learning_rate": 4.068685921229753e-05, + "loss": 6.1378, + "step": 2947 + }, + { + "epoch": 0.28428158148505306, + "grad_norm": 4.381681442260742, + "learning_rate": 4.068096128673394e-05, + "loss": 6.1575, + "step": 2948 + }, + { + "epoch": 0.28437801350048214, + "grad_norm": 3.408726453781128, + "learning_rate": 4.067506192199102e-05, + "loss": 6.1682, + "step": 2949 + }, + { + "epoch": 0.2844744455159113, + "grad_norm": 5.009002208709717, + "learning_rate": 4.0669161118610214e-05, + "loss": 6.5303, + "step": 2950 + }, + { + "epoch": 0.2845708775313404, + "grad_norm": 3.565664768218994, + "learning_rate": 4.066325887713308e-05, + "loss": 5.9447, + "step": 2951 + }, + { + "epoch": 0.28466730954676955, + "grad_norm": 3.7285311222076416, + "learning_rate": 4.065735519810133e-05, + "loss": 5.9575, + "step": 2952 + }, + { + "epoch": 0.28476374156219864, + "grad_norm": 3.933889865875244, + "learning_rate": 4.0651450082056796e-05, + "loss": 5.8489, + "step": 2953 + }, + { + "epoch": 0.2848601735776278, + "grad_norm": 3.2781894207000732, + "learning_rate": 4.064554352954143e-05, + "loss": 6.1079, + "step": 2954 + }, + { + "epoch": 0.2849566055930569, + "grad_norm": 2.532658100128174, + "learning_rate": 4.063963554109734e-05, + "loss": 6.1538, + "step": 2955 + }, + { + "epoch": 0.285053037608486, + "grad_norm": 3.7848165035247803, + "learning_rate": 4.0633726117266746e-05, + "loss": 6.0266, + "step": 2956 + }, + { + "epoch": 0.28514946962391513, + "grad_norm": 4.1846604347229, + "learning_rate": 4.062781525859201e-05, + "loss": 5.9221, + "step": 2957 + }, + { + "epoch": 0.28524590163934427, + "grad_norm": 3.861973524093628, + "learning_rate": 4.0621902965615626e-05, + "loss": 6.2387, + "step": 2958 + }, + { + "epoch": 0.2853423336547734, + "grad_norm": 4.6053900718688965, + "learning_rate": 4.0615989238880213e-05, + "loss": 6.218, + "step": 2959 + }, + { + "epoch": 0.2854387656702025, + "grad_norm": 4.997434139251709, + "learning_rate": 4.061007407892853e-05, + "loss": 6.3035, + "step": 2960 + }, + { + "epoch": 0.2855351976856316, + "grad_norm": 5.3291826248168945, + "learning_rate": 4.0604157486303464e-05, + "loss": 6.0362, + "step": 2961 + }, + { + "epoch": 0.28563162970106076, + "grad_norm": 4.167816638946533, + "learning_rate": 4.059823946154803e-05, + "loss": 6.2693, + "step": 2962 + }, + { + "epoch": 0.2857280617164899, + "grad_norm": 2.3569397926330566, + "learning_rate": 4.0592320005205365e-05, + "loss": 6.1779, + "step": 2963 + }, + { + "epoch": 0.285824493731919, + "grad_norm": 2.7579598426818848, + "learning_rate": 4.058639911781877e-05, + "loss": 6.1169, + "step": 2964 + }, + { + "epoch": 0.2859209257473481, + "grad_norm": 4.7769622802734375, + "learning_rate": 4.058047679993165e-05, + "loss": 5.9996, + "step": 2965 + }, + { + "epoch": 0.28601735776277726, + "grad_norm": 3.7462711334228516, + "learning_rate": 4.057455305208754e-05, + "loss": 6.0283, + "step": 2966 + }, + { + "epoch": 0.28611378977820634, + "grad_norm": 2.8514513969421387, + "learning_rate": 4.0568627874830126e-05, + "loss": 5.971, + "step": 2967 + }, + { + "epoch": 0.2862102217936355, + "grad_norm": 3.6455352306365967, + "learning_rate": 4.0562701268703205e-05, + "loss": 5.9447, + "step": 2968 + }, + { + "epoch": 0.2863066538090646, + "grad_norm": 3.0910871028900146, + "learning_rate": 4.055677323425071e-05, + "loss": 6.0461, + "step": 2969 + }, + { + "epoch": 0.28640308582449375, + "grad_norm": 2.7130444049835205, + "learning_rate": 4.0550843772016724e-05, + "loss": 6.0789, + "step": 2970 + }, + { + "epoch": 0.28649951783992283, + "grad_norm": 3.1442747116088867, + "learning_rate": 4.0544912882545425e-05, + "loss": 6.261, + "step": 2971 + }, + { + "epoch": 0.286595949855352, + "grad_norm": 3.4809556007385254, + "learning_rate": 4.053898056638117e-05, + "loss": 5.9111, + "step": 2972 + }, + { + "epoch": 0.2866923818707811, + "grad_norm": 3.16986346244812, + "learning_rate": 4.0533046824068385e-05, + "loss": 6.0738, + "step": 2973 + }, + { + "epoch": 0.28678881388621025, + "grad_norm": 4.072354793548584, + "learning_rate": 4.05271116561517e-05, + "loss": 6.238, + "step": 2974 + }, + { + "epoch": 0.28688524590163933, + "grad_norm": 5.150450229644775, + "learning_rate": 4.05211750631758e-05, + "loss": 6.1423, + "step": 2975 + }, + { + "epoch": 0.28698167791706847, + "grad_norm": 5.492907524108887, + "learning_rate": 4.051523704568557e-05, + "loss": 6.0137, + "step": 2976 + }, + { + "epoch": 0.2870781099324976, + "grad_norm": 3.2752761840820312, + "learning_rate": 4.050929760422598e-05, + "loss": 6.2009, + "step": 2977 + }, + { + "epoch": 0.2871745419479267, + "grad_norm": 4.937628746032715, + "learning_rate": 4.0503356739342136e-05, + "loss": 6.1596, + "step": 2978 + }, + { + "epoch": 0.2872709739633558, + "grad_norm": 4.760725975036621, + "learning_rate": 4.04974144515793e-05, + "loss": 6.178, + "step": 2979 + }, + { + "epoch": 0.28736740597878496, + "grad_norm": 3.5678353309631348, + "learning_rate": 4.049147074148283e-05, + "loss": 6.0686, + "step": 2980 + }, + { + "epoch": 0.2874638379942141, + "grad_norm": 2.937267303466797, + "learning_rate": 4.0485525609598254e-05, + "loss": 6.0836, + "step": 2981 + }, + { + "epoch": 0.2875602700096432, + "grad_norm": 3.3555970191955566, + "learning_rate": 4.047957905647119e-05, + "loss": 6.1054, + "step": 2982 + }, + { + "epoch": 0.2876567020250723, + "grad_norm": 3.801151990890503, + "learning_rate": 4.047363108264741e-05, + "loss": 6.0771, + "step": 2983 + }, + { + "epoch": 0.28775313404050146, + "grad_norm": 2.6038706302642822, + "learning_rate": 4.046768168867282e-05, + "loss": 6.1108, + "step": 2984 + }, + { + "epoch": 0.2878495660559306, + "grad_norm": 2.8446531295776367, + "learning_rate": 4.046173087509344e-05, + "loss": 6.0686, + "step": 2985 + }, + { + "epoch": 0.2879459980713597, + "grad_norm": 2.8533618450164795, + "learning_rate": 4.0455778642455435e-05, + "loss": 6.0568, + "step": 2986 + }, + { + "epoch": 0.2880424300867888, + "grad_norm": 3.3963489532470703, + "learning_rate": 4.044982499130509e-05, + "loss": 5.9041, + "step": 2987 + }, + { + "epoch": 0.28813886210221795, + "grad_norm": 3.4438233375549316, + "learning_rate": 4.0443869922188815e-05, + "loss": 5.9375, + "step": 2988 + }, + { + "epoch": 0.28823529411764703, + "grad_norm": 3.0849428176879883, + "learning_rate": 4.0437913435653165e-05, + "loss": 5.9805, + "step": 2989 + }, + { + "epoch": 0.28833172613307617, + "grad_norm": 6.46971321105957, + "learning_rate": 4.0431955532244827e-05, + "loss": 6.2811, + "step": 2990 + }, + { + "epoch": 0.2884281581485053, + "grad_norm": 5.302934169769287, + "learning_rate": 4.04259962125106e-05, + "loss": 6.0835, + "step": 2991 + }, + { + "epoch": 0.28852459016393445, + "grad_norm": 3.518386125564575, + "learning_rate": 4.0420035476997434e-05, + "loss": 6.1176, + "step": 2992 + }, + { + "epoch": 0.28862102217936353, + "grad_norm": 4.129462242126465, + "learning_rate": 4.041407332625238e-05, + "loss": 6.0762, + "step": 2993 + }, + { + "epoch": 0.28871745419479267, + "grad_norm": 5.444161415100098, + "learning_rate": 4.040810976082266e-05, + "loss": 5.9602, + "step": 2994 + }, + { + "epoch": 0.2888138862102218, + "grad_norm": 6.340167045593262, + "learning_rate": 4.040214478125558e-05, + "loss": 6.1107, + "step": 2995 + }, + { + "epoch": 0.28891031822565094, + "grad_norm": 5.238665580749512, + "learning_rate": 4.0396178388098615e-05, + "loss": 5.9633, + "step": 2996 + }, + { + "epoch": 0.28900675024108, + "grad_norm": 3.779142379760742, + "learning_rate": 4.039021058189935e-05, + "loss": 6.1208, + "step": 2997 + }, + { + "epoch": 0.28910318225650916, + "grad_norm": 4.652892112731934, + "learning_rate": 4.0384241363205496e-05, + "loss": 6.3031, + "step": 2998 + }, + { + "epoch": 0.2891996142719383, + "grad_norm": 4.523927211761475, + "learning_rate": 4.0378270732564905e-05, + "loss": 6.2629, + "step": 2999 + }, + { + "epoch": 0.2892960462873674, + "grad_norm": 4.462098598480225, + "learning_rate": 4.0372298690525546e-05, + "loss": 6.2735, + "step": 3000 + }, + { + "epoch": 0.2893924783027965, + "grad_norm": 4.127562522888184, + "learning_rate": 4.036632523763555e-05, + "loss": 6.1601, + "step": 3001 + }, + { + "epoch": 0.28948891031822566, + "grad_norm": 3.383232355117798, + "learning_rate": 4.036035037444312e-05, + "loss": 6.1824, + "step": 3002 + }, + { + "epoch": 0.2895853423336548, + "grad_norm": 2.6900155544281006, + "learning_rate": 4.035437410149665e-05, + "loss": 6.0397, + "step": 3003 + }, + { + "epoch": 0.2896817743490839, + "grad_norm": 2.999457359313965, + "learning_rate": 4.034839641934462e-05, + "loss": 6.0583, + "step": 3004 + }, + { + "epoch": 0.289778206364513, + "grad_norm": 4.061649799346924, + "learning_rate": 4.0342417328535656e-05, + "loss": 5.8943, + "step": 3005 + }, + { + "epoch": 0.28987463837994215, + "grad_norm": 3.568506956100464, + "learning_rate": 4.0336436829618506e-05, + "loss": 5.4929, + "step": 3006 + }, + { + "epoch": 0.2899710703953713, + "grad_norm": 3.459714412689209, + "learning_rate": 4.0330454923142066e-05, + "loss": 5.7563, + "step": 3007 + }, + { + "epoch": 0.29006750241080037, + "grad_norm": 3.921916961669922, + "learning_rate": 4.032447160965534e-05, + "loss": 6.0939, + "step": 3008 + }, + { + "epoch": 0.2901639344262295, + "grad_norm": 2.4805421829223633, + "learning_rate": 4.031848688970748e-05, + "loss": 6.1251, + "step": 3009 + }, + { + "epoch": 0.29026036644165865, + "grad_norm": 5.25581169128418, + "learning_rate": 4.0312500763847736e-05, + "loss": 5.6126, + "step": 3010 + }, + { + "epoch": 0.29035679845708773, + "grad_norm": 5.273101806640625, + "learning_rate": 4.0306513232625524e-05, + "loss": 6.1298, + "step": 3011 + }, + { + "epoch": 0.29045323047251687, + "grad_norm": 4.185126781463623, + "learning_rate": 4.030052429659037e-05, + "loss": 5.6251, + "step": 3012 + }, + { + "epoch": 0.290549662487946, + "grad_norm": 4.948699474334717, + "learning_rate": 4.0294533956291926e-05, + "loss": 5.838, + "step": 3013 + }, + { + "epoch": 0.29064609450337514, + "grad_norm": 3.018796920776367, + "learning_rate": 4.0288542212279976e-05, + "loss": 6.023, + "step": 3014 + }, + { + "epoch": 0.2907425265188042, + "grad_norm": 3.3411829471588135, + "learning_rate": 4.0282549065104436e-05, + "loss": 5.6589, + "step": 3015 + }, + { + "epoch": 0.29083895853423336, + "grad_norm": 4.75632381439209, + "learning_rate": 4.0276554515315364e-05, + "loss": 5.8337, + "step": 3016 + }, + { + "epoch": 0.2909353905496625, + "grad_norm": 3.281482696533203, + "learning_rate": 4.027055856346291e-05, + "loss": 6.0929, + "step": 3017 + }, + { + "epoch": 0.29103182256509164, + "grad_norm": 4.503252029418945, + "learning_rate": 4.026456121009739e-05, + "loss": 6.2247, + "step": 3018 + }, + { + "epoch": 0.2911282545805207, + "grad_norm": 5.870284557342529, + "learning_rate": 4.0258562455769236e-05, + "loss": 5.9858, + "step": 3019 + }, + { + "epoch": 0.29122468659594986, + "grad_norm": 6.695756912231445, + "learning_rate": 4.025256230102899e-05, + "loss": 6.0047, + "step": 3020 + }, + { + "epoch": 0.291321118611379, + "grad_norm": 4.959413051605225, + "learning_rate": 4.0246560746427353e-05, + "loss": 6.0518, + "step": 3021 + }, + { + "epoch": 0.2914175506268081, + "grad_norm": 3.165940999984741, + "learning_rate": 4.024055779251513e-05, + "loss": 5.9759, + "step": 3022 + }, + { + "epoch": 0.2915139826422372, + "grad_norm": 5.737679481506348, + "learning_rate": 4.0234553439843285e-05, + "loss": 6.1677, + "step": 3023 + }, + { + "epoch": 0.29161041465766635, + "grad_norm": 5.9143829345703125, + "learning_rate": 4.022854768896287e-05, + "loss": 6.1892, + "step": 3024 + }, + { + "epoch": 0.2917068466730955, + "grad_norm": 4.738935947418213, + "learning_rate": 4.022254054042509e-05, + "loss": 5.9127, + "step": 3025 + }, + { + "epoch": 0.29180327868852457, + "grad_norm": 3.679778575897217, + "learning_rate": 4.021653199478127e-05, + "loss": 6.1712, + "step": 3026 + }, + { + "epoch": 0.2918997107039537, + "grad_norm": 5.126358509063721, + "learning_rate": 4.0210522052582874e-05, + "loss": 6.2288, + "step": 3027 + }, + { + "epoch": 0.29199614271938285, + "grad_norm": 4.32521390914917, + "learning_rate": 4.0204510714381494e-05, + "loss": 6.1658, + "step": 3028 + }, + { + "epoch": 0.292092574734812, + "grad_norm": 3.4167962074279785, + "learning_rate": 4.019849798072882e-05, + "loss": 6.0529, + "step": 3029 + }, + { + "epoch": 0.29218900675024106, + "grad_norm": 3.4468233585357666, + "learning_rate": 4.019248385217672e-05, + "loss": 5.8933, + "step": 3030 + }, + { + "epoch": 0.2922854387656702, + "grad_norm": 3.278836727142334, + "learning_rate": 4.018646832927715e-05, + "loss": 6.0691, + "step": 3031 + }, + { + "epoch": 0.29238187078109934, + "grad_norm": 2.238499402999878, + "learning_rate": 4.0180451412582205e-05, + "loss": 5.7065, + "step": 3032 + }, + { + "epoch": 0.2924783027965284, + "grad_norm": 2.9425671100616455, + "learning_rate": 4.017443310264411e-05, + "loss": 5.381, + "step": 3033 + }, + { + "epoch": 0.29257473481195756, + "grad_norm": 4.3243889808654785, + "learning_rate": 4.016841340001523e-05, + "loss": 5.8269, + "step": 3034 + }, + { + "epoch": 0.2926711668273867, + "grad_norm": 2.3794867992401123, + "learning_rate": 4.016239230524803e-05, + "loss": 5.9943, + "step": 3035 + }, + { + "epoch": 0.29276759884281583, + "grad_norm": 3.271749973297119, + "learning_rate": 4.015636981889513e-05, + "loss": 5.7897, + "step": 3036 + }, + { + "epoch": 0.2928640308582449, + "grad_norm": 4.155463218688965, + "learning_rate": 4.015034594150926e-05, + "loss": 5.9892, + "step": 3037 + }, + { + "epoch": 0.29296046287367405, + "grad_norm": 4.262062072753906, + "learning_rate": 4.0144320673643286e-05, + "loss": 6.086, + "step": 3038 + }, + { + "epoch": 0.2930568948891032, + "grad_norm": 3.851783514022827, + "learning_rate": 4.01382940158502e-05, + "loss": 6.0636, + "step": 3039 + }, + { + "epoch": 0.29315332690453233, + "grad_norm": 3.0588886737823486, + "learning_rate": 4.013226596868313e-05, + "loss": 5.969, + "step": 3040 + }, + { + "epoch": 0.2932497589199614, + "grad_norm": 3.467594623565674, + "learning_rate": 4.01262365326953e-05, + "loss": 6.1908, + "step": 3041 + }, + { + "epoch": 0.29334619093539055, + "grad_norm": 3.609900712966919, + "learning_rate": 4.0120205708440105e-05, + "loss": 6.3347, + "step": 3042 + }, + { + "epoch": 0.2934426229508197, + "grad_norm": 3.7012429237365723, + "learning_rate": 4.0114173496471044e-05, + "loss": 6.2929, + "step": 3043 + }, + { + "epoch": 0.29353905496624877, + "grad_norm": 6.0408854484558105, + "learning_rate": 4.0108139897341734e-05, + "loss": 6.17, + "step": 3044 + }, + { + "epoch": 0.2936354869816779, + "grad_norm": 3.9312891960144043, + "learning_rate": 4.0102104911605956e-05, + "loss": 6.0127, + "step": 3045 + }, + { + "epoch": 0.29373191899710704, + "grad_norm": 4.111096382141113, + "learning_rate": 4.009606853981756e-05, + "loss": 6.0503, + "step": 3046 + }, + { + "epoch": 0.2938283510125362, + "grad_norm": 3.81131649017334, + "learning_rate": 4.0090030782530575e-05, + "loss": 6.0019, + "step": 3047 + }, + { + "epoch": 0.29392478302796526, + "grad_norm": 3.5714402198791504, + "learning_rate": 4.008399164029915e-05, + "loss": 5.9967, + "step": 3048 + }, + { + "epoch": 0.2940212150433944, + "grad_norm": 4.053592681884766, + "learning_rate": 4.0077951113677525e-05, + "loss": 5.34, + "step": 3049 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 2.883660316467285, + "learning_rate": 4.007190920322011e-05, + "loss": 5.9239, + "step": 3050 + }, + { + "epoch": 0.2942140790742527, + "grad_norm": 2.8234128952026367, + "learning_rate": 4.0065865909481417e-05, + "loss": 5.6625, + "step": 3051 + }, + { + "epoch": 0.29431051108968176, + "grad_norm": 2.219249963760376, + "learning_rate": 4.005982123301609e-05, + "loss": 5.7249, + "step": 3052 + }, + { + "epoch": 0.2944069431051109, + "grad_norm": 3.7176098823547363, + "learning_rate": 4.005377517437891e-05, + "loss": 5.4788, + "step": 3053 + }, + { + "epoch": 0.29450337512054003, + "grad_norm": 4.01483154296875, + "learning_rate": 4.004772773412478e-05, + "loss": 5.0069, + "step": 3054 + }, + { + "epoch": 0.2945998071359691, + "grad_norm": 2.4800970554351807, + "learning_rate": 4.00416789128087e-05, + "loss": 5.5625, + "step": 3055 + }, + { + "epoch": 0.29469623915139825, + "grad_norm": 2.644002914428711, + "learning_rate": 4.003562871098586e-05, + "loss": 5.7966, + "step": 3056 + }, + { + "epoch": 0.2947926711668274, + "grad_norm": 2.129859209060669, + "learning_rate": 4.002957712921151e-05, + "loss": 5.9925, + "step": 3057 + }, + { + "epoch": 0.29488910318225653, + "grad_norm": 5.141940116882324, + "learning_rate": 4.0023524168041084e-05, + "loss": 5.9014, + "step": 3058 + }, + { + "epoch": 0.2949855351976856, + "grad_norm": 3.6972155570983887, + "learning_rate": 4.00174698280301e-05, + "loss": 5.8798, + "step": 3059 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 3.063411235809326, + "learning_rate": 4.001141410973421e-05, + "loss": 6.1642, + "step": 3060 + }, + { + "epoch": 0.2951783992285439, + "grad_norm": 1.993366003036499, + "learning_rate": 4.000535701370921e-05, + "loss": 5.9739, + "step": 3061 + }, + { + "epoch": 0.295274831243973, + "grad_norm": 4.032447814941406, + "learning_rate": 3.999929854051101e-05, + "loss": 5.9191, + "step": 3062 + }, + { + "epoch": 0.2953712632594021, + "grad_norm": 2.701646089553833, + "learning_rate": 3.9993238690695655e-05, + "loss": 5.9004, + "step": 3063 + }, + { + "epoch": 0.29546769527483124, + "grad_norm": 2.6417009830474854, + "learning_rate": 3.998717746481931e-05, + "loss": 5.8336, + "step": 3064 + }, + { + "epoch": 0.2955641272902604, + "grad_norm": 3.8518917560577393, + "learning_rate": 3.9981114863438255e-05, + "loss": 5.8812, + "step": 3065 + }, + { + "epoch": 0.29566055930568946, + "grad_norm": 2.6296756267547607, + "learning_rate": 3.997505088710892e-05, + "loss": 6.0558, + "step": 3066 + }, + { + "epoch": 0.2957569913211186, + "grad_norm": 2.6598799228668213, + "learning_rate": 3.996898553638785e-05, + "loss": 6.076, + "step": 3067 + }, + { + "epoch": 0.29585342333654774, + "grad_norm": 2.6068644523620605, + "learning_rate": 3.996291881183171e-05, + "loss": 6.0479, + "step": 3068 + }, + { + "epoch": 0.2959498553519769, + "grad_norm": 2.7932980060577393, + "learning_rate": 3.9956850713997295e-05, + "loss": 6.0393, + "step": 3069 + }, + { + "epoch": 0.29604628736740596, + "grad_norm": 2.40055513381958, + "learning_rate": 3.995078124344152e-05, + "loss": 5.8407, + "step": 3070 + }, + { + "epoch": 0.2961427193828351, + "grad_norm": 2.1961100101470947, + "learning_rate": 3.9944710400721466e-05, + "loss": 5.5902, + "step": 3071 + }, + { + "epoch": 0.29623915139826423, + "grad_norm": 3.9942333698272705, + "learning_rate": 3.993863818639427e-05, + "loss": 5.614, + "step": 3072 + }, + { + "epoch": 0.29633558341369337, + "grad_norm": 4.684289455413818, + "learning_rate": 3.9932564601017255e-05, + "loss": 5.7014, + "step": 3073 + }, + { + "epoch": 0.29643201542912245, + "grad_norm": 4.760704040527344, + "learning_rate": 3.9926489645147836e-05, + "loss": 5.8798, + "step": 3074 + }, + { + "epoch": 0.2965284474445516, + "grad_norm": 4.581045150756836, + "learning_rate": 3.992041331934358e-05, + "loss": 5.8537, + "step": 3075 + }, + { + "epoch": 0.29662487945998073, + "grad_norm": 3.290006399154663, + "learning_rate": 3.991433562416214e-05, + "loss": 6.0906, + "step": 3076 + }, + { + "epoch": 0.2967213114754098, + "grad_norm": 6.388213157653809, + "learning_rate": 3.990825656016134e-05, + "loss": 6.1354, + "step": 3077 + }, + { + "epoch": 0.29681774349083895, + "grad_norm": 4.6247029304504395, + "learning_rate": 3.9902176127899095e-05, + "loss": 6.2652, + "step": 3078 + }, + { + "epoch": 0.2969141755062681, + "grad_norm": 4.623328685760498, + "learning_rate": 3.989609432793347e-05, + "loss": 6.0467, + "step": 3079 + }, + { + "epoch": 0.2970106075216972, + "grad_norm": 3.3086905479431152, + "learning_rate": 3.989001116082264e-05, + "loss": 6.0417, + "step": 3080 + }, + { + "epoch": 0.2971070395371263, + "grad_norm": 3.143657684326172, + "learning_rate": 3.988392662712491e-05, + "loss": 6.1213, + "step": 3081 + }, + { + "epoch": 0.29720347155255544, + "grad_norm": 2.841519594192505, + "learning_rate": 3.987784072739873e-05, + "loss": 6.2249, + "step": 3082 + }, + { + "epoch": 0.2972999035679846, + "grad_norm": 3.0081586837768555, + "learning_rate": 3.987175346220262e-05, + "loss": 5.9577, + "step": 3083 + }, + { + "epoch": 0.2973963355834137, + "grad_norm": 2.365391254425049, + "learning_rate": 3.986566483209529e-05, + "loss": 5.8259, + "step": 3084 + }, + { + "epoch": 0.2974927675988428, + "grad_norm": 2.2780368328094482, + "learning_rate": 3.985957483763554e-05, + "loss": 5.7962, + "step": 3085 + }, + { + "epoch": 0.29758919961427194, + "grad_norm": 2.4774351119995117, + "learning_rate": 3.9853483479382295e-05, + "loss": 6.0256, + "step": 3086 + }, + { + "epoch": 0.2976856316297011, + "grad_norm": 2.900034189224243, + "learning_rate": 3.984739075789462e-05, + "loss": 5.791, + "step": 3087 + }, + { + "epoch": 0.29778206364513016, + "grad_norm": 2.7048916816711426, + "learning_rate": 3.98412966737317e-05, + "loss": 6.2755, + "step": 3088 + }, + { + "epoch": 0.2978784956605593, + "grad_norm": 2.33697509765625, + "learning_rate": 3.983520122745283e-05, + "loss": 6.0739, + "step": 3089 + }, + { + "epoch": 0.29797492767598843, + "grad_norm": 3.061190366744995, + "learning_rate": 3.982910441961746e-05, + "loss": 6.1174, + "step": 3090 + }, + { + "epoch": 0.29807135969141757, + "grad_norm": 3.0562634468078613, + "learning_rate": 3.982300625078513e-05, + "loss": 5.9222, + "step": 3091 + }, + { + "epoch": 0.29816779170684665, + "grad_norm": 2.6993486881256104, + "learning_rate": 3.981690672151554e-05, + "loss": 6.0068, + "step": 3092 + }, + { + "epoch": 0.2982642237222758, + "grad_norm": 2.194938898086548, + "learning_rate": 3.981080583236848e-05, + "loss": 5.8773, + "step": 3093 + }, + { + "epoch": 0.2983606557377049, + "grad_norm": 2.05233097076416, + "learning_rate": 3.980470358390389e-05, + "loss": 5.8078, + "step": 3094 + }, + { + "epoch": 0.29845708775313406, + "grad_norm": 2.072021961212158, + "learning_rate": 3.979859997668182e-05, + "loss": 5.8675, + "step": 3095 + }, + { + "epoch": 0.29855351976856315, + "grad_norm": 2.470069169998169, + "learning_rate": 3.9792495011262464e-05, + "loss": 4.8697, + "step": 3096 + }, + { + "epoch": 0.2986499517839923, + "grad_norm": 2.2118265628814697, + "learning_rate": 3.9786388688206134e-05, + "loss": 4.8032, + "step": 3097 + }, + { + "epoch": 0.2987463837994214, + "grad_norm": 3.5178020000457764, + "learning_rate": 3.978028100807323e-05, + "loss": 5.9716, + "step": 3098 + }, + { + "epoch": 0.2988428158148505, + "grad_norm": 2.6539204120635986, + "learning_rate": 3.977417197142434e-05, + "loss": 5.8928, + "step": 3099 + }, + { + "epoch": 0.29893924783027964, + "grad_norm": 4.616934776306152, + "learning_rate": 3.976806157882012e-05, + "loss": 5.7971, + "step": 3100 + }, + { + "epoch": 0.2990356798457088, + "grad_norm": 3.3652830123901367, + "learning_rate": 3.97619498308214e-05, + "loss": 5.8411, + "step": 3101 + }, + { + "epoch": 0.2991321118611379, + "grad_norm": 3.093714952468872, + "learning_rate": 3.975583672798908e-05, + "loss": 5.7956, + "step": 3102 + }, + { + "epoch": 0.299228543876567, + "grad_norm": 4.377259731292725, + "learning_rate": 3.974972227088423e-05, + "loss": 5.996, + "step": 3103 + }, + { + "epoch": 0.29932497589199614, + "grad_norm": 4.287552356719971, + "learning_rate": 3.9743606460068025e-05, + "loss": 5.9931, + "step": 3104 + }, + { + "epoch": 0.2994214079074253, + "grad_norm": 2.140568733215332, + "learning_rate": 3.973748929610177e-05, + "loss": 5.8772, + "step": 3105 + }, + { + "epoch": 0.2995178399228544, + "grad_norm": 4.387476921081543, + "learning_rate": 3.9731370779546886e-05, + "loss": 5.7656, + "step": 3106 + }, + { + "epoch": 0.2996142719382835, + "grad_norm": 3.0103776454925537, + "learning_rate": 3.9725250910964914e-05, + "loss": 5.8618, + "step": 3107 + }, + { + "epoch": 0.29971070395371263, + "grad_norm": 4.443232536315918, + "learning_rate": 3.971912969091755e-05, + "loss": 5.8085, + "step": 3108 + }, + { + "epoch": 0.29980713596914177, + "grad_norm": 6.172327995300293, + "learning_rate": 3.971300711996657e-05, + "loss": 5.6784, + "step": 3109 + }, + { + "epoch": 0.29990356798457085, + "grad_norm": 5.547931671142578, + "learning_rate": 3.9706883198673907e-05, + "loss": 5.8257, + "step": 3110 + }, + { + "epoch": 0.3, + "grad_norm": 3.947927474975586, + "learning_rate": 3.97007579276016e-05, + "loss": 5.9062, + "step": 3111 + }, + { + "epoch": 0.3000964320154291, + "grad_norm": 4.75912618637085, + "learning_rate": 3.969463130731183e-05, + "loss": 5.94, + "step": 3112 + }, + { + "epoch": 0.30019286403085826, + "grad_norm": 3.7087912559509277, + "learning_rate": 3.968850333836689e-05, + "loss": 5.9607, + "step": 3113 + }, + { + "epoch": 0.30028929604628735, + "grad_norm": 3.5822129249572754, + "learning_rate": 3.9682374021329184e-05, + "loss": 5.9085, + "step": 3114 + }, + { + "epoch": 0.3003857280617165, + "grad_norm": 3.3473961353302, + "learning_rate": 3.967624335676127e-05, + "loss": 5.8792, + "step": 3115 + }, + { + "epoch": 0.3004821600771456, + "grad_norm": 2.5062239170074463, + "learning_rate": 3.967011134522579e-05, + "loss": 5.7872, + "step": 3116 + }, + { + "epoch": 0.30057859209257476, + "grad_norm": 3.3981502056121826, + "learning_rate": 3.966397798728555e-05, + "loss": 6.048, + "step": 3117 + }, + { + "epoch": 0.30067502410800384, + "grad_norm": 3.4163691997528076, + "learning_rate": 3.965784328350347e-05, + "loss": 6.0666, + "step": 3118 + }, + { + "epoch": 0.300771456123433, + "grad_norm": 2.522456169128418, + "learning_rate": 3.965170723444257e-05, + "loss": 5.833, + "step": 3119 + }, + { + "epoch": 0.3008678881388621, + "grad_norm": 2.7990145683288574, + "learning_rate": 3.964556984066601e-05, + "loss": 5.8841, + "step": 3120 + }, + { + "epoch": 0.3009643201542912, + "grad_norm": 4.132803916931152, + "learning_rate": 3.963943110273707e-05, + "loss": 5.5571, + "step": 3121 + }, + { + "epoch": 0.30106075216972034, + "grad_norm": 2.983212471008301, + "learning_rate": 3.963329102121918e-05, + "loss": 5.7988, + "step": 3122 + }, + { + "epoch": 0.3011571841851495, + "grad_norm": 5.207912445068359, + "learning_rate": 3.962714959667583e-05, + "loss": 5.8245, + "step": 3123 + }, + { + "epoch": 0.3012536162005786, + "grad_norm": 5.053807258605957, + "learning_rate": 3.9621006829670705e-05, + "loss": 5.9149, + "step": 3124 + }, + { + "epoch": 0.3013500482160077, + "grad_norm": 4.174320697784424, + "learning_rate": 3.9614862720767564e-05, + "loss": 5.7351, + "step": 3125 + }, + { + "epoch": 0.30144648023143683, + "grad_norm": 2.8446602821350098, + "learning_rate": 3.9608717270530315e-05, + "loss": 5.9175, + "step": 3126 + }, + { + "epoch": 0.30154291224686597, + "grad_norm": 4.920058727264404, + "learning_rate": 3.9602570479522974e-05, + "loss": 5.9889, + "step": 3127 + }, + { + "epoch": 0.3016393442622951, + "grad_norm": 4.444602012634277, + "learning_rate": 3.959642234830968e-05, + "loss": 6.0385, + "step": 3128 + }, + { + "epoch": 0.3017357762777242, + "grad_norm": 2.7776198387145996, + "learning_rate": 3.9590272877454714e-05, + "loss": 5.9103, + "step": 3129 + }, + { + "epoch": 0.3018322082931533, + "grad_norm": 3.647942304611206, + "learning_rate": 3.9584122067522464e-05, + "loss": 5.8422, + "step": 3130 + }, + { + "epoch": 0.30192864030858246, + "grad_norm": 3.9833273887634277, + "learning_rate": 3.957796991907744e-05, + "loss": 5.9594, + "step": 3131 + }, + { + "epoch": 0.30202507232401155, + "grad_norm": 2.1494789123535156, + "learning_rate": 3.957181643268428e-05, + "loss": 5.9562, + "step": 3132 + }, + { + "epoch": 0.3021215043394407, + "grad_norm": 3.3291049003601074, + "learning_rate": 3.956566160890774e-05, + "loss": 6.1597, + "step": 3133 + }, + { + "epoch": 0.3022179363548698, + "grad_norm": 3.3286166191101074, + "learning_rate": 3.9559505448312706e-05, + "loss": 6.0183, + "step": 3134 + }, + { + "epoch": 0.30231436837029896, + "grad_norm": 3.174647092819214, + "learning_rate": 3.955334795146418e-05, + "loss": 5.8109, + "step": 3135 + }, + { + "epoch": 0.30241080038572804, + "grad_norm": 2.9420359134674072, + "learning_rate": 3.9547189118927285e-05, + "loss": 6.0225, + "step": 3136 + }, + { + "epoch": 0.3025072324011572, + "grad_norm": 2.6841089725494385, + "learning_rate": 3.9541028951267287e-05, + "loss": 6.066, + "step": 3137 + }, + { + "epoch": 0.3026036644165863, + "grad_norm": 3.1342597007751465, + "learning_rate": 3.953486744904955e-05, + "loss": 5.8556, + "step": 3138 + }, + { + "epoch": 0.30270009643201545, + "grad_norm": 5.021661281585693, + "learning_rate": 3.952870461283956e-05, + "loss": 6.0595, + "step": 3139 + }, + { + "epoch": 0.30279652844744454, + "grad_norm": 3.952075958251953, + "learning_rate": 3.952254044320295e-05, + "loss": 6.1526, + "step": 3140 + }, + { + "epoch": 0.3028929604628737, + "grad_norm": 2.6090118885040283, + "learning_rate": 3.9516374940705456e-05, + "loss": 6.0913, + "step": 3141 + }, + { + "epoch": 0.3029893924783028, + "grad_norm": 3.3122775554656982, + "learning_rate": 3.951020810591293e-05, + "loss": 6.0843, + "step": 3142 + }, + { + "epoch": 0.3030858244937319, + "grad_norm": 3.4112966060638428, + "learning_rate": 3.950403993939136e-05, + "loss": 5.969, + "step": 3143 + }, + { + "epoch": 0.30318225650916103, + "grad_norm": 2.860302448272705, + "learning_rate": 3.949787044170686e-05, + "loss": 5.9105, + "step": 3144 + }, + { + "epoch": 0.30327868852459017, + "grad_norm": 2.7063660621643066, + "learning_rate": 3.949169961342566e-05, + "loss": 5.9949, + "step": 3145 + }, + { + "epoch": 0.3033751205400193, + "grad_norm": 3.8145101070404053, + "learning_rate": 3.948552745511409e-05, + "loss": 6.0382, + "step": 3146 + }, + { + "epoch": 0.3034715525554484, + "grad_norm": 3.658442258834839, + "learning_rate": 3.947935396733865e-05, + "loss": 5.7684, + "step": 3147 + }, + { + "epoch": 0.3035679845708775, + "grad_norm": 3.5809690952301025, + "learning_rate": 3.947317915066593e-05, + "loss": 6.1099, + "step": 3148 + }, + { + "epoch": 0.30366441658630666, + "grad_norm": 3.1612870693206787, + "learning_rate": 3.946700300566263e-05, + "loss": 6.004, + "step": 3149 + }, + { + "epoch": 0.3037608486017358, + "grad_norm": 3.456150770187378, + "learning_rate": 3.946082553289561e-05, + "loss": 5.9256, + "step": 3150 + }, + { + "epoch": 0.3038572806171649, + "grad_norm": 3.347240924835205, + "learning_rate": 3.945464673293181e-05, + "loss": 5.7911, + "step": 3151 + }, + { + "epoch": 0.303953712632594, + "grad_norm": 3.194305419921875, + "learning_rate": 3.944846660633833e-05, + "loss": 6.0032, + "step": 3152 + }, + { + "epoch": 0.30405014464802316, + "grad_norm": 3.3642451763153076, + "learning_rate": 3.944228515368237e-05, + "loss": 6.0216, + "step": 3153 + }, + { + "epoch": 0.30414657666345224, + "grad_norm": 7.604219913482666, + "learning_rate": 3.943610237553125e-05, + "loss": 5.473, + "step": 3154 + }, + { + "epoch": 0.3042430086788814, + "grad_norm": 4.933492660522461, + "learning_rate": 3.9429918272452426e-05, + "loss": 5.9137, + "step": 3155 + }, + { + "epoch": 0.3043394406943105, + "grad_norm": 3.440952777862549, + "learning_rate": 3.942373284501346e-05, + "loss": 5.7971, + "step": 3156 + }, + { + "epoch": 0.30443587270973965, + "grad_norm": 6.707437992095947, + "learning_rate": 3.9417546093782044e-05, + "loss": 6.0927, + "step": 3157 + }, + { + "epoch": 0.30453230472516873, + "grad_norm": 7.494584560394287, + "learning_rate": 3.9411358019326e-05, + "loss": 5.9331, + "step": 3158 + }, + { + "epoch": 0.30462873674059787, + "grad_norm": 7.044229507446289, + "learning_rate": 3.9405168622213245e-05, + "loss": 5.9297, + "step": 3159 + }, + { + "epoch": 0.304725168756027, + "grad_norm": 3.9547877311706543, + "learning_rate": 3.939897790301185e-05, + "loss": 6.1013, + "step": 3160 + }, + { + "epoch": 0.30482160077145615, + "grad_norm": 2.9565072059631348, + "learning_rate": 3.939278586228998e-05, + "loss": 6.1015, + "step": 3161 + }, + { + "epoch": 0.30491803278688523, + "grad_norm": 2.697701930999756, + "learning_rate": 3.9386592500615945e-05, + "loss": 5.991, + "step": 3162 + }, + { + "epoch": 0.30501446480231437, + "grad_norm": 3.159118890762329, + "learning_rate": 3.938039781855816e-05, + "loss": 6.0251, + "step": 3163 + }, + { + "epoch": 0.3051108968177435, + "grad_norm": 2.635448932647705, + "learning_rate": 3.9374201816685155e-05, + "loss": 5.8856, + "step": 3164 + }, + { + "epoch": 0.3052073288331726, + "grad_norm": 1.9397687911987305, + "learning_rate": 3.9368004495565595e-05, + "loss": 6.0422, + "step": 3165 + }, + { + "epoch": 0.3053037608486017, + "grad_norm": 3.8211143016815186, + "learning_rate": 3.9361805855768276e-05, + "loss": 5.7409, + "step": 3166 + }, + { + "epoch": 0.30540019286403086, + "grad_norm": 2.04115629196167, + "learning_rate": 3.9355605897862093e-05, + "loss": 5.4546, + "step": 3167 + }, + { + "epoch": 0.30549662487946, + "grad_norm": 2.9906086921691895, + "learning_rate": 3.9349404622416056e-05, + "loss": 5.8571, + "step": 3168 + }, + { + "epoch": 0.3055930568948891, + "grad_norm": 2.989257574081421, + "learning_rate": 3.934320202999934e-05, + "loss": 5.9632, + "step": 3169 + }, + { + "epoch": 0.3056894889103182, + "grad_norm": 2.916571855545044, + "learning_rate": 3.933699812118119e-05, + "loss": 6.0374, + "step": 3170 + }, + { + "epoch": 0.30578592092574736, + "grad_norm": 3.0393857955932617, + "learning_rate": 3.9330792896530996e-05, + "loss": 6.2655, + "step": 3171 + }, + { + "epoch": 0.3058823529411765, + "grad_norm": 2.290205955505371, + "learning_rate": 3.932458635661827e-05, + "loss": 5.7422, + "step": 3172 + }, + { + "epoch": 0.3059787849566056, + "grad_norm": 2.5776329040527344, + "learning_rate": 3.931837850201263e-05, + "loss": 6.0826, + "step": 3173 + }, + { + "epoch": 0.3060752169720347, + "grad_norm": 2.2079918384552, + "learning_rate": 3.9312169333283844e-05, + "loss": 6.1166, + "step": 3174 + }, + { + "epoch": 0.30617164898746385, + "grad_norm": 2.745556116104126, + "learning_rate": 3.930595885100177e-05, + "loss": 6.0674, + "step": 3175 + }, + { + "epoch": 0.30626808100289293, + "grad_norm": 2.792916774749756, + "learning_rate": 3.92997470557364e-05, + "loss": 6.038, + "step": 3176 + }, + { + "epoch": 0.30636451301832207, + "grad_norm": 2.381934881210327, + "learning_rate": 3.9293533948057845e-05, + "loss": 5.8208, + "step": 3177 + }, + { + "epoch": 0.3064609450337512, + "grad_norm": 2.924964189529419, + "learning_rate": 3.9287319528536346e-05, + "loss": 5.8919, + "step": 3178 + }, + { + "epoch": 0.30655737704918035, + "grad_norm": 2.3941001892089844, + "learning_rate": 3.928110379774223e-05, + "loss": 5.883, + "step": 3179 + }, + { + "epoch": 0.30665380906460943, + "grad_norm": 2.038952350616455, + "learning_rate": 3.927488675624599e-05, + "loss": 6.1808, + "step": 3180 + }, + { + "epoch": 0.30675024108003857, + "grad_norm": 3.4005188941955566, + "learning_rate": 3.926866840461822e-05, + "loss": 6.077, + "step": 3181 + }, + { + "epoch": 0.3068466730954677, + "grad_norm": 3.0280027389526367, + "learning_rate": 3.926244874342962e-05, + "loss": 5.8723, + "step": 3182 + }, + { + "epoch": 0.30694310511089684, + "grad_norm": 2.8424482345581055, + "learning_rate": 3.925622777325103e-05, + "loss": 5.5375, + "step": 3183 + }, + { + "epoch": 0.3070395371263259, + "grad_norm": 2.748077154159546, + "learning_rate": 3.9250005494653396e-05, + "loss": 5.7447, + "step": 3184 + }, + { + "epoch": 0.30713596914175506, + "grad_norm": 3.1706647872924805, + "learning_rate": 3.924378190820781e-05, + "loss": 5.7201, + "step": 3185 + }, + { + "epoch": 0.3072324011571842, + "grad_norm": 4.719724178314209, + "learning_rate": 3.923755701448544e-05, + "loss": 5.7604, + "step": 3186 + }, + { + "epoch": 0.3073288331726133, + "grad_norm": 4.071478366851807, + "learning_rate": 3.923133081405761e-05, + "loss": 6.0534, + "step": 3187 + }, + { + "epoch": 0.3074252651880424, + "grad_norm": 4.364805698394775, + "learning_rate": 3.922510330749576e-05, + "loss": 5.9008, + "step": 3188 + }, + { + "epoch": 0.30752169720347156, + "grad_norm": 3.698451519012451, + "learning_rate": 3.921887449537143e-05, + "loss": 5.9916, + "step": 3189 + }, + { + "epoch": 0.3076181292189007, + "grad_norm": 3.3803720474243164, + "learning_rate": 3.921264437825631e-05, + "loss": 6.0929, + "step": 3190 + }, + { + "epoch": 0.3077145612343298, + "grad_norm": 2.4773976802825928, + "learning_rate": 3.920641295672217e-05, + "loss": 5.8959, + "step": 3191 + }, + { + "epoch": 0.3078109932497589, + "grad_norm": 3.1648213863372803, + "learning_rate": 3.9200180231340944e-05, + "loss": 6.0782, + "step": 3192 + }, + { + "epoch": 0.30790742526518805, + "grad_norm": 5.380361557006836, + "learning_rate": 3.919394620268465e-05, + "loss": 6.0068, + "step": 3193 + }, + { + "epoch": 0.3080038572806172, + "grad_norm": 5.189523220062256, + "learning_rate": 3.9187710871325444e-05, + "loss": 5.7941, + "step": 3194 + }, + { + "epoch": 0.30810028929604627, + "grad_norm": 4.684502601623535, + "learning_rate": 3.918147423783559e-05, + "loss": 5.976, + "step": 3195 + }, + { + "epoch": 0.3081967213114754, + "grad_norm": 5.6627373695373535, + "learning_rate": 3.9175236302787495e-05, + "loss": 5.9995, + "step": 3196 + }, + { + "epoch": 0.30829315332690455, + "grad_norm": 4.966180324554443, + "learning_rate": 3.916899706675365e-05, + "loss": 5.7128, + "step": 3197 + }, + { + "epoch": 0.3083895853423336, + "grad_norm": 3.754716396331787, + "learning_rate": 3.91627565303067e-05, + "loss": 5.7779, + "step": 3198 + }, + { + "epoch": 0.30848601735776277, + "grad_norm": 3.0923585891723633, + "learning_rate": 3.915651469401939e-05, + "loss": 5.7585, + "step": 3199 + }, + { + "epoch": 0.3085824493731919, + "grad_norm": 4.568547248840332, + "learning_rate": 3.915027155846458e-05, + "loss": 5.8232, + "step": 3200 + }, + { + "epoch": 0.30867888138862104, + "grad_norm": 3.8644769191741943, + "learning_rate": 3.914402712421526e-05, + "loss": 5.7774, + "step": 3201 + }, + { + "epoch": 0.3087753134040501, + "grad_norm": 2.7786262035369873, + "learning_rate": 3.913778139184454e-05, + "loss": 5.8102, + "step": 3202 + }, + { + "epoch": 0.30887174541947926, + "grad_norm": 2.457841634750366, + "learning_rate": 3.913153436192565e-05, + "loss": 5.7515, + "step": 3203 + }, + { + "epoch": 0.3089681774349084, + "grad_norm": 3.2196688652038574, + "learning_rate": 3.9125286035031925e-05, + "loss": 5.7674, + "step": 3204 + }, + { + "epoch": 0.30906460945033754, + "grad_norm": 2.5960123538970947, + "learning_rate": 3.911903641173683e-05, + "loss": 5.696, + "step": 3205 + }, + { + "epoch": 0.3091610414657666, + "grad_norm": 2.403545379638672, + "learning_rate": 3.9112785492613954e-05, + "loss": 5.7588, + "step": 3206 + }, + { + "epoch": 0.30925747348119575, + "grad_norm": 2.3148977756500244, + "learning_rate": 3.9106533278237e-05, + "loss": 5.6647, + "step": 3207 + }, + { + "epoch": 0.3093539054966249, + "grad_norm": 2.810197591781616, + "learning_rate": 3.910027976917978e-05, + "loss": 5.8381, + "step": 3208 + }, + { + "epoch": 0.309450337512054, + "grad_norm": 2.083678722381592, + "learning_rate": 3.9094024966016236e-05, + "loss": 5.9815, + "step": 3209 + }, + { + "epoch": 0.3095467695274831, + "grad_norm": 2.078726291656494, + "learning_rate": 3.908776886932043e-05, + "loss": 6.0105, + "step": 3210 + }, + { + "epoch": 0.30964320154291225, + "grad_norm": 2.6201584339141846, + "learning_rate": 3.908151147966653e-05, + "loss": 5.9396, + "step": 3211 + }, + { + "epoch": 0.3097396335583414, + "grad_norm": 2.0175604820251465, + "learning_rate": 3.9075252797628846e-05, + "loss": 5.8518, + "step": 3212 + }, + { + "epoch": 0.30983606557377047, + "grad_norm": 2.6301233768463135, + "learning_rate": 3.906899282378178e-05, + "loss": 6.0613, + "step": 3213 + }, + { + "epoch": 0.3099324975891996, + "grad_norm": 3.306739568710327, + "learning_rate": 3.9062731558699875e-05, + "loss": 5.9184, + "step": 3214 + }, + { + "epoch": 0.31002892960462874, + "grad_norm": 3.336148500442505, + "learning_rate": 3.9056469002957774e-05, + "loss": 6.02, + "step": 3215 + }, + { + "epoch": 0.3101253616200579, + "grad_norm": 2.6603786945343018, + "learning_rate": 3.9050205157130244e-05, + "loss": 5.8783, + "step": 3216 + }, + { + "epoch": 0.31022179363548696, + "grad_norm": 3.0773887634277344, + "learning_rate": 3.904394002179218e-05, + "loss": 5.9866, + "step": 3217 + }, + { + "epoch": 0.3103182256509161, + "grad_norm": 3.6862986087799072, + "learning_rate": 3.903767359751859e-05, + "loss": 5.7415, + "step": 3218 + }, + { + "epoch": 0.31041465766634524, + "grad_norm": 2.512472629547119, + "learning_rate": 3.903140588488459e-05, + "loss": 5.7036, + "step": 3219 + }, + { + "epoch": 0.3105110896817743, + "grad_norm": 2.8599510192871094, + "learning_rate": 3.902513688446542e-05, + "loss": 5.3631, + "step": 3220 + }, + { + "epoch": 0.31060752169720346, + "grad_norm": 2.6779210567474365, + "learning_rate": 3.9018866596836464e-05, + "loss": 5.371, + "step": 3221 + }, + { + "epoch": 0.3107039537126326, + "grad_norm": 3.726637363433838, + "learning_rate": 3.901259502257317e-05, + "loss": 5.7891, + "step": 3222 + }, + { + "epoch": 0.31080038572806173, + "grad_norm": 4.1398515701293945, + "learning_rate": 3.9006322162251167e-05, + "loss": 5.484, + "step": 3223 + }, + { + "epoch": 0.3108968177434908, + "grad_norm": 3.484829902648926, + "learning_rate": 3.9000048016446145e-05, + "loss": 5.6258, + "step": 3224 + }, + { + "epoch": 0.31099324975891995, + "grad_norm": 4.655187129974365, + "learning_rate": 3.899377258573396e-05, + "loss": 5.7084, + "step": 3225 + }, + { + "epoch": 0.3110896817743491, + "grad_norm": 3.3069324493408203, + "learning_rate": 3.8987495870690535e-05, + "loss": 5.6669, + "step": 3226 + }, + { + "epoch": 0.31118611378977823, + "grad_norm": 2.608201742172241, + "learning_rate": 3.898121787189196e-05, + "loss": 5.8176, + "step": 3227 + }, + { + "epoch": 0.3112825458052073, + "grad_norm": 3.778625249862671, + "learning_rate": 3.8974938589914425e-05, + "loss": 5.8505, + "step": 3228 + }, + { + "epoch": 0.31137897782063645, + "grad_norm": 2.1982505321502686, + "learning_rate": 3.896865802533422e-05, + "loss": 5.9755, + "step": 3229 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 3.736710548400879, + "learning_rate": 3.8962376178727766e-05, + "loss": 5.9487, + "step": 3230 + }, + { + "epoch": 0.31157184185149467, + "grad_norm": 2.9246585369110107, + "learning_rate": 3.895609305067162e-05, + "loss": 5.7893, + "step": 3231 + }, + { + "epoch": 0.3116682738669238, + "grad_norm": 3.969374895095825, + "learning_rate": 3.894980864174244e-05, + "loss": 6.1207, + "step": 3232 + }, + { + "epoch": 0.31176470588235294, + "grad_norm": 3.758080244064331, + "learning_rate": 3.894352295251697e-05, + "loss": 5.9378, + "step": 3233 + }, + { + "epoch": 0.3118611378977821, + "grad_norm": 3.1458852291107178, + "learning_rate": 3.893723598357214e-05, + "loss": 5.6098, + "step": 3234 + }, + { + "epoch": 0.31195756991321116, + "grad_norm": 2.5999131202697754, + "learning_rate": 3.893094773548495e-05, + "loss": 5.3968, + "step": 3235 + }, + { + "epoch": 0.3120540019286403, + "grad_norm": 3.4534597396850586, + "learning_rate": 3.892465820883251e-05, + "loss": 5.9678, + "step": 3236 + }, + { + "epoch": 0.31215043394406944, + "grad_norm": 3.415152072906494, + "learning_rate": 3.891836740419209e-05, + "loss": 5.9241, + "step": 3237 + }, + { + "epoch": 0.3122468659594986, + "grad_norm": 3.161853551864624, + "learning_rate": 3.891207532214103e-05, + "loss": 6.0447, + "step": 3238 + }, + { + "epoch": 0.31234329797492766, + "grad_norm": 2.884063482284546, + "learning_rate": 3.890578196325682e-05, + "loss": 6.0597, + "step": 3239 + }, + { + "epoch": 0.3124397299903568, + "grad_norm": 3.4985005855560303, + "learning_rate": 3.8899487328117064e-05, + "loss": 6.1632, + "step": 3240 + }, + { + "epoch": 0.31253616200578593, + "grad_norm": 2.6127827167510986, + "learning_rate": 3.889319141729946e-05, + "loss": 6.084, + "step": 3241 + }, + { + "epoch": 0.31263259402121507, + "grad_norm": 3.055210590362549, + "learning_rate": 3.8886894231381855e-05, + "loss": 5.986, + "step": 3242 + }, + { + "epoch": 0.31272902603664415, + "grad_norm": 2.5980920791625977, + "learning_rate": 3.888059577094218e-05, + "loss": 6.0299, + "step": 3243 + }, + { + "epoch": 0.3128254580520733, + "grad_norm": 2.616189956665039, + "learning_rate": 3.887429603655853e-05, + "loss": 5.8761, + "step": 3244 + }, + { + "epoch": 0.31292189006750243, + "grad_norm": 4.1597161293029785, + "learning_rate": 3.886799502880905e-05, + "loss": 5.9269, + "step": 3245 + }, + { + "epoch": 0.3130183220829315, + "grad_norm": 4.052279472351074, + "learning_rate": 3.8861692748272056e-05, + "loss": 5.8498, + "step": 3246 + }, + { + "epoch": 0.31311475409836065, + "grad_norm": 2.102712869644165, + "learning_rate": 3.885538919552597e-05, + "loss": 5.9616, + "step": 3247 + }, + { + "epoch": 0.3132111861137898, + "grad_norm": 3.6686019897460938, + "learning_rate": 3.884908437114931e-05, + "loss": 6.2486, + "step": 3248 + }, + { + "epoch": 0.3133076181292189, + "grad_norm": 4.1373162269592285, + "learning_rate": 3.8842778275720724e-05, + "loss": 5.8793, + "step": 3249 + }, + { + "epoch": 0.313404050144648, + "grad_norm": 2.376222610473633, + "learning_rate": 3.8836470909819e-05, + "loss": 5.9296, + "step": 3250 + }, + { + "epoch": 0.31350048216007714, + "grad_norm": 2.185614585876465, + "learning_rate": 3.8830162274022994e-05, + "loss": 5.8257, + "step": 3251 + }, + { + "epoch": 0.3135969141755063, + "grad_norm": 3.5288186073303223, + "learning_rate": 3.882385236891173e-05, + "loss": 6.0691, + "step": 3252 + }, + { + "epoch": 0.3136933461909354, + "grad_norm": 3.230849027633667, + "learning_rate": 3.88175411950643e-05, + "loss": 5.8638, + "step": 3253 + }, + { + "epoch": 0.3137897782063645, + "grad_norm": 2.766366720199585, + "learning_rate": 3.881122875305995e-05, + "loss": 5.6335, + "step": 3254 + }, + { + "epoch": 0.31388621022179364, + "grad_norm": 2.7666144371032715, + "learning_rate": 3.8804915043478024e-05, + "loss": 5.668, + "step": 3255 + }, + { + "epoch": 0.3139826422372228, + "grad_norm": 3.33162784576416, + "learning_rate": 3.879860006689798e-05, + "loss": 5.6151, + "step": 3256 + }, + { + "epoch": 0.31407907425265186, + "grad_norm": 3.0793583393096924, + "learning_rate": 3.879228382389941e-05, + "loss": 6.2592, + "step": 3257 + }, + { + "epoch": 0.314175506268081, + "grad_norm": 3.240081787109375, + "learning_rate": 3.8785966315062014e-05, + "loss": 5.6874, + "step": 3258 + }, + { + "epoch": 0.31427193828351013, + "grad_norm": 2.1868159770965576, + "learning_rate": 3.8779647540965594e-05, + "loss": 5.7521, + "step": 3259 + }, + { + "epoch": 0.31436837029893927, + "grad_norm": 3.399886131286621, + "learning_rate": 3.877332750219007e-05, + "loss": 5.5649, + "step": 3260 + }, + { + "epoch": 0.31446480231436835, + "grad_norm": 3.061436176300049, + "learning_rate": 3.8767006199315506e-05, + "loss": 5.4791, + "step": 3261 + }, + { + "epoch": 0.3145612343297975, + "grad_norm": 2.169909715652466, + "learning_rate": 3.876068363292207e-05, + "loss": 5.6573, + "step": 3262 + }, + { + "epoch": 0.3146576663452266, + "grad_norm": 2.716198205947876, + "learning_rate": 3.8754359803590014e-05, + "loss": 6.1205, + "step": 3263 + }, + { + "epoch": 0.31475409836065577, + "grad_norm": 2.397146463394165, + "learning_rate": 3.874803471189974e-05, + "loss": 6.084, + "step": 3264 + }, + { + "epoch": 0.31485053037608485, + "grad_norm": 2.68815541267395, + "learning_rate": 3.8741708358431774e-05, + "loss": 5.8105, + "step": 3265 + }, + { + "epoch": 0.314946962391514, + "grad_norm": 2.6849803924560547, + "learning_rate": 3.873538074376672e-05, + "loss": 6.0133, + "step": 3266 + }, + { + "epoch": 0.3150433944069431, + "grad_norm": 2.184983968734741, + "learning_rate": 3.872905186848533e-05, + "loss": 5.8482, + "step": 3267 + }, + { + "epoch": 0.3151398264223722, + "grad_norm": 3.728670597076416, + "learning_rate": 3.8722721733168455e-05, + "loss": 5.5802, + "step": 3268 + }, + { + "epoch": 0.31523625843780134, + "grad_norm": 2.669064521789551, + "learning_rate": 3.8716390338397076e-05, + "loss": 5.6433, + "step": 3269 + }, + { + "epoch": 0.3153326904532305, + "grad_norm": 3.2996881008148193, + "learning_rate": 3.8710057684752266e-05, + "loss": 5.8996, + "step": 3270 + }, + { + "epoch": 0.3154291224686596, + "grad_norm": 3.333810806274414, + "learning_rate": 3.870372377281524e-05, + "loss": 6.0167, + "step": 3271 + }, + { + "epoch": 0.3155255544840887, + "grad_norm": 2.132932186126709, + "learning_rate": 3.8697388603167304e-05, + "loss": 5.8726, + "step": 3272 + }, + { + "epoch": 0.31562198649951784, + "grad_norm": 2.7255783081054688, + "learning_rate": 3.869105217638991e-05, + "loss": 5.8922, + "step": 3273 + }, + { + "epoch": 0.315718418514947, + "grad_norm": 2.765026092529297, + "learning_rate": 3.86847144930646e-05, + "loss": 6.0078, + "step": 3274 + }, + { + "epoch": 0.3158148505303761, + "grad_norm": 2.3352627754211426, + "learning_rate": 3.867837555377303e-05, + "loss": 6.0278, + "step": 3275 + }, + { + "epoch": 0.3159112825458052, + "grad_norm": 2.093775510787964, + "learning_rate": 3.867203535909699e-05, + "loss": 5.966, + "step": 3276 + }, + { + "epoch": 0.31600771456123433, + "grad_norm": 2.6044986248016357, + "learning_rate": 3.8665693909618374e-05, + "loss": 5.9076, + "step": 3277 + }, + { + "epoch": 0.31610414657666347, + "grad_norm": 2.2412829399108887, + "learning_rate": 3.865935120591919e-05, + "loss": 5.7324, + "step": 3278 + }, + { + "epoch": 0.31620057859209255, + "grad_norm": 2.8519809246063232, + "learning_rate": 3.865300724858156e-05, + "loss": 5.1473, + "step": 3279 + }, + { + "epoch": 0.3162970106075217, + "grad_norm": 3.5648269653320312, + "learning_rate": 3.864666203818773e-05, + "loss": 5.5608, + "step": 3280 + }, + { + "epoch": 0.3163934426229508, + "grad_norm": 3.8974828720092773, + "learning_rate": 3.864031557532006e-05, + "loss": 5.9194, + "step": 3281 + }, + { + "epoch": 0.31648987463837996, + "grad_norm": 1.9154852628707886, + "learning_rate": 3.863396786056102e-05, + "loss": 5.7768, + "step": 3282 + }, + { + "epoch": 0.31658630665380905, + "grad_norm": 3.3607144355773926, + "learning_rate": 3.862761889449317e-05, + "loss": 5.7559, + "step": 3283 + }, + { + "epoch": 0.3166827386692382, + "grad_norm": 3.7723546028137207, + "learning_rate": 3.8621268677699245e-05, + "loss": 5.8421, + "step": 3284 + }, + { + "epoch": 0.3167791706846673, + "grad_norm": 2.090240478515625, + "learning_rate": 3.8614917210762047e-05, + "loss": 5.7984, + "step": 3285 + }, + { + "epoch": 0.31687560270009646, + "grad_norm": 2.4092440605163574, + "learning_rate": 3.86085644942645e-05, + "loss": 5.8128, + "step": 3286 + }, + { + "epoch": 0.31697203471552554, + "grad_norm": 3.1836397647857666, + "learning_rate": 3.8602210528789654e-05, + "loss": 5.859, + "step": 3287 + }, + { + "epoch": 0.3170684667309547, + "grad_norm": 3.392986536026001, + "learning_rate": 3.859585531492067e-05, + "loss": 5.6282, + "step": 3288 + }, + { + "epoch": 0.3171648987463838, + "grad_norm": 2.2846248149871826, + "learning_rate": 3.858949885324081e-05, + "loss": 5.3999, + "step": 3289 + }, + { + "epoch": 0.3172613307618129, + "grad_norm": 3.1389408111572266, + "learning_rate": 3.858314114433348e-05, + "loss": 5.8435, + "step": 3290 + }, + { + "epoch": 0.31735776277724204, + "grad_norm": 2.2691993713378906, + "learning_rate": 3.8576782188782174e-05, + "loss": 6.0448, + "step": 3291 + }, + { + "epoch": 0.3174541947926712, + "grad_norm": 2.4538779258728027, + "learning_rate": 3.85704219871705e-05, + "loss": 5.9919, + "step": 3292 + }, + { + "epoch": 0.3175506268081003, + "grad_norm": 3.411463737487793, + "learning_rate": 3.856406054008221e-05, + "loss": 5.978, + "step": 3293 + }, + { + "epoch": 0.3176470588235294, + "grad_norm": 2.539898157119751, + "learning_rate": 3.855769784810114e-05, + "loss": 5.8991, + "step": 3294 + }, + { + "epoch": 0.31774349083895853, + "grad_norm": 2.755080223083496, + "learning_rate": 3.855133391181124e-05, + "loss": 5.8453, + "step": 3295 + }, + { + "epoch": 0.31783992285438767, + "grad_norm": 2.636625289916992, + "learning_rate": 3.854496873179659e-05, + "loss": 5.8351, + "step": 3296 + }, + { + "epoch": 0.3179363548698168, + "grad_norm": 2.4064550399780273, + "learning_rate": 3.853860230864139e-05, + "loss": 5.807, + "step": 3297 + }, + { + "epoch": 0.3180327868852459, + "grad_norm": 3.657604217529297, + "learning_rate": 3.853223464292992e-05, + "loss": 5.7873, + "step": 3298 + }, + { + "epoch": 0.318129218900675, + "grad_norm": 3.4392971992492676, + "learning_rate": 3.852586573524662e-05, + "loss": 5.8493, + "step": 3299 + }, + { + "epoch": 0.31822565091610416, + "grad_norm": 2.3869616985321045, + "learning_rate": 3.8519495586176014e-05, + "loss": 5.9331, + "step": 3300 + }, + { + "epoch": 0.31832208293153325, + "grad_norm": 2.3812954425811768, + "learning_rate": 3.851312419630273e-05, + "loss": 5.3608, + "step": 3301 + }, + { + "epoch": 0.3184185149469624, + "grad_norm": 2.658649206161499, + "learning_rate": 3.8506751566211555e-05, + "loss": 6.0144, + "step": 3302 + }, + { + "epoch": 0.3185149469623915, + "grad_norm": 2.9149248600006104, + "learning_rate": 3.8500377696487334e-05, + "loss": 5.9155, + "step": 3303 + }, + { + "epoch": 0.31861137897782066, + "grad_norm": 1.893820881843567, + "learning_rate": 3.849400258771507e-05, + "loss": 6.097, + "step": 3304 + }, + { + "epoch": 0.31870781099324974, + "grad_norm": 2.0067219734191895, + "learning_rate": 3.848762624047985e-05, + "loss": 6.0537, + "step": 3305 + }, + { + "epoch": 0.3188042430086789, + "grad_norm": 2.3595242500305176, + "learning_rate": 3.8481248655366906e-05, + "loss": 6.0819, + "step": 3306 + }, + { + "epoch": 0.318900675024108, + "grad_norm": 2.758989095687866, + "learning_rate": 3.847486983296155e-05, + "loss": 6.1764, + "step": 3307 + }, + { + "epoch": 0.31899710703953715, + "grad_norm": 2.6762256622314453, + "learning_rate": 3.8468489773849215e-05, + "loss": 5.8599, + "step": 3308 + }, + { + "epoch": 0.31909353905496624, + "grad_norm": 3.110621929168701, + "learning_rate": 3.846210847861547e-05, + "loss": 5.9965, + "step": 3309 + }, + { + "epoch": 0.3191899710703954, + "grad_norm": 1.9111173152923584, + "learning_rate": 3.845572594784599e-05, + "loss": 5.6256, + "step": 3310 + }, + { + "epoch": 0.3192864030858245, + "grad_norm": 2.871267080307007, + "learning_rate": 3.8449342182126523e-05, + "loss": 5.8676, + "step": 3311 + }, + { + "epoch": 0.3193828351012536, + "grad_norm": 3.223795175552368, + "learning_rate": 3.8442957182043e-05, + "loss": 6.0173, + "step": 3312 + }, + { + "epoch": 0.31947926711668273, + "grad_norm": 1.816216230392456, + "learning_rate": 3.843657094818141e-05, + "loss": 6.0014, + "step": 3313 + }, + { + "epoch": 0.31957569913211187, + "grad_norm": 2.916996717453003, + "learning_rate": 3.843018348112787e-05, + "loss": 6.0655, + "step": 3314 + }, + { + "epoch": 0.319672131147541, + "grad_norm": 2.9443113803863525, + "learning_rate": 3.8423794781468626e-05, + "loss": 6.0285, + "step": 3315 + }, + { + "epoch": 0.3197685631629701, + "grad_norm": 2.70186710357666, + "learning_rate": 3.841740484979002e-05, + "loss": 5.9338, + "step": 3316 + }, + { + "epoch": 0.3198649951783992, + "grad_norm": 3.9520456790924072, + "learning_rate": 3.841101368667851e-05, + "loss": 5.5606, + "step": 3317 + }, + { + "epoch": 0.31996142719382836, + "grad_norm": 2.1204323768615723, + "learning_rate": 3.840462129272067e-05, + "loss": 5.4247, + "step": 3318 + }, + { + "epoch": 0.3200578592092575, + "grad_norm": 3.834857940673828, + "learning_rate": 3.839822766850317e-05, + "loss": 5.6608, + "step": 3319 + }, + { + "epoch": 0.3201542912246866, + "grad_norm": 3.3942346572875977, + "learning_rate": 3.8391832814612844e-05, + "loss": 5.7597, + "step": 3320 + }, + { + "epoch": 0.3202507232401157, + "grad_norm": 2.9308254718780518, + "learning_rate": 3.838543673163658e-05, + "loss": 5.7761, + "step": 3321 + }, + { + "epoch": 0.32034715525554486, + "grad_norm": 3.6958115100860596, + "learning_rate": 3.83790394201614e-05, + "loss": 5.959, + "step": 3322 + }, + { + "epoch": 0.32044358727097394, + "grad_norm": 3.126063108444214, + "learning_rate": 3.837264088077446e-05, + "loss": 5.7323, + "step": 3323 + }, + { + "epoch": 0.3205400192864031, + "grad_norm": 3.3346996307373047, + "learning_rate": 3.836624111406299e-05, + "loss": 5.5397, + "step": 3324 + }, + { + "epoch": 0.3206364513018322, + "grad_norm": 3.1548304557800293, + "learning_rate": 3.8359840120614376e-05, + "loss": 5.677, + "step": 3325 + }, + { + "epoch": 0.32073288331726135, + "grad_norm": 3.8403093814849854, + "learning_rate": 3.8353437901016064e-05, + "loss": 5.9089, + "step": 3326 + }, + { + "epoch": 0.32082931533269043, + "grad_norm": 2.813530206680298, + "learning_rate": 3.834703445585567e-05, + "loss": 5.9025, + "step": 3327 + }, + { + "epoch": 0.32092574734811957, + "grad_norm": 3.2847111225128174, + "learning_rate": 3.834062978572087e-05, + "loss": 5.9699, + "step": 3328 + }, + { + "epoch": 0.3210221793635487, + "grad_norm": 3.7716290950775146, + "learning_rate": 3.8334223891199494e-05, + "loss": 5.9706, + "step": 3329 + }, + { + "epoch": 0.32111861137897785, + "grad_norm": 5.761679172515869, + "learning_rate": 3.8327816772879454e-05, + "loss": 5.539, + "step": 3330 + }, + { + "epoch": 0.32121504339440693, + "grad_norm": 3.4514639377593994, + "learning_rate": 3.83214084313488e-05, + "loss": 5.5021, + "step": 3331 + }, + { + "epoch": 0.32131147540983607, + "grad_norm": 5.306517124176025, + "learning_rate": 3.831499886719568e-05, + "loss": 5.9255, + "step": 3332 + }, + { + "epoch": 0.3214079074252652, + "grad_norm": 5.241827011108398, + "learning_rate": 3.830858808100834e-05, + "loss": 5.8168, + "step": 3333 + }, + { + "epoch": 0.3215043394406943, + "grad_norm": 6.087399005889893, + "learning_rate": 3.8302176073375186e-05, + "loss": 5.7473, + "step": 3334 + }, + { + "epoch": 0.3216007714561234, + "grad_norm": 4.385277271270752, + "learning_rate": 3.829576284488468e-05, + "loss": 5.5953, + "step": 3335 + }, + { + "epoch": 0.32169720347155256, + "grad_norm": 6.868374824523926, + "learning_rate": 3.8289348396125404e-05, + "loss": 5.5864, + "step": 3336 + }, + { + "epoch": 0.3217936354869817, + "grad_norm": 7.987143516540527, + "learning_rate": 3.8282932727686107e-05, + "loss": 5.9222, + "step": 3337 + }, + { + "epoch": 0.3218900675024108, + "grad_norm": 5.683267593383789, + "learning_rate": 3.827651584015558e-05, + "loss": 5.9455, + "step": 3338 + }, + { + "epoch": 0.3219864995178399, + "grad_norm": 3.497702121734619, + "learning_rate": 3.827009773412278e-05, + "loss": 6.0128, + "step": 3339 + }, + { + "epoch": 0.32208293153326906, + "grad_norm": 7.3820672035217285, + "learning_rate": 3.826367841017674e-05, + "loss": 5.9886, + "step": 3340 + }, + { + "epoch": 0.3221793635486982, + "grad_norm": 7.064610004425049, + "learning_rate": 3.8257257868906624e-05, + "loss": 6.1855, + "step": 3341 + }, + { + "epoch": 0.3222757955641273, + "grad_norm": 6.375041961669922, + "learning_rate": 3.8250836110901687e-05, + "loss": 6.1523, + "step": 3342 + }, + { + "epoch": 0.3223722275795564, + "grad_norm": 3.1470186710357666, + "learning_rate": 3.824441313675133e-05, + "loss": 6.0409, + "step": 3343 + }, + { + "epoch": 0.32246865959498555, + "grad_norm": 3.9517626762390137, + "learning_rate": 3.823798894704503e-05, + "loss": 5.9739, + "step": 3344 + }, + { + "epoch": 0.32256509161041463, + "grad_norm": 7.934202671051025, + "learning_rate": 3.823156354237241e-05, + "loss": 5.9956, + "step": 3345 + }, + { + "epoch": 0.32266152362584377, + "grad_norm": 5.547542572021484, + "learning_rate": 3.822513692332317e-05, + "loss": 6.178, + "step": 3346 + }, + { + "epoch": 0.3227579556412729, + "grad_norm": 3.153993844985962, + "learning_rate": 3.821870909048713e-05, + "loss": 5.7033, + "step": 3347 + }, + { + "epoch": 0.32285438765670205, + "grad_norm": 3.478752374649048, + "learning_rate": 3.8212280044454246e-05, + "loss": 5.8183, + "step": 3348 + }, + { + "epoch": 0.32295081967213113, + "grad_norm": 3.8147974014282227, + "learning_rate": 3.820584978581456e-05, + "loss": 5.9088, + "step": 3349 + }, + { + "epoch": 0.32304725168756027, + "grad_norm": 2.232541084289551, + "learning_rate": 3.819941831515824e-05, + "loss": 5.8068, + "step": 3350 + }, + { + "epoch": 0.3231436837029894, + "grad_norm": 2.766420602798462, + "learning_rate": 3.819298563307555e-05, + "loss": 5.8018, + "step": 3351 + }, + { + "epoch": 0.32324011571841854, + "grad_norm": 3.000338077545166, + "learning_rate": 3.8186551740156875e-05, + "loss": 5.8009, + "step": 3352 + }, + { + "epoch": 0.3233365477338476, + "grad_norm": 1.741951823234558, + "learning_rate": 3.8180116636992704e-05, + "loss": 5.8507, + "step": 3353 + }, + { + "epoch": 0.32343297974927676, + "grad_norm": 1.9154473543167114, + "learning_rate": 3.817368032417367e-05, + "loss": 5.8376, + "step": 3354 + }, + { + "epoch": 0.3235294117647059, + "grad_norm": 2.8419930934906006, + "learning_rate": 3.816724280229045e-05, + "loss": 5.1749, + "step": 3355 + }, + { + "epoch": 0.323625843780135, + "grad_norm": 3.2232213020324707, + "learning_rate": 3.81608040719339e-05, + "loss": 5.5141, + "step": 3356 + }, + { + "epoch": 0.3237222757955641, + "grad_norm": 4.161709785461426, + "learning_rate": 3.815436413369495e-05, + "loss": 5.8728, + "step": 3357 + }, + { + "epoch": 0.32381870781099326, + "grad_norm": 2.746968984603882, + "learning_rate": 3.814792298816465e-05, + "loss": 5.732, + "step": 3358 + }, + { + "epoch": 0.3239151398264224, + "grad_norm": 3.4565157890319824, + "learning_rate": 3.8141480635934155e-05, + "loss": 5.7448, + "step": 3359 + }, + { + "epoch": 0.3240115718418515, + "grad_norm": 3.5280375480651855, + "learning_rate": 3.813503707759475e-05, + "loss": 5.832, + "step": 3360 + }, + { + "epoch": 0.3241080038572806, + "grad_norm": 3.203831672668457, + "learning_rate": 3.81285923137378e-05, + "loss": 5.9505, + "step": 3361 + }, + { + "epoch": 0.32420443587270975, + "grad_norm": 3.5697171688079834, + "learning_rate": 3.8122146344954816e-05, + "loss": 5.9455, + "step": 3362 + }, + { + "epoch": 0.3243008678881389, + "grad_norm": 3.476818799972534, + "learning_rate": 3.8115699171837387e-05, + "loss": 5.9914, + "step": 3363 + }, + { + "epoch": 0.32439729990356797, + "grad_norm": 3.2167458534240723, + "learning_rate": 3.810925079497723e-05, + "loss": 5.8366, + "step": 3364 + }, + { + "epoch": 0.3244937319189971, + "grad_norm": 3.1837916374206543, + "learning_rate": 3.8102801214966174e-05, + "loss": 5.8494, + "step": 3365 + }, + { + "epoch": 0.32459016393442625, + "grad_norm": 3.6710031032562256, + "learning_rate": 3.8096350432396144e-05, + "loss": 5.9219, + "step": 3366 + }, + { + "epoch": 0.32468659594985533, + "grad_norm": 4.30075216293335, + "learning_rate": 3.8089898447859195e-05, + "loss": 5.8699, + "step": 3367 + }, + { + "epoch": 0.32478302796528447, + "grad_norm": 3.5353212356567383, + "learning_rate": 3.808344526194748e-05, + "loss": 5.7824, + "step": 3368 + }, + { + "epoch": 0.3248794599807136, + "grad_norm": 4.641177654266357, + "learning_rate": 3.8076990875253253e-05, + "loss": 5.9617, + "step": 3369 + }, + { + "epoch": 0.32497589199614274, + "grad_norm": 3.2233870029449463, + "learning_rate": 3.80705352883689e-05, + "loss": 5.6877, + "step": 3370 + }, + { + "epoch": 0.3250723240115718, + "grad_norm": 2.7355587482452393, + "learning_rate": 3.806407850188691e-05, + "loss": 5.8294, + "step": 3371 + }, + { + "epoch": 0.32516875602700096, + "grad_norm": 2.9650909900665283, + "learning_rate": 3.805762051639987e-05, + "loss": 5.9009, + "step": 3372 + }, + { + "epoch": 0.3252651880424301, + "grad_norm": 2.4856810569763184, + "learning_rate": 3.80511613325005e-05, + "loss": 5.987, + "step": 3373 + }, + { + "epoch": 0.32536162005785924, + "grad_norm": 2.791654109954834, + "learning_rate": 3.80447009507816e-05, + "loss": 6.0686, + "step": 3374 + }, + { + "epoch": 0.3254580520732883, + "grad_norm": 2.0695078372955322, + "learning_rate": 3.803823937183609e-05, + "loss": 5.9431, + "step": 3375 + }, + { + "epoch": 0.32555448408871746, + "grad_norm": 2.262892246246338, + "learning_rate": 3.803177659625704e-05, + "loss": 5.87, + "step": 3376 + }, + { + "epoch": 0.3256509161041466, + "grad_norm": 1.9678587913513184, + "learning_rate": 3.802531262463756e-05, + "loss": 5.9061, + "step": 3377 + }, + { + "epoch": 0.3257473481195757, + "grad_norm": 2.083035945892334, + "learning_rate": 3.801884745757092e-05, + "loss": 5.8308, + "step": 3378 + }, + { + "epoch": 0.3258437801350048, + "grad_norm": 2.478466272354126, + "learning_rate": 3.801238109565049e-05, + "loss": 5.8808, + "step": 3379 + }, + { + "epoch": 0.32594021215043395, + "grad_norm": 2.125162124633789, + "learning_rate": 3.800591353946973e-05, + "loss": 5.8674, + "step": 3380 + }, + { + "epoch": 0.3260366441658631, + "grad_norm": 2.4082188606262207, + "learning_rate": 3.799944478962224e-05, + "loss": 5.8497, + "step": 3381 + }, + { + "epoch": 0.32613307618129217, + "grad_norm": 1.9768050909042358, + "learning_rate": 3.79929748467017e-05, + "loss": 5.8971, + "step": 3382 + }, + { + "epoch": 0.3262295081967213, + "grad_norm": 2.329263925552368, + "learning_rate": 3.798650371130192e-05, + "loss": 5.8844, + "step": 3383 + }, + { + "epoch": 0.32632594021215044, + "grad_norm": 1.9914339780807495, + "learning_rate": 3.7980031384016824e-05, + "loss": 5.8345, + "step": 3384 + }, + { + "epoch": 0.3264223722275796, + "grad_norm": 3.0281882286071777, + "learning_rate": 3.797355786544041e-05, + "loss": 5.6292, + "step": 3385 + }, + { + "epoch": 0.32651880424300866, + "grad_norm": 3.2686803340911865, + "learning_rate": 3.796708315616683e-05, + "loss": 5.6058, + "step": 3386 + }, + { + "epoch": 0.3266152362584378, + "grad_norm": 2.5869812965393066, + "learning_rate": 3.7960607256790326e-05, + "loss": 5.6591, + "step": 3387 + }, + { + "epoch": 0.32671166827386694, + "grad_norm": 3.210618734359741, + "learning_rate": 3.795413016790523e-05, + "loss": 5.7599, + "step": 3388 + }, + { + "epoch": 0.326808100289296, + "grad_norm": 3.859790325164795, + "learning_rate": 3.794765189010601e-05, + "loss": 5.8968, + "step": 3389 + }, + { + "epoch": 0.32690453230472516, + "grad_norm": 2.861849069595337, + "learning_rate": 3.794117242398724e-05, + "loss": 5.9798, + "step": 3390 + }, + { + "epoch": 0.3270009643201543, + "grad_norm": 3.2939929962158203, + "learning_rate": 3.7934691770143596e-05, + "loss": 5.9198, + "step": 3391 + }, + { + "epoch": 0.32709739633558343, + "grad_norm": 2.4950473308563232, + "learning_rate": 3.792820992916985e-05, + "loss": 5.8461, + "step": 3392 + }, + { + "epoch": 0.3271938283510125, + "grad_norm": 3.2421255111694336, + "learning_rate": 3.792172690166091e-05, + "loss": 5.9301, + "step": 3393 + }, + { + "epoch": 0.32729026036644165, + "grad_norm": 2.2942376136779785, + "learning_rate": 3.79152426882118e-05, + "loss": 5.9736, + "step": 3394 + }, + { + "epoch": 0.3273866923818708, + "grad_norm": 1.740189790725708, + "learning_rate": 3.790875728941759e-05, + "loss": 5.8877, + "step": 3395 + }, + { + "epoch": 0.32748312439729993, + "grad_norm": 3.9561140537261963, + "learning_rate": 3.7902270705873534e-05, + "loss": 5.9988, + "step": 3396 + }, + { + "epoch": 0.327579556412729, + "grad_norm": 2.22383451461792, + "learning_rate": 3.789578293817495e-05, + "loss": 5.8394, + "step": 3397 + }, + { + "epoch": 0.32767598842815815, + "grad_norm": 3.188727855682373, + "learning_rate": 3.788929398691728e-05, + "loss": 5.9005, + "step": 3398 + }, + { + "epoch": 0.3277724204435873, + "grad_norm": 2.9418728351593018, + "learning_rate": 3.788280385269607e-05, + "loss": 5.9058, + "step": 3399 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 2.5485541820526123, + "learning_rate": 3.787631253610698e-05, + "loss": 5.8343, + "step": 3400 + }, + { + "epoch": 0.3279652844744455, + "grad_norm": 2.3064303398132324, + "learning_rate": 3.7869820037745776e-05, + "loss": 5.8379, + "step": 3401 + }, + { + "epoch": 0.32806171648987464, + "grad_norm": 2.8621926307678223, + "learning_rate": 3.786332635820832e-05, + "loss": 5.7839, + "step": 3402 + }, + { + "epoch": 0.3281581485053038, + "grad_norm": 4.1466264724731445, + "learning_rate": 3.7856831498090614e-05, + "loss": 5.9188, + "step": 3403 + }, + { + "epoch": 0.32825458052073286, + "grad_norm": 3.9696836471557617, + "learning_rate": 3.785033545798873e-05, + "loss": 5.9123, + "step": 3404 + }, + { + "epoch": 0.328351012536162, + "grad_norm": 2.6896040439605713, + "learning_rate": 3.784383823849887e-05, + "loss": 5.8758, + "step": 3405 + }, + { + "epoch": 0.32844744455159114, + "grad_norm": 4.1645402908325195, + "learning_rate": 3.783733984021736e-05, + "loss": 5.6425, + "step": 3406 + }, + { + "epoch": 0.3285438765670203, + "grad_norm": 5.741302013397217, + "learning_rate": 3.783084026374059e-05, + "loss": 5.7832, + "step": 3407 + }, + { + "epoch": 0.32864030858244936, + "grad_norm": 4.098703861236572, + "learning_rate": 3.7824339509665094e-05, + "loss": 5.8765, + "step": 3408 + }, + { + "epoch": 0.3287367405978785, + "grad_norm": 4.27393102645874, + "learning_rate": 3.78178375785875e-05, + "loss": 5.7462, + "step": 3409 + }, + { + "epoch": 0.32883317261330763, + "grad_norm": 5.5370707511901855, + "learning_rate": 3.7811334471104555e-05, + "loss": 5.7472, + "step": 3410 + }, + { + "epoch": 0.3289296046287367, + "grad_norm": 2.9103643894195557, + "learning_rate": 3.780483018781309e-05, + "loss": 6.1305, + "step": 3411 + }, + { + "epoch": 0.32902603664416585, + "grad_norm": 2.766334295272827, + "learning_rate": 3.7798324729310085e-05, + "loss": 5.8636, + "step": 3412 + }, + { + "epoch": 0.329122468659595, + "grad_norm": 3.2492988109588623, + "learning_rate": 3.779181809619258e-05, + "loss": 5.5746, + "step": 3413 + }, + { + "epoch": 0.32921890067502413, + "grad_norm": 3.0456392765045166, + "learning_rate": 3.778531028905775e-05, + "loss": 5.3625, + "step": 3414 + }, + { + "epoch": 0.3293153326904532, + "grad_norm": 2.688007116317749, + "learning_rate": 3.7778801308502885e-05, + "loss": 5.2818, + "step": 3415 + }, + { + "epoch": 0.32941176470588235, + "grad_norm": 4.901793003082275, + "learning_rate": 3.777229115512536e-05, + "loss": 5.8511, + "step": 3416 + }, + { + "epoch": 0.3295081967213115, + "grad_norm": 3.864717721939087, + "learning_rate": 3.7765779829522675e-05, + "loss": 5.268, + "step": 3417 + }, + { + "epoch": 0.3296046287367406, + "grad_norm": 3.1484501361846924, + "learning_rate": 3.7759267332292434e-05, + "loss": 5.7575, + "step": 3418 + }, + { + "epoch": 0.3297010607521697, + "grad_norm": 2.653043270111084, + "learning_rate": 3.775275366403233e-05, + "loss": 6.066, + "step": 3419 + }, + { + "epoch": 0.32979749276759884, + "grad_norm": 2.6725385189056396, + "learning_rate": 3.77462388253402e-05, + "loss": 6.2546, + "step": 3420 + }, + { + "epoch": 0.329893924783028, + "grad_norm": 3.6591482162475586, + "learning_rate": 3.773972281681396e-05, + "loss": 5.4164, + "step": 3421 + }, + { + "epoch": 0.32999035679845706, + "grad_norm": 4.08371114730835, + "learning_rate": 3.773320563905163e-05, + "loss": 5.5607, + "step": 3422 + }, + { + "epoch": 0.3300867888138862, + "grad_norm": 3.3361175060272217, + "learning_rate": 3.772668729265138e-05, + "loss": 5.7493, + "step": 3423 + }, + { + "epoch": 0.33018322082931534, + "grad_norm": 4.1716742515563965, + "learning_rate": 3.772016777821141e-05, + "loss": 5.5629, + "step": 3424 + }, + { + "epoch": 0.3302796528447445, + "grad_norm": 2.993905782699585, + "learning_rate": 3.771364709633011e-05, + "loss": 5.87, + "step": 3425 + }, + { + "epoch": 0.33037608486017356, + "grad_norm": 3.3309779167175293, + "learning_rate": 3.770712524760591e-05, + "loss": 6.011, + "step": 3426 + }, + { + "epoch": 0.3304725168756027, + "grad_norm": 4.6572723388671875, + "learning_rate": 3.770060223263742e-05, + "loss": 5.82, + "step": 3427 + }, + { + "epoch": 0.33056894889103183, + "grad_norm": 4.229450225830078, + "learning_rate": 3.769407805202327e-05, + "loss": 6.0456, + "step": 3428 + }, + { + "epoch": 0.33066538090646097, + "grad_norm": 4.426070690155029, + "learning_rate": 3.768755270636226e-05, + "loss": 5.8203, + "step": 3429 + }, + { + "epoch": 0.33076181292189005, + "grad_norm": 4.04083251953125, + "learning_rate": 3.768102619625329e-05, + "loss": 5.933, + "step": 3430 + }, + { + "epoch": 0.3308582449373192, + "grad_norm": 2.967085123062134, + "learning_rate": 3.767449852229534e-05, + "loss": 5.92, + "step": 3431 + }, + { + "epoch": 0.33095467695274833, + "grad_norm": 3.278515100479126, + "learning_rate": 3.766796968508752e-05, + "loss": 5.7226, + "step": 3432 + }, + { + "epoch": 0.3310511089681774, + "grad_norm": 4.092676639556885, + "learning_rate": 3.7661439685229026e-05, + "loss": 6.1803, + "step": 3433 + }, + { + "epoch": 0.33114754098360655, + "grad_norm": 2.6460113525390625, + "learning_rate": 3.765490852331919e-05, + "loss": 6.1598, + "step": 3434 + }, + { + "epoch": 0.3312439729990357, + "grad_norm": 2.9913547039031982, + "learning_rate": 3.764837619995741e-05, + "loss": 5.9056, + "step": 3435 + }, + { + "epoch": 0.3313404050144648, + "grad_norm": 2.2504560947418213, + "learning_rate": 3.7641842715743245e-05, + "loss": 5.8498, + "step": 3436 + }, + { + "epoch": 0.3314368370298939, + "grad_norm": 2.494401693344116, + "learning_rate": 3.763530807127631e-05, + "loss": 5.7821, + "step": 3437 + }, + { + "epoch": 0.33153326904532304, + "grad_norm": 2.248682737350464, + "learning_rate": 3.762877226715635e-05, + "loss": 5.4807, + "step": 3438 + }, + { + "epoch": 0.3316297010607522, + "grad_norm": 3.076559066772461, + "learning_rate": 3.7622235303983214e-05, + "loss": 5.7989, + "step": 3439 + }, + { + "epoch": 0.3317261330761813, + "grad_norm": 3.4339773654937744, + "learning_rate": 3.7615697182356854e-05, + "loss": 5.2926, + "step": 3440 + }, + { + "epoch": 0.3318225650916104, + "grad_norm": 2.8234403133392334, + "learning_rate": 3.760915790287734e-05, + "loss": 5.9113, + "step": 3441 + }, + { + "epoch": 0.33191899710703954, + "grad_norm": 3.082697629928589, + "learning_rate": 3.760261746614484e-05, + "loss": 5.6566, + "step": 3442 + }, + { + "epoch": 0.3320154291224687, + "grad_norm": 3.0850064754486084, + "learning_rate": 3.759607587275961e-05, + "loss": 5.7339, + "step": 3443 + }, + { + "epoch": 0.33211186113789776, + "grad_norm": 1.6412379741668701, + "learning_rate": 3.758953312332204e-05, + "loss": 5.7423, + "step": 3444 + }, + { + "epoch": 0.3322082931533269, + "grad_norm": 3.1716997623443604, + "learning_rate": 3.758298921843262e-05, + "loss": 5.7344, + "step": 3445 + }, + { + "epoch": 0.33230472516875603, + "grad_norm": 3.597747802734375, + "learning_rate": 3.7576444158691936e-05, + "loss": 6.054, + "step": 3446 + }, + { + "epoch": 0.33240115718418517, + "grad_norm": 3.098081588745117, + "learning_rate": 3.7569897944700684e-05, + "loss": 5.704, + "step": 3447 + }, + { + "epoch": 0.33249758919961425, + "grad_norm": 3.7912802696228027, + "learning_rate": 3.756335057705967e-05, + "loss": 5.4657, + "step": 3448 + }, + { + "epoch": 0.3325940212150434, + "grad_norm": 3.0478227138519287, + "learning_rate": 3.75568020563698e-05, + "loss": 5.7816, + "step": 3449 + }, + { + "epoch": 0.3326904532304725, + "grad_norm": 2.5501389503479004, + "learning_rate": 3.7550252383232096e-05, + "loss": 5.9457, + "step": 3450 + }, + { + "epoch": 0.33278688524590166, + "grad_norm": 3.5297904014587402, + "learning_rate": 3.7543701558247676e-05, + "loss": 5.9512, + "step": 3451 + }, + { + "epoch": 0.33288331726133075, + "grad_norm": 4.978525638580322, + "learning_rate": 3.753714958201776e-05, + "loss": 5.7665, + "step": 3452 + }, + { + "epoch": 0.3329797492767599, + "grad_norm": 4.029890537261963, + "learning_rate": 3.753059645514369e-05, + "loss": 5.7987, + "step": 3453 + }, + { + "epoch": 0.333076181292189, + "grad_norm": 3.544055461883545, + "learning_rate": 3.75240421782269e-05, + "loss": 6.1344, + "step": 3454 + }, + { + "epoch": 0.3331726133076181, + "grad_norm": 3.2018439769744873, + "learning_rate": 3.7517486751868934e-05, + "loss": 5.8522, + "step": 3455 + }, + { + "epoch": 0.33326904532304724, + "grad_norm": 5.256520748138428, + "learning_rate": 3.7510930176671444e-05, + "loss": 5.6717, + "step": 3456 + }, + { + "epoch": 0.3333654773384764, + "grad_norm": 3.7159929275512695, + "learning_rate": 3.750437245323618e-05, + "loss": 5.5338, + "step": 3457 + }, + { + "epoch": 0.3334619093539055, + "grad_norm": 3.824860095977783, + "learning_rate": 3.7497813582165e-05, + "loss": 5.3353, + "step": 3458 + }, + { + "epoch": 0.3335583413693346, + "grad_norm": 4.706644535064697, + "learning_rate": 3.749125356405987e-05, + "loss": 5.7234, + "step": 3459 + }, + { + "epoch": 0.33365477338476374, + "grad_norm": 4.3433051109313965, + "learning_rate": 3.7484692399522865e-05, + "loss": 5.2566, + "step": 3460 + }, + { + "epoch": 0.3337512054001929, + "grad_norm": 3.910614252090454, + "learning_rate": 3.747813008915615e-05, + "loss": 5.8892, + "step": 3461 + }, + { + "epoch": 0.333847637415622, + "grad_norm": 4.643282413482666, + "learning_rate": 3.747156663356202e-05, + "loss": 5.8722, + "step": 3462 + }, + { + "epoch": 0.3339440694310511, + "grad_norm": 2.8346331119537354, + "learning_rate": 3.7465002033342856e-05, + "loss": 5.9001, + "step": 3463 + }, + { + "epoch": 0.33404050144648023, + "grad_norm": 2.580716609954834, + "learning_rate": 3.745843628910115e-05, + "loss": 6.0918, + "step": 3464 + }, + { + "epoch": 0.33413693346190937, + "grad_norm": 2.836132287979126, + "learning_rate": 3.745186940143949e-05, + "loss": 5.856, + "step": 3465 + }, + { + "epoch": 0.33423336547733845, + "grad_norm": 2.8831093311309814, + "learning_rate": 3.744530137096058e-05, + "loss": 5.7539, + "step": 3466 + }, + { + "epoch": 0.3343297974927676, + "grad_norm": 2.8061163425445557, + "learning_rate": 3.743873219826723e-05, + "loss": 5.7842, + "step": 3467 + }, + { + "epoch": 0.3344262295081967, + "grad_norm": 2.3918988704681396, + "learning_rate": 3.743216188396235e-05, + "loss": 5.7614, + "step": 3468 + }, + { + "epoch": 0.33452266152362586, + "grad_norm": 2.624349355697632, + "learning_rate": 3.742559042864895e-05, + "loss": 5.8596, + "step": 3469 + }, + { + "epoch": 0.33461909353905495, + "grad_norm": 2.925089120864868, + "learning_rate": 3.7419017832930156e-05, + "loss": 5.8638, + "step": 3470 + }, + { + "epoch": 0.3347155255544841, + "grad_norm": 2.10020112991333, + "learning_rate": 3.74124440974092e-05, + "loss": 5.9067, + "step": 3471 + }, + { + "epoch": 0.3348119575699132, + "grad_norm": 2.026355743408203, + "learning_rate": 3.740586922268938e-05, + "loss": 5.6826, + "step": 3472 + }, + { + "epoch": 0.33490838958534236, + "grad_norm": 2.0885276794433594, + "learning_rate": 3.739929320937417e-05, + "loss": 5.8381, + "step": 3473 + }, + { + "epoch": 0.33500482160077144, + "grad_norm": 1.9168996810913086, + "learning_rate": 3.7392716058067086e-05, + "loss": 5.8809, + "step": 3474 + }, + { + "epoch": 0.3351012536162006, + "grad_norm": 2.7754862308502197, + "learning_rate": 3.7386137769371775e-05, + "loss": 5.8533, + "step": 3475 + }, + { + "epoch": 0.3351976856316297, + "grad_norm": 2.3092894554138184, + "learning_rate": 3.7379558343891984e-05, + "loss": 5.9286, + "step": 3476 + }, + { + "epoch": 0.3352941176470588, + "grad_norm": 2.048403739929199, + "learning_rate": 3.7372977782231566e-05, + "loss": 5.7737, + "step": 3477 + }, + { + "epoch": 0.33539054966248794, + "grad_norm": 2.935818910598755, + "learning_rate": 3.7366396084994475e-05, + "loss": 5.7643, + "step": 3478 + }, + { + "epoch": 0.3354869816779171, + "grad_norm": 2.8805267810821533, + "learning_rate": 3.735981325278477e-05, + "loss": 5.9927, + "step": 3479 + }, + { + "epoch": 0.3355834136933462, + "grad_norm": 3.1234402656555176, + "learning_rate": 3.735322928620662e-05, + "loss": 5.6322, + "step": 3480 + }, + { + "epoch": 0.3356798457087753, + "grad_norm": 3.559009075164795, + "learning_rate": 3.734664418586429e-05, + "loss": 5.3698, + "step": 3481 + }, + { + "epoch": 0.33577627772420443, + "grad_norm": 2.6111714839935303, + "learning_rate": 3.734005795236215e-05, + "loss": 5.6441, + "step": 3482 + }, + { + "epoch": 0.33587270973963357, + "grad_norm": 3.0585126876831055, + "learning_rate": 3.733347058630468e-05, + "loss": 5.5284, + "step": 3483 + }, + { + "epoch": 0.3359691417550627, + "grad_norm": 3.4431490898132324, + "learning_rate": 3.7326882088296463e-05, + "loss": 5.5547, + "step": 3484 + }, + { + "epoch": 0.3360655737704918, + "grad_norm": 3.6926395893096924, + "learning_rate": 3.732029245894218e-05, + "loss": 5.2254, + "step": 3485 + }, + { + "epoch": 0.3361620057859209, + "grad_norm": 3.4111204147338867, + "learning_rate": 3.731370169884662e-05, + "loss": 5.7069, + "step": 3486 + }, + { + "epoch": 0.33625843780135006, + "grad_norm": 6.081180095672607, + "learning_rate": 3.730710980861466e-05, + "loss": 5.5253, + "step": 3487 + }, + { + "epoch": 0.33635486981677915, + "grad_norm": 4.026671886444092, + "learning_rate": 3.730051678885133e-05, + "loss": 5.778, + "step": 3488 + }, + { + "epoch": 0.3364513018322083, + "grad_norm": 3.6700663566589355, + "learning_rate": 3.7293922640161697e-05, + "loss": 5.7698, + "step": 3489 + }, + { + "epoch": 0.3365477338476374, + "grad_norm": 3.6922109127044678, + "learning_rate": 3.728732736315097e-05, + "loss": 5.8045, + "step": 3490 + }, + { + "epoch": 0.33664416586306656, + "grad_norm": 2.7666680812835693, + "learning_rate": 3.728073095842447e-05, + "loss": 5.8041, + "step": 3491 + }, + { + "epoch": 0.33674059787849564, + "grad_norm": 3.2328720092773438, + "learning_rate": 3.7274133426587584e-05, + "loss": 5.4082, + "step": 3492 + }, + { + "epoch": 0.3368370298939248, + "grad_norm": 3.0867226123809814, + "learning_rate": 3.726753476824586e-05, + "loss": 5.5156, + "step": 3493 + }, + { + "epoch": 0.3369334619093539, + "grad_norm": 2.2976489067077637, + "learning_rate": 3.7260934984004884e-05, + "loss": 5.8923, + "step": 3494 + }, + { + "epoch": 0.33702989392478305, + "grad_norm": 1.7867648601531982, + "learning_rate": 3.7254334074470384e-05, + "loss": 5.6462, + "step": 3495 + }, + { + "epoch": 0.33712632594021213, + "grad_norm": 2.8281924724578857, + "learning_rate": 3.724773204024819e-05, + "loss": 5.7181, + "step": 3496 + }, + { + "epoch": 0.3372227579556413, + "grad_norm": 2.653963327407837, + "learning_rate": 3.724112888194422e-05, + "loss": 5.5108, + "step": 3497 + }, + { + "epoch": 0.3373191899710704, + "grad_norm": 3.0333023071289062, + "learning_rate": 3.7234524600164513e-05, + "loss": 5.98, + "step": 3498 + }, + { + "epoch": 0.3374156219864995, + "grad_norm": 2.8439221382141113, + "learning_rate": 3.7227919195515196e-05, + "loss": 5.9877, + "step": 3499 + }, + { + "epoch": 0.33751205400192863, + "grad_norm": 2.873372793197632, + "learning_rate": 3.72213126686025e-05, + "loss": 5.9927, + "step": 3500 + }, + { + "epoch": 0.33760848601735777, + "grad_norm": 2.4989659786224365, + "learning_rate": 3.721470502003277e-05, + "loss": 5.8098, + "step": 3501 + }, + { + "epoch": 0.3377049180327869, + "grad_norm": 2.4055044651031494, + "learning_rate": 3.720809625041245e-05, + "loss": 5.4985, + "step": 3502 + }, + { + "epoch": 0.337801350048216, + "grad_norm": 3.101198673248291, + "learning_rate": 3.720148636034808e-05, + "loss": 5.9475, + "step": 3503 + }, + { + "epoch": 0.3378977820636451, + "grad_norm": 2.739036798477173, + "learning_rate": 3.7194875350446306e-05, + "loss": 5.9846, + "step": 3504 + }, + { + "epoch": 0.33799421407907426, + "grad_norm": 3.1133065223693848, + "learning_rate": 3.718826322131388e-05, + "loss": 5.7683, + "step": 3505 + }, + { + "epoch": 0.3380906460945034, + "grad_norm": 1.895689845085144, + "learning_rate": 3.718164997355767e-05, + "loss": 5.7437, + "step": 3506 + }, + { + "epoch": 0.3381870781099325, + "grad_norm": 1.9193354845046997, + "learning_rate": 3.7175035607784607e-05, + "loss": 5.6431, + "step": 3507 + }, + { + "epoch": 0.3382835101253616, + "grad_norm": 2.5986745357513428, + "learning_rate": 3.716842012460177e-05, + "loss": 5.9571, + "step": 3508 + }, + { + "epoch": 0.33837994214079076, + "grad_norm": 2.8327081203460693, + "learning_rate": 3.71618035246163e-05, + "loss": 5.7066, + "step": 3509 + }, + { + "epoch": 0.33847637415621984, + "grad_norm": 5.433886528015137, + "learning_rate": 3.715518580843548e-05, + "loss": 5.0807, + "step": 3510 + }, + { + "epoch": 0.338572806171649, + "grad_norm": 3.315505027770996, + "learning_rate": 3.714856697666666e-05, + "loss": 5.8476, + "step": 3511 + }, + { + "epoch": 0.3386692381870781, + "grad_norm": 2.9820876121520996, + "learning_rate": 3.7141947029917325e-05, + "loss": 5.9356, + "step": 3512 + }, + { + "epoch": 0.33876567020250725, + "grad_norm": 2.368940830230713, + "learning_rate": 3.713532596879502e-05, + "loss": 5.8182, + "step": 3513 + }, + { + "epoch": 0.33886210221793633, + "grad_norm": 3.0451951026916504, + "learning_rate": 3.712870379390745e-05, + "loss": 6.0021, + "step": 3514 + }, + { + "epoch": 0.33895853423336547, + "grad_norm": 3.3006579875946045, + "learning_rate": 3.712208050586237e-05, + "loss": 5.7821, + "step": 3515 + }, + { + "epoch": 0.3390549662487946, + "grad_norm": 3.2603161334991455, + "learning_rate": 3.7115456105267656e-05, + "loss": 5.5911, + "step": 3516 + }, + { + "epoch": 0.33915139826422375, + "grad_norm": 3.878767251968384, + "learning_rate": 3.710883059273129e-05, + "loss": 5.6411, + "step": 3517 + }, + { + "epoch": 0.33924783027965283, + "grad_norm": 4.073793411254883, + "learning_rate": 3.7102203968861364e-05, + "loss": 5.7503, + "step": 3518 + }, + { + "epoch": 0.33934426229508197, + "grad_norm": 4.258067607879639, + "learning_rate": 3.7095576234266056e-05, + "loss": 5.7735, + "step": 3519 + }, + { + "epoch": 0.3394406943105111, + "grad_norm": 3.8316779136657715, + "learning_rate": 3.708894738955364e-05, + "loss": 5.5505, + "step": 3520 + }, + { + "epoch": 0.3395371263259402, + "grad_norm": 3.203350305557251, + "learning_rate": 3.708231743533251e-05, + "loss": 5.688, + "step": 3521 + }, + { + "epoch": 0.3396335583413693, + "grad_norm": 4.545896053314209, + "learning_rate": 3.707568637221116e-05, + "loss": 5.7373, + "step": 3522 + }, + { + "epoch": 0.33972999035679846, + "grad_norm": 2.7756688594818115, + "learning_rate": 3.706905420079818e-05, + "loss": 5.6098, + "step": 3523 + }, + { + "epoch": 0.3398264223722276, + "grad_norm": 3.7275257110595703, + "learning_rate": 3.7062420921702263e-05, + "loss": 5.8068, + "step": 3524 + }, + { + "epoch": 0.3399228543876567, + "grad_norm": 4.216909885406494, + "learning_rate": 3.705578653553219e-05, + "loss": 5.5895, + "step": 3525 + }, + { + "epoch": 0.3400192864030858, + "grad_norm": 3.4678385257720947, + "learning_rate": 3.7049151042896876e-05, + "loss": 5.454, + "step": 3526 + }, + { + "epoch": 0.34011571841851496, + "grad_norm": 4.081980228424072, + "learning_rate": 3.704251444440531e-05, + "loss": 5.4393, + "step": 3527 + }, + { + "epoch": 0.3402121504339441, + "grad_norm": 5.501698017120361, + "learning_rate": 3.703587674066658e-05, + "loss": 5.4971, + "step": 3528 + }, + { + "epoch": 0.3403085824493732, + "grad_norm": 4.907596588134766, + "learning_rate": 3.7029237932289906e-05, + "loss": 5.4649, + "step": 3529 + }, + { + "epoch": 0.3404050144648023, + "grad_norm": 3.1804072856903076, + "learning_rate": 3.702259801988458e-05, + "loss": 5.4242, + "step": 3530 + }, + { + "epoch": 0.34050144648023145, + "grad_norm": 2.8023996353149414, + "learning_rate": 3.701595700406e-05, + "loss": 5.4959, + "step": 3531 + }, + { + "epoch": 0.34059787849566053, + "grad_norm": 3.7659356594085693, + "learning_rate": 3.700931488542568e-05, + "loss": 5.7285, + "step": 3532 + }, + { + "epoch": 0.34069431051108967, + "grad_norm": 2.9802348613739014, + "learning_rate": 3.700267166459122e-05, + "loss": 5.8146, + "step": 3533 + }, + { + "epoch": 0.3407907425265188, + "grad_norm": 4.943274021148682, + "learning_rate": 3.6996027342166315e-05, + "loss": 5.6778, + "step": 3534 + }, + { + "epoch": 0.34088717454194795, + "grad_norm": 5.526296138763428, + "learning_rate": 3.69893819187608e-05, + "loss": 5.7478, + "step": 3535 + }, + { + "epoch": 0.34098360655737703, + "grad_norm": 4.019134521484375, + "learning_rate": 3.6982735394984565e-05, + "loss": 5.7965, + "step": 3536 + }, + { + "epoch": 0.34108003857280617, + "grad_norm": 3.02445650100708, + "learning_rate": 3.697608777144762e-05, + "loss": 5.8324, + "step": 3537 + }, + { + "epoch": 0.3411764705882353, + "grad_norm": 3.0209617614746094, + "learning_rate": 3.696943904876009e-05, + "loss": 5.6477, + "step": 3538 + }, + { + "epoch": 0.34127290260366444, + "grad_norm": 4.634974479675293, + "learning_rate": 3.696278922753216e-05, + "loss": 5.7916, + "step": 3539 + }, + { + "epoch": 0.3413693346190935, + "grad_norm": 3.586941957473755, + "learning_rate": 3.695613830837417e-05, + "loss": 5.8541, + "step": 3540 + }, + { + "epoch": 0.34146576663452266, + "grad_norm": 3.199023723602295, + "learning_rate": 3.694948629189652e-05, + "loss": 5.8192, + "step": 3541 + }, + { + "epoch": 0.3415621986499518, + "grad_norm": 3.2266688346862793, + "learning_rate": 3.694283317870973e-05, + "loss": 5.7826, + "step": 3542 + }, + { + "epoch": 0.3416586306653809, + "grad_norm": 3.2808146476745605, + "learning_rate": 3.6936178969424404e-05, + "loss": 5.6502, + "step": 3543 + }, + { + "epoch": 0.34175506268081, + "grad_norm": 5.307558059692383, + "learning_rate": 3.6929523664651266e-05, + "loss": 4.8076, + "step": 3544 + }, + { + "epoch": 0.34185149469623916, + "grad_norm": 5.097909927368164, + "learning_rate": 3.6922867265001123e-05, + "loss": 5.4622, + "step": 3545 + }, + { + "epoch": 0.3419479267116683, + "grad_norm": 3.0424883365631104, + "learning_rate": 3.6916209771084906e-05, + "loss": 5.3303, + "step": 3546 + }, + { + "epoch": 0.3420443587270974, + "grad_norm": 3.4932048320770264, + "learning_rate": 3.6909551183513614e-05, + "loss": 5.6413, + "step": 3547 + }, + { + "epoch": 0.3421407907425265, + "grad_norm": 3.970811367034912, + "learning_rate": 3.6902891502898386e-05, + "loss": 5.7228, + "step": 3548 + }, + { + "epoch": 0.34223722275795565, + "grad_norm": 2.6808388233184814, + "learning_rate": 3.6896230729850414e-05, + "loss": 5.8161, + "step": 3549 + }, + { + "epoch": 0.3423336547733848, + "grad_norm": 3.2901737689971924, + "learning_rate": 3.6889568864981026e-05, + "loss": 5.7515, + "step": 3550 + }, + { + "epoch": 0.34243008678881387, + "grad_norm": 3.1697640419006348, + "learning_rate": 3.688290590890164e-05, + "loss": 5.7203, + "step": 3551 + }, + { + "epoch": 0.342526518804243, + "grad_norm": 2.461531400680542, + "learning_rate": 3.6876241862223795e-05, + "loss": 5.6325, + "step": 3552 + }, + { + "epoch": 0.34262295081967215, + "grad_norm": 2.6565122604370117, + "learning_rate": 3.686957672555907e-05, + "loss": 5.6986, + "step": 3553 + }, + { + "epoch": 0.3427193828351012, + "grad_norm": 2.1679341793060303, + "learning_rate": 3.6862910499519205e-05, + "loss": 5.6351, + "step": 3554 + }, + { + "epoch": 0.34281581485053036, + "grad_norm": 2.1023149490356445, + "learning_rate": 3.685624318471602e-05, + "loss": 5.7012, + "step": 3555 + }, + { + "epoch": 0.3429122468659595, + "grad_norm": 2.255195140838623, + "learning_rate": 3.684957478176142e-05, + "loss": 5.6728, + "step": 3556 + }, + { + "epoch": 0.34300867888138864, + "grad_norm": 2.4911389350891113, + "learning_rate": 3.684290529126744e-05, + "loss": 5.3451, + "step": 3557 + }, + { + "epoch": 0.3431051108968177, + "grad_norm": 2.9335861206054688, + "learning_rate": 3.683623471384618e-05, + "loss": 5.1951, + "step": 3558 + }, + { + "epoch": 0.34320154291224686, + "grad_norm": 2.1074671745300293, + "learning_rate": 3.682956305010987e-05, + "loss": 5.2484, + "step": 3559 + }, + { + "epoch": 0.343297974927676, + "grad_norm": 3.1398472785949707, + "learning_rate": 3.6822890300670827e-05, + "loss": 5.2838, + "step": 3560 + }, + { + "epoch": 0.34339440694310513, + "grad_norm": 3.2225255966186523, + "learning_rate": 3.681621646614146e-05, + "loss": 5.2519, + "step": 3561 + }, + { + "epoch": 0.3434908389585342, + "grad_norm": 3.238166570663452, + "learning_rate": 3.6809541547134284e-05, + "loss": 5.1974, + "step": 3562 + }, + { + "epoch": 0.34358727097396335, + "grad_norm": 3.424398899078369, + "learning_rate": 3.680286554426193e-05, + "loss": 5.1506, + "step": 3563 + }, + { + "epoch": 0.3436837029893925, + "grad_norm": 2.7262861728668213, + "learning_rate": 3.67961884581371e-05, + "loss": 5.4727, + "step": 3564 + }, + { + "epoch": 0.3437801350048216, + "grad_norm": 3.8829619884490967, + "learning_rate": 3.6789510289372616e-05, + "loss": 5.5934, + "step": 3565 + }, + { + "epoch": 0.3438765670202507, + "grad_norm": 2.933872699737549, + "learning_rate": 3.678283103858138e-05, + "loss": 5.7486, + "step": 3566 + }, + { + "epoch": 0.34397299903567985, + "grad_norm": 2.4272072315216064, + "learning_rate": 3.677615070637642e-05, + "loss": 5.7295, + "step": 3567 + }, + { + "epoch": 0.344069431051109, + "grad_norm": 2.6498525142669678, + "learning_rate": 3.676946929337084e-05, + "loss": 5.9253, + "step": 3568 + }, + { + "epoch": 0.34416586306653807, + "grad_norm": 2.789222002029419, + "learning_rate": 3.676278680017785e-05, + "loss": 5.8293, + "step": 3569 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 2.568071126937866, + "learning_rate": 3.6756103227410776e-05, + "loss": 5.6919, + "step": 3570 + }, + { + "epoch": 0.34435872709739634, + "grad_norm": 2.653017520904541, + "learning_rate": 3.6749418575683e-05, + "loss": 5.8774, + "step": 3571 + }, + { + "epoch": 0.3444551591128255, + "grad_norm": 2.5044450759887695, + "learning_rate": 3.6742732845608066e-05, + "loss": 5.7439, + "step": 3572 + }, + { + "epoch": 0.34455159112825456, + "grad_norm": 3.7420992851257324, + "learning_rate": 3.673604603779955e-05, + "loss": 5.7419, + "step": 3573 + }, + { + "epoch": 0.3446480231436837, + "grad_norm": 3.942234516143799, + "learning_rate": 3.672935815287118e-05, + "loss": 5.9709, + "step": 3574 + }, + { + "epoch": 0.34474445515911284, + "grad_norm": 4.378077507019043, + "learning_rate": 3.672266919143675e-05, + "loss": 6.1226, + "step": 3575 + }, + { + "epoch": 0.3448408871745419, + "grad_norm": 2.7210021018981934, + "learning_rate": 3.671597915411018e-05, + "loss": 6.1224, + "step": 3576 + }, + { + "epoch": 0.34493731918997106, + "grad_norm": 2.7137513160705566, + "learning_rate": 3.670928804150546e-05, + "loss": 5.9951, + "step": 3577 + }, + { + "epoch": 0.3450337512054002, + "grad_norm": 2.823195219039917, + "learning_rate": 3.6702595854236694e-05, + "loss": 5.6736, + "step": 3578 + }, + { + "epoch": 0.34513018322082933, + "grad_norm": 2.243690013885498, + "learning_rate": 3.669590259291808e-05, + "loss": 5.5489, + "step": 3579 + }, + { + "epoch": 0.3452266152362584, + "grad_norm": 2.5164635181427, + "learning_rate": 3.668920825816393e-05, + "loss": 5.7288, + "step": 3580 + }, + { + "epoch": 0.34532304725168755, + "grad_norm": 2.0581588745117188, + "learning_rate": 3.668251285058863e-05, + "loss": 5.7608, + "step": 3581 + }, + { + "epoch": 0.3454194792671167, + "grad_norm": 2.6006886959075928, + "learning_rate": 3.667581637080667e-05, + "loss": 5.7061, + "step": 3582 + }, + { + "epoch": 0.34551591128254583, + "grad_norm": 2.452216863632202, + "learning_rate": 3.6669118819432663e-05, + "loss": 5.7967, + "step": 3583 + }, + { + "epoch": 0.3456123432979749, + "grad_norm": 2.660576820373535, + "learning_rate": 3.6662420197081295e-05, + "loss": 6.0687, + "step": 3584 + }, + { + "epoch": 0.34570877531340405, + "grad_norm": 3.8777456283569336, + "learning_rate": 3.665572050436735e-05, + "loss": 5.8656, + "step": 3585 + }, + { + "epoch": 0.3458052073288332, + "grad_norm": 2.5428578853607178, + "learning_rate": 3.664901974190573e-05, + "loss": 6.0582, + "step": 3586 + }, + { + "epoch": 0.34590163934426227, + "grad_norm": 3.970855951309204, + "learning_rate": 3.664231791031141e-05, + "loss": 5.7815, + "step": 3587 + }, + { + "epoch": 0.3459980713596914, + "grad_norm": 3.667466878890991, + "learning_rate": 3.663561501019948e-05, + "loss": 5.7796, + "step": 3588 + }, + { + "epoch": 0.34609450337512054, + "grad_norm": 2.6724586486816406, + "learning_rate": 3.6628911042185135e-05, + "loss": 5.6903, + "step": 3589 + }, + { + "epoch": 0.3461909353905497, + "grad_norm": 4.4857177734375, + "learning_rate": 3.662220600688363e-05, + "loss": 5.5811, + "step": 3590 + }, + { + "epoch": 0.34628736740597876, + "grad_norm": 3.6130504608154297, + "learning_rate": 3.6615499904910376e-05, + "loss": 5.4536, + "step": 3591 + }, + { + "epoch": 0.3463837994214079, + "grad_norm": 3.335897207260132, + "learning_rate": 3.660879273688084e-05, + "loss": 5.8423, + "step": 3592 + }, + { + "epoch": 0.34648023143683704, + "grad_norm": 2.286867618560791, + "learning_rate": 3.660208450341058e-05, + "loss": 5.7168, + "step": 3593 + }, + { + "epoch": 0.3465766634522662, + "grad_norm": 3.7266464233398438, + "learning_rate": 3.659537520511529e-05, + "loss": 5.729, + "step": 3594 + }, + { + "epoch": 0.34667309546769526, + "grad_norm": 3.6770284175872803, + "learning_rate": 3.658866484261074e-05, + "loss": 5.7203, + "step": 3595 + }, + { + "epoch": 0.3467695274831244, + "grad_norm": 2.627998113632202, + "learning_rate": 3.658195341651279e-05, + "loss": 5.6579, + "step": 3596 + }, + { + "epoch": 0.34686595949855353, + "grad_norm": 3.315852165222168, + "learning_rate": 3.657524092743741e-05, + "loss": 5.7398, + "step": 3597 + }, + { + "epoch": 0.3469623915139826, + "grad_norm": 4.512613296508789, + "learning_rate": 3.656852737600067e-05, + "loss": 5.7349, + "step": 3598 + }, + { + "epoch": 0.34705882352941175, + "grad_norm": 3.5615878105163574, + "learning_rate": 3.656181276281873e-05, + "loss": 5.8774, + "step": 3599 + }, + { + "epoch": 0.3471552555448409, + "grad_norm": 2.637031316757202, + "learning_rate": 3.655509708850783e-05, + "loss": 5.8134, + "step": 3600 + }, + { + "epoch": 0.34725168756027003, + "grad_norm": 3.656923770904541, + "learning_rate": 3.6548380353684356e-05, + "loss": 5.753, + "step": 3601 + }, + { + "epoch": 0.3473481195756991, + "grad_norm": 4.245492935180664, + "learning_rate": 3.654166255896473e-05, + "loss": 5.7187, + "step": 3602 + }, + { + "epoch": 0.34744455159112825, + "grad_norm": 3.5620999336242676, + "learning_rate": 3.653494370496554e-05, + "loss": 5.7114, + "step": 3603 + }, + { + "epoch": 0.3475409836065574, + "grad_norm": 2.159930467605591, + "learning_rate": 3.6528223792303394e-05, + "loss": 5.7923, + "step": 3604 + }, + { + "epoch": 0.3476374156219865, + "grad_norm": 3.8238439559936523, + "learning_rate": 3.652150282159507e-05, + "loss": 5.6298, + "step": 3605 + }, + { + "epoch": 0.3477338476374156, + "grad_norm": 2.853029727935791, + "learning_rate": 3.651478079345739e-05, + "loss": 5.859, + "step": 3606 + }, + { + "epoch": 0.34783027965284474, + "grad_norm": 2.2207844257354736, + "learning_rate": 3.6508057708507315e-05, + "loss": 5.7595, + "step": 3607 + }, + { + "epoch": 0.3479267116682739, + "grad_norm": 2.03663969039917, + "learning_rate": 3.650133356736186e-05, + "loss": 5.7622, + "step": 3608 + }, + { + "epoch": 0.34802314368370296, + "grad_norm": 2.6140530109405518, + "learning_rate": 3.6494608370638155e-05, + "loss": 6.0106, + "step": 3609 + }, + { + "epoch": 0.3481195756991321, + "grad_norm": 2.9761674404144287, + "learning_rate": 3.6487882118953456e-05, + "loss": 5.5696, + "step": 3610 + }, + { + "epoch": 0.34821600771456124, + "grad_norm": 2.5461642742156982, + "learning_rate": 3.6481154812925065e-05, + "loss": 5.727, + "step": 3611 + }, + { + "epoch": 0.3483124397299904, + "grad_norm": 1.636739730834961, + "learning_rate": 3.647442645317042e-05, + "loss": 5.858, + "step": 3612 + }, + { + "epoch": 0.34840887174541946, + "grad_norm": 2.2604386806488037, + "learning_rate": 3.646769704030704e-05, + "loss": 5.7292, + "step": 3613 + }, + { + "epoch": 0.3485053037608486, + "grad_norm": 1.9021587371826172, + "learning_rate": 3.646096657495255e-05, + "loss": 5.713, + "step": 3614 + }, + { + "epoch": 0.34860173577627773, + "grad_norm": 2.6401941776275635, + "learning_rate": 3.645423505772464e-05, + "loss": 5.711, + "step": 3615 + }, + { + "epoch": 0.34869816779170687, + "grad_norm": 2.2664191722869873, + "learning_rate": 3.644750248924114e-05, + "loss": 5.4874, + "step": 3616 + }, + { + "epoch": 0.34879459980713595, + "grad_norm": 4.321187496185303, + "learning_rate": 3.644076887011994e-05, + "loss": 6.1946, + "step": 3617 + }, + { + "epoch": 0.3488910318225651, + "grad_norm": 2.6655044555664062, + "learning_rate": 3.643403420097906e-05, + "loss": 5.9427, + "step": 3618 + }, + { + "epoch": 0.3489874638379942, + "grad_norm": 2.751798629760742, + "learning_rate": 3.64272984824366e-05, + "loss": 5.6944, + "step": 3619 + }, + { + "epoch": 0.3490838958534233, + "grad_norm": 2.0523793697357178, + "learning_rate": 3.6420561715110745e-05, + "loss": 5.6062, + "step": 3620 + }, + { + "epoch": 0.34918032786885245, + "grad_norm": 3.9633729457855225, + "learning_rate": 3.64138238996198e-05, + "loss": 5.5703, + "step": 3621 + }, + { + "epoch": 0.3492767598842816, + "grad_norm": 3.170755624771118, + "learning_rate": 3.640708503658213e-05, + "loss": 5.8511, + "step": 3622 + }, + { + "epoch": 0.3493731918997107, + "grad_norm": 3.265610456466675, + "learning_rate": 3.6400345126616254e-05, + "loss": 6.0315, + "step": 3623 + }, + { + "epoch": 0.3494696239151398, + "grad_norm": 4.44577169418335, + "learning_rate": 3.6393604170340725e-05, + "loss": 5.688, + "step": 3624 + }, + { + "epoch": 0.34956605593056894, + "grad_norm": 4.8722381591796875, + "learning_rate": 3.6386862168374224e-05, + "loss": 5.5625, + "step": 3625 + }, + { + "epoch": 0.3496624879459981, + "grad_norm": 4.1615400314331055, + "learning_rate": 3.638011912133553e-05, + "loss": 5.6031, + "step": 3626 + }, + { + "epoch": 0.3497589199614272, + "grad_norm": 3.3292648792266846, + "learning_rate": 3.6373375029843517e-05, + "loss": 5.3398, + "step": 3627 + }, + { + "epoch": 0.3498553519768563, + "grad_norm": 4.467573642730713, + "learning_rate": 3.636662989451714e-05, + "loss": 5.3152, + "step": 3628 + }, + { + "epoch": 0.34995178399228544, + "grad_norm": 3.117098093032837, + "learning_rate": 3.635988371597546e-05, + "loss": 5.6325, + "step": 3629 + }, + { + "epoch": 0.3500482160077146, + "grad_norm": 2.7644052505493164, + "learning_rate": 3.635313649483763e-05, + "loss": 5.8271, + "step": 3630 + }, + { + "epoch": 0.35014464802314366, + "grad_norm": 3.290959358215332, + "learning_rate": 3.634638823172291e-05, + "loss": 5.5239, + "step": 3631 + }, + { + "epoch": 0.3502410800385728, + "grad_norm": 4.550525188446045, + "learning_rate": 3.6339638927250644e-05, + "loss": 5.7458, + "step": 3632 + }, + { + "epoch": 0.35033751205400193, + "grad_norm": 3.6660377979278564, + "learning_rate": 3.633288858204027e-05, + "loss": 5.5915, + "step": 3633 + }, + { + "epoch": 0.35043394406943107, + "grad_norm": 3.532815933227539, + "learning_rate": 3.632613719671133e-05, + "loss": 5.8905, + "step": 3634 + }, + { + "epoch": 0.35053037608486015, + "grad_norm": 3.3728814125061035, + "learning_rate": 3.631938477188346e-05, + "loss": 5.6414, + "step": 3635 + }, + { + "epoch": 0.3506268081002893, + "grad_norm": 2.3667430877685547, + "learning_rate": 3.631263130817639e-05, + "loss": 5.6318, + "step": 3636 + }, + { + "epoch": 0.3507232401157184, + "grad_norm": 2.0634732246398926, + "learning_rate": 3.6305876806209945e-05, + "loss": 5.5606, + "step": 3637 + }, + { + "epoch": 0.35081967213114756, + "grad_norm": 2.7093164920806885, + "learning_rate": 3.629912126660403e-05, + "loss": 5.829, + "step": 3638 + }, + { + "epoch": 0.35091610414657665, + "grad_norm": 2.023775100708008, + "learning_rate": 3.629236468997868e-05, + "loss": 5.8007, + "step": 3639 + }, + { + "epoch": 0.3510125361620058, + "grad_norm": 1.650914192199707, + "learning_rate": 3.6285607076953996e-05, + "loss": 5.7816, + "step": 3640 + }, + { + "epoch": 0.3511089681774349, + "grad_norm": 2.332780599594116, + "learning_rate": 3.6278848428150185e-05, + "loss": 5.8097, + "step": 3641 + }, + { + "epoch": 0.351205400192864, + "grad_norm": 3.1151821613311768, + "learning_rate": 3.627208874418755e-05, + "loss": 5.8682, + "step": 3642 + }, + { + "epoch": 0.35130183220829314, + "grad_norm": 2.3973326683044434, + "learning_rate": 3.626532802568649e-05, + "loss": 5.8418, + "step": 3643 + }, + { + "epoch": 0.3513982642237223, + "grad_norm": 3.1881043910980225, + "learning_rate": 3.625856627326747e-05, + "loss": 5.9446, + "step": 3644 + }, + { + "epoch": 0.3514946962391514, + "grad_norm": 3.701080322265625, + "learning_rate": 3.6251803487551106e-05, + "loss": 5.4958, + "step": 3645 + }, + { + "epoch": 0.3515911282545805, + "grad_norm": 2.330763339996338, + "learning_rate": 3.6245039669158064e-05, + "loss": 5.703, + "step": 3646 + }, + { + "epoch": 0.35168756027000964, + "grad_norm": 2.326183795928955, + "learning_rate": 3.6238274818709125e-05, + "loss": 5.7196, + "step": 3647 + }, + { + "epoch": 0.3517839922854388, + "grad_norm": 2.2011208534240723, + "learning_rate": 3.623150893682516e-05, + "loss": 5.844, + "step": 3648 + }, + { + "epoch": 0.3518804243008679, + "grad_norm": 1.8459253311157227, + "learning_rate": 3.622474202412712e-05, + "loss": 5.6376, + "step": 3649 + }, + { + "epoch": 0.351976856316297, + "grad_norm": 2.0619592666625977, + "learning_rate": 3.621797408123607e-05, + "loss": 5.7489, + "step": 3650 + }, + { + "epoch": 0.35207328833172613, + "grad_norm": 1.7747198343276978, + "learning_rate": 3.6211205108773176e-05, + "loss": 5.725, + "step": 3651 + }, + { + "epoch": 0.35216972034715527, + "grad_norm": 3.022080421447754, + "learning_rate": 3.620443510735967e-05, + "loss": 5.3545, + "step": 3652 + }, + { + "epoch": 0.35226615236258435, + "grad_norm": 2.6718032360076904, + "learning_rate": 3.61976640776169e-05, + "loss": 5.5735, + "step": 3653 + }, + { + "epoch": 0.3523625843780135, + "grad_norm": 3.2873880863189697, + "learning_rate": 3.6190892020166314e-05, + "loss": 5.687, + "step": 3654 + }, + { + "epoch": 0.3524590163934426, + "grad_norm": 3.6808388233184814, + "learning_rate": 3.6184118935629416e-05, + "loss": 5.6311, + "step": 3655 + }, + { + "epoch": 0.35255544840887176, + "grad_norm": 3.1662099361419678, + "learning_rate": 3.617734482462785e-05, + "loss": 5.373, + "step": 3656 + }, + { + "epoch": 0.35265188042430085, + "grad_norm": 4.409822940826416, + "learning_rate": 3.617056968778334e-05, + "loss": 5.7719, + "step": 3657 + }, + { + "epoch": 0.35274831243973, + "grad_norm": 5.472177028656006, + "learning_rate": 3.6163793525717695e-05, + "loss": 5.3095, + "step": 3658 + }, + { + "epoch": 0.3528447444551591, + "grad_norm": 3.2938733100891113, + "learning_rate": 3.6157016339052805e-05, + "loss": 5.3641, + "step": 3659 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 4.105088710784912, + "learning_rate": 3.61502381284107e-05, + "loss": 5.5341, + "step": 3660 + }, + { + "epoch": 0.35303760848601734, + "grad_norm": 3.072277307510376, + "learning_rate": 3.6143458894413465e-05, + "loss": 5.7045, + "step": 3661 + }, + { + "epoch": 0.3531340405014465, + "grad_norm": 3.9239253997802734, + "learning_rate": 3.6136678637683275e-05, + "loss": 5.5733, + "step": 3662 + }, + { + "epoch": 0.3532304725168756, + "grad_norm": 4.006842136383057, + "learning_rate": 3.612989735884244e-05, + "loss": 5.6753, + "step": 3663 + }, + { + "epoch": 0.3533269045323047, + "grad_norm": 4.025613784790039, + "learning_rate": 3.6123115058513315e-05, + "loss": 5.7828, + "step": 3664 + }, + { + "epoch": 0.35342333654773384, + "grad_norm": 4.329919815063477, + "learning_rate": 3.6116331737318386e-05, + "loss": 5.6157, + "step": 3665 + }, + { + "epoch": 0.353519768563163, + "grad_norm": 2.998886823654175, + "learning_rate": 3.6109547395880213e-05, + "loss": 5.5029, + "step": 3666 + }, + { + "epoch": 0.3536162005785921, + "grad_norm": 5.918035984039307, + "learning_rate": 3.610276203482145e-05, + "loss": 5.7288, + "step": 3667 + }, + { + "epoch": 0.3537126325940212, + "grad_norm": 5.832649230957031, + "learning_rate": 3.609597565476485e-05, + "loss": 5.5437, + "step": 3668 + }, + { + "epoch": 0.35380906460945033, + "grad_norm": 4.358615875244141, + "learning_rate": 3.608918825633327e-05, + "loss": 4.876, + "step": 3669 + }, + { + "epoch": 0.35390549662487947, + "grad_norm": 2.4800405502319336, + "learning_rate": 3.608239984014964e-05, + "loss": 5.1817, + "step": 3670 + }, + { + "epoch": 0.3540019286403086, + "grad_norm": 4.23337984085083, + "learning_rate": 3.6075610406836983e-05, + "loss": 5.8179, + "step": 3671 + }, + { + "epoch": 0.3540983606557377, + "grad_norm": 3.699126958847046, + "learning_rate": 3.606881995701844e-05, + "loss": 5.7213, + "step": 3672 + }, + { + "epoch": 0.3541947926711668, + "grad_norm": 4.388095378875732, + "learning_rate": 3.606202849131723e-05, + "loss": 5.8788, + "step": 3673 + }, + { + "epoch": 0.35429122468659596, + "grad_norm": 5.408345699310303, + "learning_rate": 3.605523601035665e-05, + "loss": 5.711, + "step": 3674 + }, + { + "epoch": 0.3543876567020251, + "grad_norm": 4.388432025909424, + "learning_rate": 3.604844251476013e-05, + "loss": 5.5386, + "step": 3675 + }, + { + "epoch": 0.3544840887174542, + "grad_norm": 5.213278293609619, + "learning_rate": 3.604164800515114e-05, + "loss": 5.7733, + "step": 3676 + }, + { + "epoch": 0.3545805207328833, + "grad_norm": 5.379817008972168, + "learning_rate": 3.6034852482153297e-05, + "loss": 5.8759, + "step": 3677 + }, + { + "epoch": 0.35467695274831246, + "grad_norm": 4.587655067443848, + "learning_rate": 3.602805594639026e-05, + "loss": 5.6272, + "step": 3678 + }, + { + "epoch": 0.35477338476374154, + "grad_norm": 3.226290225982666, + "learning_rate": 3.6021258398485835e-05, + "loss": 5.591, + "step": 3679 + }, + { + "epoch": 0.3548698167791707, + "grad_norm": 4.957486629486084, + "learning_rate": 3.601445983906387e-05, + "loss": 5.6676, + "step": 3680 + }, + { + "epoch": 0.3549662487945998, + "grad_norm": 6.405223846435547, + "learning_rate": 3.6007660268748356e-05, + "loss": 5.8431, + "step": 3681 + }, + { + "epoch": 0.35506268081002895, + "grad_norm": 4.70442533493042, + "learning_rate": 3.6000859688163316e-05, + "loss": 6.0549, + "step": 3682 + }, + { + "epoch": 0.35515911282545803, + "grad_norm": 3.8742387294769287, + "learning_rate": 3.599405809793291e-05, + "loss": 5.9103, + "step": 3683 + }, + { + "epoch": 0.35525554484088717, + "grad_norm": 3.7377283573150635, + "learning_rate": 3.5987255498681395e-05, + "loss": 5.8714, + "step": 3684 + }, + { + "epoch": 0.3553519768563163, + "grad_norm": 2.9657199382781982, + "learning_rate": 3.598045189103308e-05, + "loss": 5.7135, + "step": 3685 + }, + { + "epoch": 0.35544840887174545, + "grad_norm": 2.9086897373199463, + "learning_rate": 3.597364727561241e-05, + "loss": 5.6989, + "step": 3686 + }, + { + "epoch": 0.35554484088717453, + "grad_norm": 2.8751935958862305, + "learning_rate": 3.5966841653043905e-05, + "loss": 5.7664, + "step": 3687 + }, + { + "epoch": 0.35564127290260367, + "grad_norm": 2.850121259689331, + "learning_rate": 3.596003502395217e-05, + "loss": 5.6957, + "step": 3688 + }, + { + "epoch": 0.3557377049180328, + "grad_norm": 2.534365653991699, + "learning_rate": 3.595322738896191e-05, + "loss": 5.6029, + "step": 3689 + }, + { + "epoch": 0.3558341369334619, + "grad_norm": 2.6279778480529785, + "learning_rate": 3.594641874869792e-05, + "loss": 5.981, + "step": 3690 + }, + { + "epoch": 0.355930568948891, + "grad_norm": 2.1219701766967773, + "learning_rate": 3.5939609103785085e-05, + "loss": 5.9813, + "step": 3691 + }, + { + "epoch": 0.35602700096432016, + "grad_norm": 2.6557185649871826, + "learning_rate": 3.5932798454848396e-05, + "loss": 5.79, + "step": 3692 + }, + { + "epoch": 0.3561234329797493, + "grad_norm": 3.595400333404541, + "learning_rate": 3.5925986802512926e-05, + "loss": 6.0443, + "step": 3693 + }, + { + "epoch": 0.3562198649951784, + "grad_norm": 7.707708835601807, + "learning_rate": 3.591917414740382e-05, + "loss": 5.6143, + "step": 3694 + }, + { + "epoch": 0.3563162970106075, + "grad_norm": 2.7736942768096924, + "learning_rate": 3.591236049014637e-05, + "loss": 5.7302, + "step": 3695 + }, + { + "epoch": 0.35641272902603666, + "grad_norm": 3.1385915279388428, + "learning_rate": 3.590554583136589e-05, + "loss": 5.9107, + "step": 3696 + }, + { + "epoch": 0.3565091610414658, + "grad_norm": 2.4225099086761475, + "learning_rate": 3.589873017168784e-05, + "loss": 5.1976, + "step": 3697 + }, + { + "epoch": 0.3566055930568949, + "grad_norm": 4.685455322265625, + "learning_rate": 3.589191351173775e-05, + "loss": 5.7343, + "step": 3698 + }, + { + "epoch": 0.356702025072324, + "grad_norm": 4.029631614685059, + "learning_rate": 3.5885095852141246e-05, + "loss": 5.6785, + "step": 3699 + }, + { + "epoch": 0.35679845708775315, + "grad_norm": 2.8679327964782715, + "learning_rate": 3.587827719352404e-05, + "loss": 5.8616, + "step": 3700 + }, + { + "epoch": 0.35689488910318223, + "grad_norm": 2.840888500213623, + "learning_rate": 3.5871457536511935e-05, + "loss": 5.3071, + "step": 3701 + }, + { + "epoch": 0.35699132111861137, + "grad_norm": 3.7280569076538086, + "learning_rate": 3.5864636881730846e-05, + "loss": 5.6125, + "step": 3702 + }, + { + "epoch": 0.3570877531340405, + "grad_norm": 4.894455432891846, + "learning_rate": 3.5857815229806756e-05, + "loss": 5.6522, + "step": 3703 + }, + { + "epoch": 0.35718418514946965, + "grad_norm": 3.294952630996704, + "learning_rate": 3.585099258136574e-05, + "loss": 5.5176, + "step": 3704 + }, + { + "epoch": 0.35728061716489873, + "grad_norm": 2.1710667610168457, + "learning_rate": 3.584416893703399e-05, + "loss": 5.5655, + "step": 3705 + }, + { + "epoch": 0.35737704918032787, + "grad_norm": 5.784653663635254, + "learning_rate": 3.5837344297437764e-05, + "loss": 5.3185, + "step": 3706 + }, + { + "epoch": 0.357473481195757, + "grad_norm": 4.223573684692383, + "learning_rate": 3.583051866320341e-05, + "loss": 5.1744, + "step": 3707 + }, + { + "epoch": 0.35756991321118614, + "grad_norm": 7.142274379730225, + "learning_rate": 3.582369203495739e-05, + "loss": 5.6538, + "step": 3708 + }, + { + "epoch": 0.3576663452266152, + "grad_norm": 6.726316452026367, + "learning_rate": 3.581686441332622e-05, + "loss": 5.7416, + "step": 3709 + }, + { + "epoch": 0.35776277724204436, + "grad_norm": 4.861066818237305, + "learning_rate": 3.581003579893657e-05, + "loss": 5.5032, + "step": 3710 + }, + { + "epoch": 0.3578592092574735, + "grad_norm": 3.0128488540649414, + "learning_rate": 3.580320619241513e-05, + "loss": 5.6769, + "step": 3711 + }, + { + "epoch": 0.3579556412729026, + "grad_norm": 5.574343681335449, + "learning_rate": 3.5796375594388715e-05, + "loss": 5.8245, + "step": 3712 + }, + { + "epoch": 0.3580520732883317, + "grad_norm": 5.50689172744751, + "learning_rate": 3.578954400548425e-05, + "loss": 5.6485, + "step": 3713 + }, + { + "epoch": 0.35814850530376086, + "grad_norm": 4.199395179748535, + "learning_rate": 3.57827114263287e-05, + "loss": 5.5357, + "step": 3714 + }, + { + "epoch": 0.35824493731919, + "grad_norm": 4.258480548858643, + "learning_rate": 3.577587785754918e-05, + "loss": 5.5897, + "step": 3715 + }, + { + "epoch": 0.3583413693346191, + "grad_norm": 5.79876184463501, + "learning_rate": 3.576904329977284e-05, + "loss": 5.7812, + "step": 3716 + }, + { + "epoch": 0.3584378013500482, + "grad_norm": 8.921433448791504, + "learning_rate": 3.576220775362697e-05, + "loss": 5.4238, + "step": 3717 + }, + { + "epoch": 0.35853423336547735, + "grad_norm": 3.2846617698669434, + "learning_rate": 3.575537121973892e-05, + "loss": 5.6753, + "step": 3718 + }, + { + "epoch": 0.3586306653809065, + "grad_norm": 3.3848092555999756, + "learning_rate": 3.5748533698736135e-05, + "loss": 5.5458, + "step": 3719 + }, + { + "epoch": 0.35872709739633557, + "grad_norm": 4.105731010437012, + "learning_rate": 3.574169519124616e-05, + "loss": 5.5324, + "step": 3720 + }, + { + "epoch": 0.3588235294117647, + "grad_norm": 2.7256484031677246, + "learning_rate": 3.573485569789662e-05, + "loss": 5.4267, + "step": 3721 + }, + { + "epoch": 0.35891996142719385, + "grad_norm": 2.8361003398895264, + "learning_rate": 3.572801521931522e-05, + "loss": 5.3558, + "step": 3722 + }, + { + "epoch": 0.3590163934426229, + "grad_norm": 2.7855257987976074, + "learning_rate": 3.572117375612981e-05, + "loss": 5.7927, + "step": 3723 + }, + { + "epoch": 0.35911282545805207, + "grad_norm": 2.3897016048431396, + "learning_rate": 3.571433130896826e-05, + "loss": 5.6138, + "step": 3724 + }, + { + "epoch": 0.3592092574734812, + "grad_norm": 3.9553027153015137, + "learning_rate": 3.570748787845856e-05, + "loss": 5.6255, + "step": 3725 + }, + { + "epoch": 0.35930568948891034, + "grad_norm": 3.7468106746673584, + "learning_rate": 3.5700643465228814e-05, + "loss": 5.8232, + "step": 3726 + }, + { + "epoch": 0.3594021215043394, + "grad_norm": 4.192907333374023, + "learning_rate": 3.5693798069907176e-05, + "loss": 5.6588, + "step": 3727 + }, + { + "epoch": 0.35949855351976856, + "grad_norm": 2.5856685638427734, + "learning_rate": 3.5686951693121916e-05, + "loss": 5.774, + "step": 3728 + }, + { + "epoch": 0.3595949855351977, + "grad_norm": 8.32559871673584, + "learning_rate": 3.568010433550138e-05, + "loss": 5.541, + "step": 3729 + }, + { + "epoch": 0.35969141755062684, + "grad_norm": 9.674389839172363, + "learning_rate": 3.5673255997674016e-05, + "loss": 5.7065, + "step": 3730 + }, + { + "epoch": 0.3597878495660559, + "grad_norm": 9.082944869995117, + "learning_rate": 3.5666406680268346e-05, + "loss": 6.0375, + "step": 3731 + }, + { + "epoch": 0.35988428158148505, + "grad_norm": 4.48557710647583, + "learning_rate": 3.565955638391301e-05, + "loss": 5.6209, + "step": 3732 + }, + { + "epoch": 0.3599807135969142, + "grad_norm": 5.836783409118652, + "learning_rate": 3.56527051092367e-05, + "loss": 5.3495, + "step": 3733 + }, + { + "epoch": 0.3600771456123433, + "grad_norm": 6.6480712890625, + "learning_rate": 3.5645852856868225e-05, + "loss": 5.6969, + "step": 3734 + }, + { + "epoch": 0.3601735776277724, + "grad_norm": 6.367638111114502, + "learning_rate": 3.563899962743649e-05, + "loss": 5.2511, + "step": 3735 + }, + { + "epoch": 0.36027000964320155, + "grad_norm": 5.861395835876465, + "learning_rate": 3.5632145421570456e-05, + "loss": 5.458, + "step": 3736 + }, + { + "epoch": 0.3603664416586307, + "grad_norm": 5.020025730133057, + "learning_rate": 3.56252902398992e-05, + "loss": 5.4795, + "step": 3737 + }, + { + "epoch": 0.36046287367405977, + "grad_norm": 3.5190558433532715, + "learning_rate": 3.561843408305188e-05, + "loss": 5.7019, + "step": 3738 + }, + { + "epoch": 0.3605593056894889, + "grad_norm": 4.556247234344482, + "learning_rate": 3.561157695165776e-05, + "loss": 5.8785, + "step": 3739 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 5.458324909210205, + "learning_rate": 3.5604718846346163e-05, + "loss": 5.8371, + "step": 3740 + }, + { + "epoch": 0.3607521697203472, + "grad_norm": 3.803964376449585, + "learning_rate": 3.5597859767746524e-05, + "loss": 5.5106, + "step": 3741 + }, + { + "epoch": 0.36084860173577626, + "grad_norm": 3.2495133876800537, + "learning_rate": 3.559099971648835e-05, + "loss": 5.2226, + "step": 3742 + }, + { + "epoch": 0.3609450337512054, + "grad_norm": 1.921447992324829, + "learning_rate": 3.558413869320127e-05, + "loss": 5.0764, + "step": 3743 + }, + { + "epoch": 0.36104146576663454, + "grad_norm": 3.0807645320892334, + "learning_rate": 3.557727669851496e-05, + "loss": 5.6222, + "step": 3744 + }, + { + "epoch": 0.3611378977820636, + "grad_norm": 3.224292516708374, + "learning_rate": 3.557041373305921e-05, + "loss": 5.5188, + "step": 3745 + }, + { + "epoch": 0.36123432979749276, + "grad_norm": 4.610684871673584, + "learning_rate": 3.556354979746391e-05, + "loss": 5.4833, + "step": 3746 + }, + { + "epoch": 0.3613307618129219, + "grad_norm": 2.89540433883667, + "learning_rate": 3.5556684892358996e-05, + "loss": 5.4409, + "step": 3747 + }, + { + "epoch": 0.36142719382835103, + "grad_norm": 3.692960739135742, + "learning_rate": 3.5549819018374545e-05, + "loss": 5.4363, + "step": 3748 + }, + { + "epoch": 0.3615236258437801, + "grad_norm": 5.065814018249512, + "learning_rate": 3.554295217614069e-05, + "loss": 5.393, + "step": 3749 + }, + { + "epoch": 0.36162005785920925, + "grad_norm": 3.640584707260132, + "learning_rate": 3.553608436628766e-05, + "loss": 5.4202, + "step": 3750 + }, + { + "epoch": 0.3617164898746384, + "grad_norm": 3.9628307819366455, + "learning_rate": 3.5529215589445776e-05, + "loss": 5.4814, + "step": 3751 + }, + { + "epoch": 0.36181292189006753, + "grad_norm": 3.582988977432251, + "learning_rate": 3.552234584624544e-05, + "loss": 5.4, + "step": 3752 + }, + { + "epoch": 0.3619093539054966, + "grad_norm": 2.884469747543335, + "learning_rate": 3.5515475137317153e-05, + "loss": 5.4028, + "step": 3753 + }, + { + "epoch": 0.36200578592092575, + "grad_norm": 2.719250440597534, + "learning_rate": 3.55086034632915e-05, + "loss": 5.4557, + "step": 3754 + }, + { + "epoch": 0.3621022179363549, + "grad_norm": 4.277882099151611, + "learning_rate": 3.550173082479916e-05, + "loss": 5.9742, + "step": 3755 + }, + { + "epoch": 0.36219864995178397, + "grad_norm": 3.5649237632751465, + "learning_rate": 3.5494857222470886e-05, + "loss": 5.9569, + "step": 3756 + }, + { + "epoch": 0.3622950819672131, + "grad_norm": 2.875197410583496, + "learning_rate": 3.548798265693754e-05, + "loss": 5.9583, + "step": 3757 + }, + { + "epoch": 0.36239151398264224, + "grad_norm": 4.68684720993042, + "learning_rate": 3.5481107128830046e-05, + "loss": 5.5538, + "step": 3758 + }, + { + "epoch": 0.3624879459980714, + "grad_norm": 3.742858648300171, + "learning_rate": 3.5474230638779443e-05, + "loss": 5.5452, + "step": 3759 + }, + { + "epoch": 0.36258437801350046, + "grad_norm": 5.714643955230713, + "learning_rate": 3.546735318741684e-05, + "loss": 5.857, + "step": 3760 + }, + { + "epoch": 0.3626808100289296, + "grad_norm": 3.8855459690093994, + "learning_rate": 3.546047477537345e-05, + "loss": 5.7583, + "step": 3761 + }, + { + "epoch": 0.36277724204435874, + "grad_norm": 6.040802955627441, + "learning_rate": 3.545359540328056e-05, + "loss": 5.2243, + "step": 3762 + }, + { + "epoch": 0.3628736740597879, + "grad_norm": 4.460951805114746, + "learning_rate": 3.544671507176955e-05, + "loss": 5.5008, + "step": 3763 + }, + { + "epoch": 0.36297010607521696, + "grad_norm": 3.740096092224121, + "learning_rate": 3.543983378147189e-05, + "loss": 5.6767, + "step": 3764 + }, + { + "epoch": 0.3630665380906461, + "grad_norm": 2.355501651763916, + "learning_rate": 3.5432951533019135e-05, + "loss": 5.526, + "step": 3765 + }, + { + "epoch": 0.36316297010607523, + "grad_norm": 3.4431235790252686, + "learning_rate": 3.542606832704292e-05, + "loss": 5.8347, + "step": 3766 + }, + { + "epoch": 0.3632594021215043, + "grad_norm": 3.868166923522949, + "learning_rate": 3.541918416417499e-05, + "loss": 5.5786, + "step": 3767 + }, + { + "epoch": 0.36335583413693345, + "grad_norm": 2.7445099353790283, + "learning_rate": 3.541229904504717e-05, + "loss": 5.5988, + "step": 3768 + }, + { + "epoch": 0.3634522661523626, + "grad_norm": 2.9302914142608643, + "learning_rate": 3.540541297029134e-05, + "loss": 5.8615, + "step": 3769 + }, + { + "epoch": 0.36354869816779173, + "grad_norm": 2.4624016284942627, + "learning_rate": 3.5398525940539535e-05, + "loss": 5.5096, + "step": 3770 + }, + { + "epoch": 0.3636451301832208, + "grad_norm": 2.2218661308288574, + "learning_rate": 3.5391637956423806e-05, + "loss": 5.8806, + "step": 3771 + }, + { + "epoch": 0.36374156219864995, + "grad_norm": 3.03620982170105, + "learning_rate": 3.538474901857635e-05, + "loss": 5.9519, + "step": 3772 + }, + { + "epoch": 0.3638379942140791, + "grad_norm": 2.7013113498687744, + "learning_rate": 3.537785912762939e-05, + "loss": 5.7337, + "step": 3773 + }, + { + "epoch": 0.3639344262295082, + "grad_norm": 2.48042368888855, + "learning_rate": 3.537096828421531e-05, + "loss": 5.9526, + "step": 3774 + }, + { + "epoch": 0.3640308582449373, + "grad_norm": 2.7399637699127197, + "learning_rate": 3.5364076488966516e-05, + "loss": 5.1384, + "step": 3775 + }, + { + "epoch": 0.36412729026036644, + "grad_norm": 3.314091444015503, + "learning_rate": 3.535718374251554e-05, + "loss": 5.4117, + "step": 3776 + }, + { + "epoch": 0.3642237222757956, + "grad_norm": 4.11826229095459, + "learning_rate": 3.5350290045494985e-05, + "loss": 5.521, + "step": 3777 + }, + { + "epoch": 0.36432015429122466, + "grad_norm": 2.8709287643432617, + "learning_rate": 3.534339539853755e-05, + "loss": 5.5202, + "step": 3778 + }, + { + "epoch": 0.3644165863066538, + "grad_norm": 2.2105748653411865, + "learning_rate": 3.5336499802276025e-05, + "loss": 5.6121, + "step": 3779 + }, + { + "epoch": 0.36451301832208294, + "grad_norm": 2.3675734996795654, + "learning_rate": 3.5329603257343256e-05, + "loss": 5.2776, + "step": 3780 + }, + { + "epoch": 0.3646094503375121, + "grad_norm": 2.9349734783172607, + "learning_rate": 3.532270576437222e-05, + "loss": 5.7029, + "step": 3781 + }, + { + "epoch": 0.36470588235294116, + "grad_norm": 3.872288465499878, + "learning_rate": 3.5315807323995954e-05, + "loss": 5.7077, + "step": 3782 + }, + { + "epoch": 0.3648023143683703, + "grad_norm": 3.246687650680542, + "learning_rate": 3.5308907936847594e-05, + "loss": 5.2925, + "step": 3783 + }, + { + "epoch": 0.36489874638379943, + "grad_norm": 4.078927993774414, + "learning_rate": 3.530200760356034e-05, + "loss": 5.7486, + "step": 3784 + }, + { + "epoch": 0.36499517839922857, + "grad_norm": 2.5861880779266357, + "learning_rate": 3.529510632476752e-05, + "loss": 5.6288, + "step": 3785 + }, + { + "epoch": 0.36509161041465765, + "grad_norm": 3.335150718688965, + "learning_rate": 3.5288204101102506e-05, + "loss": 5.5335, + "step": 3786 + }, + { + "epoch": 0.3651880424300868, + "grad_norm": 3.8498857021331787, + "learning_rate": 3.528130093319879e-05, + "loss": 5.3854, + "step": 3787 + }, + { + "epoch": 0.3652844744455159, + "grad_norm": 3.1595332622528076, + "learning_rate": 3.5274396821689916e-05, + "loss": 5.583, + "step": 3788 + }, + { + "epoch": 0.365380906460945, + "grad_norm": 2.67470383644104, + "learning_rate": 3.5267491767209554e-05, + "loss": 5.5874, + "step": 3789 + }, + { + "epoch": 0.36547733847637415, + "grad_norm": 2.6194708347320557, + "learning_rate": 3.526058577039143e-05, + "loss": 5.5384, + "step": 3790 + }, + { + "epoch": 0.3655737704918033, + "grad_norm": 2.372756242752075, + "learning_rate": 3.525367883186938e-05, + "loss": 5.6561, + "step": 3791 + }, + { + "epoch": 0.3656702025072324, + "grad_norm": 2.560392379760742, + "learning_rate": 3.52467709522773e-05, + "loss": 5.6415, + "step": 3792 + }, + { + "epoch": 0.3657666345226615, + "grad_norm": 2.7225215435028076, + "learning_rate": 3.52398621322492e-05, + "loss": 5.4341, + "step": 3793 + }, + { + "epoch": 0.36586306653809064, + "grad_norm": 2.3629982471466064, + "learning_rate": 3.523295237241916e-05, + "loss": 5.5815, + "step": 3794 + }, + { + "epoch": 0.3659594985535198, + "grad_norm": 6.638643741607666, + "learning_rate": 3.522604167342134e-05, + "loss": 4.7007, + "step": 3795 + }, + { + "epoch": 0.3660559305689489, + "grad_norm": 4.120229244232178, + "learning_rate": 3.521913003589e-05, + "loss": 4.6726, + "step": 3796 + }, + { + "epoch": 0.366152362584378, + "grad_norm": 6.44297981262207, + "learning_rate": 3.5212217460459485e-05, + "loss": 5.1996, + "step": 3797 + }, + { + "epoch": 0.36624879459980714, + "grad_norm": 5.572206497192383, + "learning_rate": 3.520530394776422e-05, + "loss": 5.0507, + "step": 3798 + }, + { + "epoch": 0.3663452266152363, + "grad_norm": 2.390824556350708, + "learning_rate": 3.519838949843872e-05, + "loss": 4.9654, + "step": 3799 + }, + { + "epoch": 0.36644165863066536, + "grad_norm": 3.112222671508789, + "learning_rate": 3.519147411311758e-05, + "loss": 5.4671, + "step": 3800 + }, + { + "epoch": 0.3665380906460945, + "grad_norm": 4.017904758453369, + "learning_rate": 3.5184557792435505e-05, + "loss": 5.5641, + "step": 3801 + }, + { + "epoch": 0.36663452266152363, + "grad_norm": 2.502396583557129, + "learning_rate": 3.517764053702723e-05, + "loss": 5.5584, + "step": 3802 + }, + { + "epoch": 0.36673095467695277, + "grad_norm": 2.637723684310913, + "learning_rate": 3.517072234752765e-05, + "loss": 5.4739, + "step": 3803 + }, + { + "epoch": 0.36682738669238185, + "grad_norm": 2.9124083518981934, + "learning_rate": 3.5163803224571675e-05, + "loss": 5.576, + "step": 3804 + }, + { + "epoch": 0.366923818707811, + "grad_norm": 2.2038118839263916, + "learning_rate": 3.515688316879436e-05, + "loss": 5.644, + "step": 3805 + }, + { + "epoch": 0.3670202507232401, + "grad_norm": 3.3932418823242188, + "learning_rate": 3.514996218083081e-05, + "loss": 5.647, + "step": 3806 + }, + { + "epoch": 0.36711668273866926, + "grad_norm": 3.646639108657837, + "learning_rate": 3.5143040261316225e-05, + "loss": 5.8012, + "step": 3807 + }, + { + "epoch": 0.36721311475409835, + "grad_norm": 2.7874016761779785, + "learning_rate": 3.513611741088588e-05, + "loss": 5.6519, + "step": 3808 + }, + { + "epoch": 0.3673095467695275, + "grad_norm": 4.006798267364502, + "learning_rate": 3.512919363017516e-05, + "loss": 5.7577, + "step": 3809 + }, + { + "epoch": 0.3674059787849566, + "grad_norm": 5.209338665008545, + "learning_rate": 3.512226891981951e-05, + "loss": 5.7272, + "step": 3810 + }, + { + "epoch": 0.3675024108003857, + "grad_norm": 2.8081958293914795, + "learning_rate": 3.511534328045449e-05, + "loss": 5.7551, + "step": 3811 + }, + { + "epoch": 0.36759884281581484, + "grad_norm": 3.3728010654449463, + "learning_rate": 3.51084167127157e-05, + "loss": 5.3362, + "step": 3812 + }, + { + "epoch": 0.367695274831244, + "grad_norm": 4.989809036254883, + "learning_rate": 3.510148921723888e-05, + "loss": 5.4775, + "step": 3813 + }, + { + "epoch": 0.3677917068466731, + "grad_norm": 5.025696277618408, + "learning_rate": 3.50945607946598e-05, + "loss": 5.5876, + "step": 3814 + }, + { + "epoch": 0.3678881388621022, + "grad_norm": 3.911630868911743, + "learning_rate": 3.5087631445614354e-05, + "loss": 5.664, + "step": 3815 + }, + { + "epoch": 0.36798457087753134, + "grad_norm": 2.1593286991119385, + "learning_rate": 3.508070117073852e-05, + "loss": 5.8723, + "step": 3816 + }, + { + "epoch": 0.3680810028929605, + "grad_norm": 2.825632333755493, + "learning_rate": 3.507376997066833e-05, + "loss": 5.7914, + "step": 3817 + }, + { + "epoch": 0.3681774349083896, + "grad_norm": 4.206254959106445, + "learning_rate": 3.5066837846039936e-05, + "loss": 5.5168, + "step": 3818 + }, + { + "epoch": 0.3682738669238187, + "grad_norm": 3.851815938949585, + "learning_rate": 3.5059904797489554e-05, + "loss": 5.4278, + "step": 3819 + }, + { + "epoch": 0.36837029893924783, + "grad_norm": 3.264936923980713, + "learning_rate": 3.505297082565349e-05, + "loss": 5.4044, + "step": 3820 + }, + { + "epoch": 0.36846673095467697, + "grad_norm": 3.824169397354126, + "learning_rate": 3.504603593116814e-05, + "loss": 5.6186, + "step": 3821 + }, + { + "epoch": 0.36856316297010605, + "grad_norm": 4.190293312072754, + "learning_rate": 3.503910011466997e-05, + "loss": 5.913, + "step": 3822 + }, + { + "epoch": 0.3686595949855352, + "grad_norm": 2.836749315261841, + "learning_rate": 3.503216337679556e-05, + "loss": 5.9896, + "step": 3823 + }, + { + "epoch": 0.3687560270009643, + "grad_norm": 3.5375754833221436, + "learning_rate": 3.502522571818153e-05, + "loss": 5.5168, + "step": 3824 + }, + { + "epoch": 0.36885245901639346, + "grad_norm": 4.397594451904297, + "learning_rate": 3.501828713946463e-05, + "loss": 5.6003, + "step": 3825 + }, + { + "epoch": 0.36894889103182255, + "grad_norm": 5.19051456451416, + "learning_rate": 3.501134764128167e-05, + "loss": 5.5855, + "step": 3826 + }, + { + "epoch": 0.3690453230472517, + "grad_norm": 3.9817373752593994, + "learning_rate": 3.500440722426954e-05, + "loss": 5.575, + "step": 3827 + }, + { + "epoch": 0.3691417550626808, + "grad_norm": 5.643368721008301, + "learning_rate": 3.499746588906523e-05, + "loss": 5.5517, + "step": 3828 + }, + { + "epoch": 0.36923818707810996, + "grad_norm": 5.306240081787109, + "learning_rate": 3.49905236363058e-05, + "loss": 5.4234, + "step": 3829 + }, + { + "epoch": 0.36933461909353904, + "grad_norm": 5.690130710601807, + "learning_rate": 3.4983580466628404e-05, + "loss": 5.621, + "step": 3830 + }, + { + "epoch": 0.3694310511089682, + "grad_norm": 5.800205707550049, + "learning_rate": 3.497663638067029e-05, + "loss": 5.4992, + "step": 3831 + }, + { + "epoch": 0.3695274831243973, + "grad_norm": 4.046509742736816, + "learning_rate": 3.496969137906877e-05, + "loss": 5.6191, + "step": 3832 + }, + { + "epoch": 0.3696239151398264, + "grad_norm": 4.391824245452881, + "learning_rate": 3.496274546246124e-05, + "loss": 5.9517, + "step": 3833 + }, + { + "epoch": 0.36972034715525554, + "grad_norm": 3.856318473815918, + "learning_rate": 3.4955798631485196e-05, + "loss": 5.48, + "step": 3834 + }, + { + "epoch": 0.3698167791706847, + "grad_norm": 4.529941082000732, + "learning_rate": 3.49488508867782e-05, + "loss": 5.6984, + "step": 3835 + }, + { + "epoch": 0.3699132111861138, + "grad_norm": 4.047680377960205, + "learning_rate": 3.494190222897793e-05, + "loss": 5.5874, + "step": 3836 + }, + { + "epoch": 0.3700096432015429, + "grad_norm": 3.5656745433807373, + "learning_rate": 3.49349526587221e-05, + "loss": 5.5109, + "step": 3837 + }, + { + "epoch": 0.37010607521697203, + "grad_norm": 4.189818859100342, + "learning_rate": 3.4928002176648545e-05, + "loss": 6.0749, + "step": 3838 + }, + { + "epoch": 0.37020250723240117, + "grad_norm": 4.058129787445068, + "learning_rate": 3.492105078339517e-05, + "loss": 5.8401, + "step": 3839 + }, + { + "epoch": 0.3702989392478303, + "grad_norm": 4.6549506187438965, + "learning_rate": 3.491409847959996e-05, + "loss": 5.4883, + "step": 3840 + }, + { + "epoch": 0.3703953712632594, + "grad_norm": 3.2141783237457275, + "learning_rate": 3.4907145265900996e-05, + "loss": 5.7764, + "step": 3841 + }, + { + "epoch": 0.3704918032786885, + "grad_norm": 3.4147086143493652, + "learning_rate": 3.490019114293644e-05, + "loss": 5.9214, + "step": 3842 + }, + { + "epoch": 0.37058823529411766, + "grad_norm": 2.404242753982544, + "learning_rate": 3.489323611134452e-05, + "loss": 5.6894, + "step": 3843 + }, + { + "epoch": 0.37068466730954674, + "grad_norm": 2.7655069828033447, + "learning_rate": 3.488628017176356e-05, + "loss": 5.6603, + "step": 3844 + }, + { + "epoch": 0.3707810993249759, + "grad_norm": 2.4752330780029297, + "learning_rate": 3.487932332483199e-05, + "loss": 5.5576, + "step": 3845 + }, + { + "epoch": 0.370877531340405, + "grad_norm": 2.7192044258117676, + "learning_rate": 3.487236557118828e-05, + "loss": 5.5805, + "step": 3846 + }, + { + "epoch": 0.37097396335583416, + "grad_norm": 1.8631010055541992, + "learning_rate": 3.4865406911471e-05, + "loss": 5.8241, + "step": 3847 + }, + { + "epoch": 0.37107039537126324, + "grad_norm": 2.2940731048583984, + "learning_rate": 3.485844734631882e-05, + "loss": 5.9798, + "step": 3848 + }, + { + "epoch": 0.3711668273866924, + "grad_norm": 2.0954675674438477, + "learning_rate": 3.4851486876370474e-05, + "loss": 5.9374, + "step": 3849 + }, + { + "epoch": 0.3712632594021215, + "grad_norm": 3.2687976360321045, + "learning_rate": 3.484452550226479e-05, + "loss": 5.3269, + "step": 3850 + }, + { + "epoch": 0.37135969141755065, + "grad_norm": 2.804288387298584, + "learning_rate": 3.483756322464067e-05, + "loss": 5.5462, + "step": 3851 + }, + { + "epoch": 0.37145612343297973, + "grad_norm": 2.48767352104187, + "learning_rate": 3.483060004413711e-05, + "loss": 5.6823, + "step": 3852 + }, + { + "epoch": 0.37155255544840887, + "grad_norm": 3.0890989303588867, + "learning_rate": 3.4823635961393175e-05, + "loss": 5.5867, + "step": 3853 + }, + { + "epoch": 0.371648987463838, + "grad_norm": 3.494474411010742, + "learning_rate": 3.481667097704802e-05, + "loss": 5.8366, + "step": 3854 + }, + { + "epoch": 0.3717454194792671, + "grad_norm": 2.728492021560669, + "learning_rate": 3.480970509174089e-05, + "loss": 5.6956, + "step": 3855 + }, + { + "epoch": 0.37184185149469623, + "grad_norm": 3.6856749057769775, + "learning_rate": 3.48027383061111e-05, + "loss": 5.5422, + "step": 3856 + }, + { + "epoch": 0.37193828351012537, + "grad_norm": 3.5633671283721924, + "learning_rate": 3.479577062079805e-05, + "loss": 5.7798, + "step": 3857 + }, + { + "epoch": 0.3720347155255545, + "grad_norm": 2.8018953800201416, + "learning_rate": 3.4788802036441235e-05, + "loss": 5.616, + "step": 3858 + }, + { + "epoch": 0.3721311475409836, + "grad_norm": 3.0900111198425293, + "learning_rate": 3.478183255368021e-05, + "loss": 5.5232, + "step": 3859 + }, + { + "epoch": 0.3722275795564127, + "grad_norm": 4.232784748077393, + "learning_rate": 3.4774862173154636e-05, + "loss": 5.4705, + "step": 3860 + }, + { + "epoch": 0.37232401157184186, + "grad_norm": 3.1680350303649902, + "learning_rate": 3.476789089550425e-05, + "loss": 5.2902, + "step": 3861 + }, + { + "epoch": 0.372420443587271, + "grad_norm": 3.785998582839966, + "learning_rate": 3.4760918721368844e-05, + "loss": 5.4634, + "step": 3862 + }, + { + "epoch": 0.3725168756027001, + "grad_norm": 4.1176228523254395, + "learning_rate": 3.475394565138834e-05, + "loss": 5.8668, + "step": 3863 + }, + { + "epoch": 0.3726133076181292, + "grad_norm": 3.111459255218506, + "learning_rate": 3.4746971686202714e-05, + "loss": 5.5248, + "step": 3864 + }, + { + "epoch": 0.37270973963355836, + "grad_norm": 3.618481159210205, + "learning_rate": 3.473999682645202e-05, + "loss": 5.3264, + "step": 3865 + }, + { + "epoch": 0.37280617164898744, + "grad_norm": 3.807424783706665, + "learning_rate": 3.473302107277639e-05, + "loss": 5.6555, + "step": 3866 + }, + { + "epoch": 0.3729026036644166, + "grad_norm": 3.541393995285034, + "learning_rate": 3.472604442581609e-05, + "loss": 5.6385, + "step": 3867 + }, + { + "epoch": 0.3729990356798457, + "grad_norm": 2.687474012374878, + "learning_rate": 3.471906688621139e-05, + "loss": 5.8449, + "step": 3868 + }, + { + "epoch": 0.37309546769527485, + "grad_norm": 4.06566047668457, + "learning_rate": 3.47120884546027e-05, + "loss": 5.9302, + "step": 3869 + }, + { + "epoch": 0.37319189971070393, + "grad_norm": 3.295748472213745, + "learning_rate": 3.470510913163048e-05, + "loss": 5.8107, + "step": 3870 + }, + { + "epoch": 0.37328833172613307, + "grad_norm": 3.120363235473633, + "learning_rate": 3.469812891793529e-05, + "loss": 5.8341, + "step": 3871 + }, + { + "epoch": 0.3733847637415622, + "grad_norm": 2.704383611679077, + "learning_rate": 3.469114781415777e-05, + "loss": 5.8027, + "step": 3872 + }, + { + "epoch": 0.37348119575699135, + "grad_norm": 2.4810359477996826, + "learning_rate": 3.468416582093862e-05, + "loss": 5.598, + "step": 3873 + }, + { + "epoch": 0.37357762777242043, + "grad_norm": 2.9362568855285645, + "learning_rate": 3.467718293891865e-05, + "loss": 5.7572, + "step": 3874 + }, + { + "epoch": 0.37367405978784957, + "grad_norm": 2.7162129878997803, + "learning_rate": 3.467019916873875e-05, + "loss": 5.7313, + "step": 3875 + }, + { + "epoch": 0.3737704918032787, + "grad_norm": 2.3766608238220215, + "learning_rate": 3.466321451103987e-05, + "loss": 5.643, + "step": 3876 + }, + { + "epoch": 0.3738669238187078, + "grad_norm": 2.388240337371826, + "learning_rate": 3.465622896646305e-05, + "loss": 5.649, + "step": 3877 + }, + { + "epoch": 0.3739633558341369, + "grad_norm": 2.743987798690796, + "learning_rate": 3.464924253564943e-05, + "loss": 5.5747, + "step": 3878 + }, + { + "epoch": 0.37405978784956606, + "grad_norm": 2.314847946166992, + "learning_rate": 3.4642255219240196e-05, + "loss": 5.3551, + "step": 3879 + }, + { + "epoch": 0.3741562198649952, + "grad_norm": 2.199894905090332, + "learning_rate": 3.463526701787665e-05, + "loss": 5.6935, + "step": 3880 + }, + { + "epoch": 0.3742526518804243, + "grad_norm": 2.745969772338867, + "learning_rate": 3.462827793220016e-05, + "loss": 5.5538, + "step": 3881 + }, + { + "epoch": 0.3743490838958534, + "grad_norm": 2.683015823364258, + "learning_rate": 3.4621287962852164e-05, + "loss": 5.6886, + "step": 3882 + }, + { + "epoch": 0.37444551591128256, + "grad_norm": 2.099297285079956, + "learning_rate": 3.461429711047421e-05, + "loss": 5.5517, + "step": 3883 + }, + { + "epoch": 0.3745419479267117, + "grad_norm": 3.027846574783325, + "learning_rate": 3.460730537570789e-05, + "loss": 5.5518, + "step": 3884 + }, + { + "epoch": 0.3746383799421408, + "grad_norm": 3.0830090045928955, + "learning_rate": 3.460031275919492e-05, + "loss": 5.421, + "step": 3885 + }, + { + "epoch": 0.3747348119575699, + "grad_norm": 1.9466642141342163, + "learning_rate": 3.4593319261577054e-05, + "loss": 5.0897, + "step": 3886 + }, + { + "epoch": 0.37483124397299905, + "grad_norm": 1.846730351448059, + "learning_rate": 3.458632488349616e-05, + "loss": 5.044, + "step": 3887 + }, + { + "epoch": 0.37492767598842813, + "grad_norm": 2.2237777709960938, + "learning_rate": 3.4579329625594156e-05, + "loss": 5.2424, + "step": 3888 + }, + { + "epoch": 0.37502410800385727, + "grad_norm": 3.1555089950561523, + "learning_rate": 3.457233348851308e-05, + "loss": 5.4211, + "step": 3889 + }, + { + "epoch": 0.3751205400192864, + "grad_norm": 2.0908875465393066, + "learning_rate": 3.4565336472895e-05, + "loss": 5.701, + "step": 3890 + }, + { + "epoch": 0.37521697203471555, + "grad_norm": 2.3113198280334473, + "learning_rate": 3.455833857938213e-05, + "loss": 5.6599, + "step": 3891 + }, + { + "epoch": 0.37531340405014463, + "grad_norm": 2.268914222717285, + "learning_rate": 3.45513398086167e-05, + "loss": 5.6111, + "step": 3892 + }, + { + "epoch": 0.37540983606557377, + "grad_norm": 2.53668212890625, + "learning_rate": 3.4544340161241066e-05, + "loss": 5.7464, + "step": 3893 + }, + { + "epoch": 0.3755062680810029, + "grad_norm": 2.0670840740203857, + "learning_rate": 3.453733963789764e-05, + "loss": 5.712, + "step": 3894 + }, + { + "epoch": 0.37560270009643204, + "grad_norm": 2.684574842453003, + "learning_rate": 3.453033823922891e-05, + "loss": 5.6914, + "step": 3895 + }, + { + "epoch": 0.3756991321118611, + "grad_norm": 2.0992026329040527, + "learning_rate": 3.452333596587747e-05, + "loss": 5.8877, + "step": 3896 + }, + { + "epoch": 0.37579556412729026, + "grad_norm": 2.451260566711426, + "learning_rate": 3.451633281848597e-05, + "loss": 5.6042, + "step": 3897 + }, + { + "epoch": 0.3758919961427194, + "grad_norm": 2.2941510677337646, + "learning_rate": 3.450932879769717e-05, + "loss": 5.6335, + "step": 3898 + }, + { + "epoch": 0.3759884281581485, + "grad_norm": 2.74711275100708, + "learning_rate": 3.4502323904153874e-05, + "loss": 6.041, + "step": 3899 + }, + { + "epoch": 0.3760848601735776, + "grad_norm": 2.2964794635772705, + "learning_rate": 3.4495318138498985e-05, + "loss": 5.8739, + "step": 3900 + }, + { + "epoch": 0.37618129218900676, + "grad_norm": 3.842428684234619, + "learning_rate": 3.448831150137548e-05, + "loss": 5.6698, + "step": 3901 + }, + { + "epoch": 0.3762777242044359, + "grad_norm": 3.2527761459350586, + "learning_rate": 3.448130399342643e-05, + "loss": 5.5568, + "step": 3902 + }, + { + "epoch": 0.376374156219865, + "grad_norm": 3.256500720977783, + "learning_rate": 3.447429561529496e-05, + "loss": 5.2882, + "step": 3903 + }, + { + "epoch": 0.3764705882352941, + "grad_norm": 3.206212282180786, + "learning_rate": 3.446728636762431e-05, + "loss": 5.3045, + "step": 3904 + }, + { + "epoch": 0.37656702025072325, + "grad_norm": 5.099926948547363, + "learning_rate": 3.446027625105776e-05, + "loss": 5.2595, + "step": 3905 + }, + { + "epoch": 0.3766634522661524, + "grad_norm": 3.302608013153076, + "learning_rate": 3.44532652662387e-05, + "loss": 5.5534, + "step": 3906 + }, + { + "epoch": 0.37675988428158147, + "grad_norm": 4.38598108291626, + "learning_rate": 3.4446253413810595e-05, + "loss": 5.5158, + "step": 3907 + }, + { + "epoch": 0.3768563162970106, + "grad_norm": 3.4187920093536377, + "learning_rate": 3.443924069441697e-05, + "loss": 5.8401, + "step": 3908 + }, + { + "epoch": 0.37695274831243974, + "grad_norm": 2.9149091243743896, + "learning_rate": 3.4432227108701465e-05, + "loss": 5.9411, + "step": 3909 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 2.888810634613037, + "learning_rate": 3.4425212657307755e-05, + "loss": 5.7614, + "step": 3910 + }, + { + "epoch": 0.37714561234329796, + "grad_norm": 3.110914707183838, + "learning_rate": 3.4418197340879635e-05, + "loss": 5.6945, + "step": 3911 + }, + { + "epoch": 0.3772420443587271, + "grad_norm": 2.345407247543335, + "learning_rate": 3.4411181160060944e-05, + "loss": 5.6653, + "step": 3912 + }, + { + "epoch": 0.37733847637415624, + "grad_norm": 3.993349313735962, + "learning_rate": 3.440416411549564e-05, + "loss": 5.3031, + "step": 3913 + }, + { + "epoch": 0.3774349083895853, + "grad_norm": 4.542987823486328, + "learning_rate": 3.439714620782772e-05, + "loss": 5.7164, + "step": 3914 + }, + { + "epoch": 0.37753134040501446, + "grad_norm": 3.8353257179260254, + "learning_rate": 3.439012743770129e-05, + "loss": 5.7566, + "step": 3915 + }, + { + "epoch": 0.3776277724204436, + "grad_norm": 2.8371894359588623, + "learning_rate": 3.438310780576052e-05, + "loss": 5.9446, + "step": 3916 + }, + { + "epoch": 0.37772420443587273, + "grad_norm": 2.10927152633667, + "learning_rate": 3.4376087312649675e-05, + "loss": 5.7588, + "step": 3917 + }, + { + "epoch": 0.3778206364513018, + "grad_norm": 3.3414359092712402, + "learning_rate": 3.436906595901306e-05, + "loss": 5.8255, + "step": 3918 + }, + { + "epoch": 0.37791706846673095, + "grad_norm": 3.7450358867645264, + "learning_rate": 3.436204374549511e-05, + "loss": 5.7703, + "step": 3919 + }, + { + "epoch": 0.3780135004821601, + "grad_norm": 3.8962767124176025, + "learning_rate": 3.435502067274031e-05, + "loss": 5.3803, + "step": 3920 + }, + { + "epoch": 0.3781099324975892, + "grad_norm": 3.0666298866271973, + "learning_rate": 3.434799674139322e-05, + "loss": 5.5387, + "step": 3921 + }, + { + "epoch": 0.3782063645130183, + "grad_norm": 3.531798839569092, + "learning_rate": 3.4340971952098503e-05, + "loss": 5.4277, + "step": 3922 + }, + { + "epoch": 0.37830279652844745, + "grad_norm": 4.0507001876831055, + "learning_rate": 3.4333946305500877e-05, + "loss": 5.5324, + "step": 3923 + }, + { + "epoch": 0.3783992285438766, + "grad_norm": 4.509546756744385, + "learning_rate": 3.432691980224514e-05, + "loss": 5.4326, + "step": 3924 + }, + { + "epoch": 0.37849566055930567, + "grad_norm": 4.582789421081543, + "learning_rate": 3.431989244297619e-05, + "loss": 5.8225, + "step": 3925 + }, + { + "epoch": 0.3785920925747348, + "grad_norm": 3.0846681594848633, + "learning_rate": 3.4312864228338984e-05, + "loss": 5.6667, + "step": 3926 + }, + { + "epoch": 0.37868852459016394, + "grad_norm": 2.2187838554382324, + "learning_rate": 3.430583515897856e-05, + "loss": 5.661, + "step": 3927 + }, + { + "epoch": 0.3787849566055931, + "grad_norm": 2.242227077484131, + "learning_rate": 3.429880523554003e-05, + "loss": 5.9471, + "step": 3928 + }, + { + "epoch": 0.37888138862102216, + "grad_norm": 2.6028010845184326, + "learning_rate": 3.42917744586686e-05, + "loss": 5.684, + "step": 3929 + }, + { + "epoch": 0.3789778206364513, + "grad_norm": 2.235208749771118, + "learning_rate": 3.4284742829009566e-05, + "loss": 5.6116, + "step": 3930 + }, + { + "epoch": 0.37907425265188044, + "grad_norm": 2.05717134475708, + "learning_rate": 3.4277710347208244e-05, + "loss": 5.4827, + "step": 3931 + }, + { + "epoch": 0.3791706846673095, + "grad_norm": 2.230926752090454, + "learning_rate": 3.427067701391009e-05, + "loss": 5.3701, + "step": 3932 + }, + { + "epoch": 0.37926711668273866, + "grad_norm": 3.121218681335449, + "learning_rate": 3.4263642829760614e-05, + "loss": 5.3182, + "step": 3933 + }, + { + "epoch": 0.3793635486981678, + "grad_norm": 2.0425198078155518, + "learning_rate": 3.42566077954054e-05, + "loss": 5.0531, + "step": 3934 + }, + { + "epoch": 0.37945998071359693, + "grad_norm": 2.391866683959961, + "learning_rate": 3.424957191149011e-05, + "loss": 5.6294, + "step": 3935 + }, + { + "epoch": 0.379556412729026, + "grad_norm": 2.568824052810669, + "learning_rate": 3.42425351786605e-05, + "loss": 5.6418, + "step": 3936 + }, + { + "epoch": 0.37965284474445515, + "grad_norm": 2.346281051635742, + "learning_rate": 3.4235497597562384e-05, + "loss": 5.5846, + "step": 3937 + }, + { + "epoch": 0.3797492767598843, + "grad_norm": 2.6075844764709473, + "learning_rate": 3.422845916884166e-05, + "loss": 5.6217, + "step": 3938 + }, + { + "epoch": 0.37984570877531343, + "grad_norm": 3.1511268615722656, + "learning_rate": 3.422141989314432e-05, + "loss": 5.2656, + "step": 3939 + }, + { + "epoch": 0.3799421407907425, + "grad_norm": 2.581167221069336, + "learning_rate": 3.4214379771116405e-05, + "loss": 5.0327, + "step": 3940 + }, + { + "epoch": 0.38003857280617165, + "grad_norm": 2.043391704559326, + "learning_rate": 3.4207338803404067e-05, + "loss": 4.795, + "step": 3941 + }, + { + "epoch": 0.3801350048216008, + "grad_norm": 2.512082576751709, + "learning_rate": 3.4200296990653497e-05, + "loss": 4.7699, + "step": 3942 + }, + { + "epoch": 0.38023143683702987, + "grad_norm": 3.0732152462005615, + "learning_rate": 3.4193254333510994e-05, + "loss": 5.3047, + "step": 3943 + }, + { + "epoch": 0.380327868852459, + "grad_norm": 2.9129045009613037, + "learning_rate": 3.418621083262293e-05, + "loss": 5.5279, + "step": 3944 + }, + { + "epoch": 0.38042430086788814, + "grad_norm": 2.1921064853668213, + "learning_rate": 3.4179166488635736e-05, + "loss": 5.5497, + "step": 3945 + }, + { + "epoch": 0.3805207328833173, + "grad_norm": 2.533517360687256, + "learning_rate": 3.417212130219594e-05, + "loss": 5.5233, + "step": 3946 + }, + { + "epoch": 0.38061716489874636, + "grad_norm": 2.5110199451446533, + "learning_rate": 3.4165075273950134e-05, + "loss": 5.5869, + "step": 3947 + }, + { + "epoch": 0.3807135969141755, + "grad_norm": 3.177666664123535, + "learning_rate": 3.415802840454502e-05, + "loss": 5.4574, + "step": 3948 + }, + { + "epoch": 0.38081002892960464, + "grad_norm": 2.3526415824890137, + "learning_rate": 3.415098069462731e-05, + "loss": 5.522, + "step": 3949 + }, + { + "epoch": 0.3809064609450338, + "grad_norm": 2.483582019805908, + "learning_rate": 3.414393214484386e-05, + "loss": 5.7066, + "step": 3950 + }, + { + "epoch": 0.38100289296046286, + "grad_norm": 1.7759085893630981, + "learning_rate": 3.413688275584157e-05, + "loss": 5.7102, + "step": 3951 + }, + { + "epoch": 0.381099324975892, + "grad_norm": 2.2803080081939697, + "learning_rate": 3.412983252826743e-05, + "loss": 5.7215, + "step": 3952 + }, + { + "epoch": 0.38119575699132113, + "grad_norm": 2.3473634719848633, + "learning_rate": 3.412278146276851e-05, + "loss": 5.6384, + "step": 3953 + }, + { + "epoch": 0.3812921890067502, + "grad_norm": 3.108888626098633, + "learning_rate": 3.4115729559991915e-05, + "loss": 5.6368, + "step": 3954 + }, + { + "epoch": 0.38138862102217935, + "grad_norm": 3.309248924255371, + "learning_rate": 3.4108676820584894e-05, + "loss": 5.679, + "step": 3955 + }, + { + "epoch": 0.3814850530376085, + "grad_norm": 2.8737916946411133, + "learning_rate": 3.410162324519472e-05, + "loss": 5.6633, + "step": 3956 + }, + { + "epoch": 0.38158148505303763, + "grad_norm": 3.7020318508148193, + "learning_rate": 3.409456883446877e-05, + "loss": 5.6918, + "step": 3957 + }, + { + "epoch": 0.3816779170684667, + "grad_norm": 4.008025646209717, + "learning_rate": 3.408751358905448e-05, + "loss": 5.6644, + "step": 3958 + }, + { + "epoch": 0.38177434908389585, + "grad_norm": 2.6656103134155273, + "learning_rate": 3.408045750959938e-05, + "loss": 5.5788, + "step": 3959 + }, + { + "epoch": 0.381870781099325, + "grad_norm": 3.2502846717834473, + "learning_rate": 3.407340059675107e-05, + "loss": 5.1962, + "step": 3960 + }, + { + "epoch": 0.3819672131147541, + "grad_norm": 4.930192947387695, + "learning_rate": 3.406634285115721e-05, + "loss": 5.1008, + "step": 3961 + }, + { + "epoch": 0.3820636451301832, + "grad_norm": 2.631354331970215, + "learning_rate": 3.405928427346557e-05, + "loss": 5.4095, + "step": 3962 + }, + { + "epoch": 0.38216007714561234, + "grad_norm": 3.572439432144165, + "learning_rate": 3.405222486432397e-05, + "loss": 5.7605, + "step": 3963 + }, + { + "epoch": 0.3822565091610415, + "grad_norm": 6.190046310424805, + "learning_rate": 3.404516462438031e-05, + "loss": 5.6805, + "step": 3964 + }, + { + "epoch": 0.38235294117647056, + "grad_norm": 3.9542670249938965, + "learning_rate": 3.403810355428258e-05, + "loss": 5.7434, + "step": 3965 + }, + { + "epoch": 0.3824493731918997, + "grad_norm": 3.526855230331421, + "learning_rate": 3.403104165467883e-05, + "loss": 5.2463, + "step": 3966 + }, + { + "epoch": 0.38254580520732884, + "grad_norm": 3.9010956287384033, + "learning_rate": 3.402397892621719e-05, + "loss": 5.6073, + "step": 3967 + }, + { + "epoch": 0.382642237222758, + "grad_norm": 4.875743389129639, + "learning_rate": 3.401691536954586e-05, + "loss": 5.2082, + "step": 3968 + }, + { + "epoch": 0.38273866923818706, + "grad_norm": 4.622166156768799, + "learning_rate": 3.400985098531315e-05, + "loss": 5.5786, + "step": 3969 + }, + { + "epoch": 0.3828351012536162, + "grad_norm": 3.6193573474884033, + "learning_rate": 3.4002785774167404e-05, + "loss": 5.1433, + "step": 3970 + }, + { + "epoch": 0.38293153326904533, + "grad_norm": 3.551137685775757, + "learning_rate": 3.399571973675705e-05, + "loss": 5.3162, + "step": 3971 + }, + { + "epoch": 0.38302796528447447, + "grad_norm": 3.3760154247283936, + "learning_rate": 3.398865287373062e-05, + "loss": 5.4185, + "step": 3972 + }, + { + "epoch": 0.38312439729990355, + "grad_norm": 2.614119291305542, + "learning_rate": 3.3981585185736686e-05, + "loss": 6.0418, + "step": 3973 + }, + { + "epoch": 0.3832208293153327, + "grad_norm": 2.5954155921936035, + "learning_rate": 3.397451667342393e-05, + "loss": 5.918, + "step": 3974 + }, + { + "epoch": 0.3833172613307618, + "grad_norm": 2.152829170227051, + "learning_rate": 3.3967447337441065e-05, + "loss": 5.9296, + "step": 3975 + }, + { + "epoch": 0.3834136933461909, + "grad_norm": 2.9986276626586914, + "learning_rate": 3.396037717843693e-05, + "loss": 5.783, + "step": 3976 + }, + { + "epoch": 0.38351012536162005, + "grad_norm": 1.8780901432037354, + "learning_rate": 3.395330619706041e-05, + "loss": 5.6707, + "step": 3977 + }, + { + "epoch": 0.3836065573770492, + "grad_norm": 2.9543232917785645, + "learning_rate": 3.3946234393960455e-05, + "loss": 5.6706, + "step": 3978 + }, + { + "epoch": 0.3837029893924783, + "grad_norm": 3.5410001277923584, + "learning_rate": 3.393916176978612e-05, + "loss": 5.5256, + "step": 3979 + }, + { + "epoch": 0.3837994214079074, + "grad_norm": 2.941112995147705, + "learning_rate": 3.393208832518652e-05, + "loss": 5.5981, + "step": 3980 + }, + { + "epoch": 0.38389585342333654, + "grad_norm": 2.6342527866363525, + "learning_rate": 3.3925014060810856e-05, + "loss": 5.6911, + "step": 3981 + }, + { + "epoch": 0.3839922854387657, + "grad_norm": 3.0515787601470947, + "learning_rate": 3.3917938977308376e-05, + "loss": 5.4942, + "step": 3982 + }, + { + "epoch": 0.3840887174541948, + "grad_norm": 2.2314212322235107, + "learning_rate": 3.3910863075328435e-05, + "loss": 5.5306, + "step": 3983 + }, + { + "epoch": 0.3841851494696239, + "grad_norm": 2.762683153152466, + "learning_rate": 3.390378635552045e-05, + "loss": 5.7392, + "step": 3984 + }, + { + "epoch": 0.38428158148505304, + "grad_norm": 3.9214468002319336, + "learning_rate": 3.389670881853392e-05, + "loss": 5.6769, + "step": 3985 + }, + { + "epoch": 0.3843780135004822, + "grad_norm": 3.498608112335205, + "learning_rate": 3.3889630465018395e-05, + "loss": 5.7867, + "step": 3986 + }, + { + "epoch": 0.38447444551591126, + "grad_norm": 3.003823757171631, + "learning_rate": 3.388255129562353e-05, + "loss": 5.8095, + "step": 3987 + }, + { + "epoch": 0.3845708775313404, + "grad_norm": 2.897383213043213, + "learning_rate": 3.387547131099905e-05, + "loss": 5.8521, + "step": 3988 + }, + { + "epoch": 0.38466730954676953, + "grad_norm": 3.258641481399536, + "learning_rate": 3.3868390511794724e-05, + "loss": 5.9731, + "step": 3989 + }, + { + "epoch": 0.38476374156219867, + "grad_norm": 3.413020133972168, + "learning_rate": 3.3861308898660435e-05, + "loss": 5.2908, + "step": 3990 + }, + { + "epoch": 0.38486017357762775, + "grad_norm": 2.5997867584228516, + "learning_rate": 3.385422647224612e-05, + "loss": 5.0447, + "step": 3991 + }, + { + "epoch": 0.3849566055930569, + "grad_norm": 4.490036487579346, + "learning_rate": 3.38471432332018e-05, + "loss": 5.3435, + "step": 3992 + }, + { + "epoch": 0.385053037608486, + "grad_norm": 3.783010482788086, + "learning_rate": 3.3840059182177565e-05, + "loss": 5.4926, + "step": 3993 + }, + { + "epoch": 0.38514946962391516, + "grad_norm": 4.572159290313721, + "learning_rate": 3.3832974319823576e-05, + "loss": 5.2683, + "step": 3994 + }, + { + "epoch": 0.38524590163934425, + "grad_norm": 3.3619871139526367, + "learning_rate": 3.3825888646790074e-05, + "loss": 5.5841, + "step": 3995 + }, + { + "epoch": 0.3853423336547734, + "grad_norm": 2.855088949203491, + "learning_rate": 3.381880216372738e-05, + "loss": 5.1843, + "step": 3996 + }, + { + "epoch": 0.3854387656702025, + "grad_norm": 3.144944667816162, + "learning_rate": 3.3811714871285864e-05, + "loss": 5.705, + "step": 3997 + }, + { + "epoch": 0.3855351976856316, + "grad_norm": 3.2551348209381104, + "learning_rate": 3.3804626770116016e-05, + "loss": 5.1411, + "step": 3998 + }, + { + "epoch": 0.38563162970106074, + "grad_norm": 2.674509048461914, + "learning_rate": 3.3797537860868354e-05, + "loss": 5.6523, + "step": 3999 + }, + { + "epoch": 0.3857280617164899, + "grad_norm": 3.1957015991210938, + "learning_rate": 3.37904481441935e-05, + "loss": 5.702, + "step": 4000 + }, + { + "epoch": 0.385824493731919, + "grad_norm": 2.8175761699676514, + "learning_rate": 3.378335762074213e-05, + "loss": 5.6001, + "step": 4001 + }, + { + "epoch": 0.3859209257473481, + "grad_norm": 4.778249740600586, + "learning_rate": 3.377626629116501e-05, + "loss": 5.2118, + "step": 4002 + }, + { + "epoch": 0.38601735776277724, + "grad_norm": 4.647237777709961, + "learning_rate": 3.376917415611297e-05, + "loss": 5.8041, + "step": 4003 + }, + { + "epoch": 0.3861137897782064, + "grad_norm": 3.6643497943878174, + "learning_rate": 3.376208121623692e-05, + "loss": 5.3784, + "step": 4004 + }, + { + "epoch": 0.3862102217936355, + "grad_norm": 2.735495090484619, + "learning_rate": 3.3754987472187836e-05, + "loss": 5.3476, + "step": 4005 + }, + { + "epoch": 0.3863066538090646, + "grad_norm": 3.246772289276123, + "learning_rate": 3.374789292461679e-05, + "loss": 5.4538, + "step": 4006 + }, + { + "epoch": 0.38640308582449373, + "grad_norm": 3.475477933883667, + "learning_rate": 3.374079757417489e-05, + "loss": 5.2395, + "step": 4007 + }, + { + "epoch": 0.38649951783992287, + "grad_norm": 4.793619155883789, + "learning_rate": 3.373370142151335e-05, + "loss": 5.6473, + "step": 4008 + }, + { + "epoch": 0.38659594985535195, + "grad_norm": 2.5309553146362305, + "learning_rate": 3.372660446728343e-05, + "loss": 5.5889, + "step": 4009 + }, + { + "epoch": 0.3866923818707811, + "grad_norm": 2.6454107761383057, + "learning_rate": 3.371950671213651e-05, + "loss": 4.9761, + "step": 4010 + }, + { + "epoch": 0.3867888138862102, + "grad_norm": 3.1866679191589355, + "learning_rate": 3.3712408156723993e-05, + "loss": 4.9556, + "step": 4011 + }, + { + "epoch": 0.38688524590163936, + "grad_norm": 3.3625881671905518, + "learning_rate": 3.370530880169737e-05, + "loss": 5.536, + "step": 4012 + }, + { + "epoch": 0.38698167791706845, + "grad_norm": 3.031405210494995, + "learning_rate": 3.369820864770822e-05, + "loss": 5.5802, + "step": 4013 + }, + { + "epoch": 0.3870781099324976, + "grad_norm": 2.4261257648468018, + "learning_rate": 3.3691107695408194e-05, + "loss": 5.3689, + "step": 4014 + }, + { + "epoch": 0.3871745419479267, + "grad_norm": 2.543825149536133, + "learning_rate": 3.3684005945449006e-05, + "loss": 5.8279, + "step": 4015 + }, + { + "epoch": 0.38727097396335586, + "grad_norm": 2.5208699703216553, + "learning_rate": 3.3676903398482426e-05, + "loss": 5.722, + "step": 4016 + }, + { + "epoch": 0.38736740597878494, + "grad_norm": 1.9856246709823608, + "learning_rate": 3.366980005516034e-05, + "loss": 5.4316, + "step": 4017 + }, + { + "epoch": 0.3874638379942141, + "grad_norm": 2.135653018951416, + "learning_rate": 3.366269591613467e-05, + "loss": 5.2497, + "step": 4018 + }, + { + "epoch": 0.3875602700096432, + "grad_norm": 2.6133639812469482, + "learning_rate": 3.365559098205744e-05, + "loss": 5.2035, + "step": 4019 + }, + { + "epoch": 0.3876567020250723, + "grad_norm": 1.949495792388916, + "learning_rate": 3.364848525358071e-05, + "loss": 5.3366, + "step": 4020 + }, + { + "epoch": 0.38775313404050143, + "grad_norm": 2.5601391792297363, + "learning_rate": 3.364137873135665e-05, + "loss": 5.459, + "step": 4021 + }, + { + "epoch": 0.3878495660559306, + "grad_norm": 3.2551562786102295, + "learning_rate": 3.3634271416037496e-05, + "loss": 5.5208, + "step": 4022 + }, + { + "epoch": 0.3879459980713597, + "grad_norm": 1.7672137022018433, + "learning_rate": 3.362716330827554e-05, + "loss": 5.6241, + "step": 4023 + }, + { + "epoch": 0.3880424300867888, + "grad_norm": 2.367049217224121, + "learning_rate": 3.3620054408723146e-05, + "loss": 5.5399, + "step": 4024 + }, + { + "epoch": 0.38813886210221793, + "grad_norm": 2.2851054668426514, + "learning_rate": 3.361294471803277e-05, + "loss": 5.4281, + "step": 4025 + }, + { + "epoch": 0.38823529411764707, + "grad_norm": 3.2875828742980957, + "learning_rate": 3.360583423685692e-05, + "loss": 5.4238, + "step": 4026 + }, + { + "epoch": 0.3883317261330762, + "grad_norm": 2.416689395904541, + "learning_rate": 3.3598722965848204e-05, + "loss": 4.9698, + "step": 4027 + }, + { + "epoch": 0.3884281581485053, + "grad_norm": 3.918999671936035, + "learning_rate": 3.3591610905659275e-05, + "loss": 5.7336, + "step": 4028 + }, + { + "epoch": 0.3885245901639344, + "grad_norm": 2.4039227962493896, + "learning_rate": 3.358449805694288e-05, + "loss": 5.4973, + "step": 4029 + }, + { + "epoch": 0.38862102217936356, + "grad_norm": 2.2633252143859863, + "learning_rate": 3.357738442035181e-05, + "loss": 5.5125, + "step": 4030 + }, + { + "epoch": 0.38871745419479264, + "grad_norm": 2.5796406269073486, + "learning_rate": 3.357026999653895e-05, + "loss": 5.1994, + "step": 4031 + }, + { + "epoch": 0.3888138862102218, + "grad_norm": 1.8418258428573608, + "learning_rate": 3.356315478615728e-05, + "loss": 5.0773, + "step": 4032 + }, + { + "epoch": 0.3889103182256509, + "grad_norm": 2.029228925704956, + "learning_rate": 3.355603878985978e-05, + "loss": 5.0633, + "step": 4033 + }, + { + "epoch": 0.38900675024108006, + "grad_norm": 2.267949342727661, + "learning_rate": 3.354892200829958e-05, + "loss": 5.5321, + "step": 4034 + }, + { + "epoch": 0.38910318225650914, + "grad_norm": 2.166074752807617, + "learning_rate": 3.354180444212984e-05, + "loss": 5.3846, + "step": 4035 + }, + { + "epoch": 0.3891996142719383, + "grad_norm": 3.7718253135681152, + "learning_rate": 3.353468609200381e-05, + "loss": 5.348, + "step": 4036 + }, + { + "epoch": 0.3892960462873674, + "grad_norm": 2.1339709758758545, + "learning_rate": 3.352756695857478e-05, + "loss": 5.6282, + "step": 4037 + }, + { + "epoch": 0.38939247830279655, + "grad_norm": 3.3837692737579346, + "learning_rate": 3.352044704249616e-05, + "loss": 5.7381, + "step": 4038 + }, + { + "epoch": 0.38948891031822563, + "grad_norm": 2.7734761238098145, + "learning_rate": 3.351332634442139e-05, + "loss": 5.4615, + "step": 4039 + }, + { + "epoch": 0.38958534233365477, + "grad_norm": 4.7387566566467285, + "learning_rate": 3.350620486500402e-05, + "loss": 4.998, + "step": 4040 + }, + { + "epoch": 0.3896817743490839, + "grad_norm": 2.9183647632598877, + "learning_rate": 3.349908260489762e-05, + "loss": 4.7833, + "step": 4041 + }, + { + "epoch": 0.389778206364513, + "grad_norm": 2.707444429397583, + "learning_rate": 3.349195956475588e-05, + "loss": 5.215, + "step": 4042 + }, + { + "epoch": 0.38987463837994213, + "grad_norm": 2.865128755569458, + "learning_rate": 3.3484835745232554e-05, + "loss": 5.46, + "step": 4043 + }, + { + "epoch": 0.38997107039537127, + "grad_norm": 2.6374893188476562, + "learning_rate": 3.3477711146981436e-05, + "loss": 5.5292, + "step": 4044 + }, + { + "epoch": 0.3900675024108004, + "grad_norm": 3.181692123413086, + "learning_rate": 3.347058577065643e-05, + "loss": 5.6772, + "step": 4045 + }, + { + "epoch": 0.3901639344262295, + "grad_norm": 2.745508909225464, + "learning_rate": 3.346345961691149e-05, + "loss": 5.7221, + "step": 4046 + }, + { + "epoch": 0.3902603664416586, + "grad_norm": 2.7931346893310547, + "learning_rate": 3.345633268640064e-05, + "loss": 5.7299, + "step": 4047 + }, + { + "epoch": 0.39035679845708776, + "grad_norm": 2.159879684448242, + "learning_rate": 3.344920497977798e-05, + "loss": 5.8421, + "step": 4048 + }, + { + "epoch": 0.3904532304725169, + "grad_norm": 2.844418525695801, + "learning_rate": 3.344207649769769e-05, + "loss": 5.6467, + "step": 4049 + }, + { + "epoch": 0.390549662487946, + "grad_norm": 3.5481040477752686, + "learning_rate": 3.343494724081401e-05, + "loss": 5.4278, + "step": 4050 + }, + { + "epoch": 0.3906460945033751, + "grad_norm": 2.50339412689209, + "learning_rate": 3.342781720978126e-05, + "loss": 5.6956, + "step": 4051 + }, + { + "epoch": 0.39074252651880426, + "grad_norm": 2.3196513652801514, + "learning_rate": 3.342068640525381e-05, + "loss": 5.5916, + "step": 4052 + }, + { + "epoch": 0.39083895853423334, + "grad_norm": 2.253161668777466, + "learning_rate": 3.341355482788613e-05, + "loss": 5.6789, + "step": 4053 + }, + { + "epoch": 0.3909353905496625, + "grad_norm": 1.967070460319519, + "learning_rate": 3.3406422478332745e-05, + "loss": 5.5863, + "step": 4054 + }, + { + "epoch": 0.3910318225650916, + "grad_norm": 2.03579044342041, + "learning_rate": 3.339928935724825e-05, + "loss": 5.8217, + "step": 4055 + }, + { + "epoch": 0.39112825458052075, + "grad_norm": 2.483916759490967, + "learning_rate": 3.339215546528731e-05, + "loss": 5.8578, + "step": 4056 + }, + { + "epoch": 0.39122468659594983, + "grad_norm": 2.084522247314453, + "learning_rate": 3.3385020803104675e-05, + "loss": 5.8883, + "step": 4057 + }, + { + "epoch": 0.39132111861137897, + "grad_norm": 3.1234500408172607, + "learning_rate": 3.337788537135516e-05, + "loss": 5.5359, + "step": 4058 + }, + { + "epoch": 0.3914175506268081, + "grad_norm": 3.178632974624634, + "learning_rate": 3.337074917069362e-05, + "loss": 5.3948, + "step": 4059 + }, + { + "epoch": 0.39151398264223725, + "grad_norm": 2.911050319671631, + "learning_rate": 3.336361220177503e-05, + "loss": 5.3712, + "step": 4060 + }, + { + "epoch": 0.39161041465766633, + "grad_norm": 3.117084503173828, + "learning_rate": 3.335647446525441e-05, + "loss": 5.5076, + "step": 4061 + }, + { + "epoch": 0.39170684667309547, + "grad_norm": 5.851019859313965, + "learning_rate": 3.334933596178685e-05, + "loss": 5.0348, + "step": 4062 + }, + { + "epoch": 0.3918032786885246, + "grad_norm": 4.03144645690918, + "learning_rate": 3.334219669202751e-05, + "loss": 5.5204, + "step": 4063 + }, + { + "epoch": 0.3918997107039537, + "grad_norm": 3.6117963790893555, + "learning_rate": 3.333505665663162e-05, + "loss": 5.5845, + "step": 4064 + }, + { + "epoch": 0.3919961427193828, + "grad_norm": 3.636385679244995, + "learning_rate": 3.332791585625449e-05, + "loss": 5.2087, + "step": 4065 + }, + { + "epoch": 0.39209257473481196, + "grad_norm": 3.9605743885040283, + "learning_rate": 3.3320774291551495e-05, + "loss": 5.3782, + "step": 4066 + }, + { + "epoch": 0.3921890067502411, + "grad_norm": 3.482436180114746, + "learning_rate": 3.3313631963178075e-05, + "loss": 5.2938, + "step": 4067 + }, + { + "epoch": 0.3922854387656702, + "grad_norm": 2.5230772495269775, + "learning_rate": 3.3306488871789746e-05, + "loss": 5.3031, + "step": 4068 + }, + { + "epoch": 0.3923818707810993, + "grad_norm": 2.754932403564453, + "learning_rate": 3.32993450180421e-05, + "loss": 5.4321, + "step": 4069 + }, + { + "epoch": 0.39247830279652846, + "grad_norm": 2.8610198497772217, + "learning_rate": 3.3292200402590775e-05, + "loss": 5.5587, + "step": 4070 + }, + { + "epoch": 0.3925747348119576, + "grad_norm": 3.894740343093872, + "learning_rate": 3.32850550260915e-05, + "loss": 5.6253, + "step": 4071 + }, + { + "epoch": 0.3926711668273867, + "grad_norm": 3.1908986568450928, + "learning_rate": 3.327790888920008e-05, + "loss": 5.5011, + "step": 4072 + }, + { + "epoch": 0.3927675988428158, + "grad_norm": 3.162714719772339, + "learning_rate": 3.327076199257237e-05, + "loss": 5.4199, + "step": 4073 + }, + { + "epoch": 0.39286403085824495, + "grad_norm": 4.5152788162231445, + "learning_rate": 3.32636143368643e-05, + "loss": 5.2653, + "step": 4074 + }, + { + "epoch": 0.39296046287367403, + "grad_norm": 5.335099697113037, + "learning_rate": 3.3256465922731875e-05, + "loss": 5.4384, + "step": 4075 + }, + { + "epoch": 0.39305689488910317, + "grad_norm": 4.1537346839904785, + "learning_rate": 3.3249316750831184e-05, + "loss": 5.4248, + "step": 4076 + }, + { + "epoch": 0.3931533269045323, + "grad_norm": 3.844280481338501, + "learning_rate": 3.324216682181834e-05, + "loss": 5.5706, + "step": 4077 + }, + { + "epoch": 0.39324975891996145, + "grad_norm": 4.895838260650635, + "learning_rate": 3.323501613634957e-05, + "loss": 5.2387, + "step": 4078 + }, + { + "epoch": 0.3933461909353905, + "grad_norm": 5.267061710357666, + "learning_rate": 3.3227864695081155e-05, + "loss": 5.1585, + "step": 4079 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 4.314175128936768, + "learning_rate": 3.3220712498669456e-05, + "loss": 5.3856, + "step": 4080 + }, + { + "epoch": 0.3935390549662488, + "grad_norm": 4.790525913238525, + "learning_rate": 3.321355954777087e-05, + "loss": 5.3505, + "step": 4081 + }, + { + "epoch": 0.39363548698167794, + "grad_norm": 6.237986087799072, + "learning_rate": 3.3206405843041906e-05, + "loss": 5.7135, + "step": 4082 + }, + { + "epoch": 0.393731918997107, + "grad_norm": 4.227123737335205, + "learning_rate": 3.319925138513911e-05, + "loss": 5.6113, + "step": 4083 + }, + { + "epoch": 0.39382835101253616, + "grad_norm": 6.155198574066162, + "learning_rate": 3.319209617471911e-05, + "loss": 5.2867, + "step": 4084 + }, + { + "epoch": 0.3939247830279653, + "grad_norm": 7.252954006195068, + "learning_rate": 3.318494021243861e-05, + "loss": 5.3951, + "step": 4085 + }, + { + "epoch": 0.3940212150433944, + "grad_norm": 7.80514669418335, + "learning_rate": 3.317778349895437e-05, + "loss": 5.5049, + "step": 4086 + }, + { + "epoch": 0.3941176470588235, + "grad_norm": 7.787381172180176, + "learning_rate": 3.317062603492323e-05, + "loss": 5.3865, + "step": 4087 + }, + { + "epoch": 0.39421407907425265, + "grad_norm": 4.928526401519775, + "learning_rate": 3.316346782100208e-05, + "loss": 5.7302, + "step": 4088 + }, + { + "epoch": 0.3943105110896818, + "grad_norm": 5.860257625579834, + "learning_rate": 3.3156308857847906e-05, + "loss": 5.5523, + "step": 4089 + }, + { + "epoch": 0.3944069431051109, + "grad_norm": 5.736530780792236, + "learning_rate": 3.314914914611775e-05, + "loss": 5.6663, + "step": 4090 + }, + { + "epoch": 0.39450337512054, + "grad_norm": 4.799217700958252, + "learning_rate": 3.3141988686468714e-05, + "loss": 5.5296, + "step": 4091 + }, + { + "epoch": 0.39459980713596915, + "grad_norm": 4.531279563903809, + "learning_rate": 3.313482747955797e-05, + "loss": 5.3648, + "step": 4092 + }, + { + "epoch": 0.3946962391513983, + "grad_norm": 3.234956979751587, + "learning_rate": 3.3127665526042774e-05, + "loss": 5.3804, + "step": 4093 + }, + { + "epoch": 0.39479267116682737, + "grad_norm": 4.865374565124512, + "learning_rate": 3.3120502826580444e-05, + "loss": 5.1622, + "step": 4094 + }, + { + "epoch": 0.3948891031822565, + "grad_norm": 4.6744184494018555, + "learning_rate": 3.311333938182836e-05, + "loss": 5.5618, + "step": 4095 + }, + { + "epoch": 0.39498553519768564, + "grad_norm": 3.1087565422058105, + "learning_rate": 3.3106175192443964e-05, + "loss": 5.4481, + "step": 4096 + }, + { + "epoch": 0.3950819672131147, + "grad_norm": 2.5759098529815674, + "learning_rate": 3.309901025908479e-05, + "loss": 5.1588, + "step": 4097 + }, + { + "epoch": 0.39517839922854386, + "grad_norm": 5.118381500244141, + "learning_rate": 3.3091844582408424e-05, + "loss": 5.5217, + "step": 4098 + }, + { + "epoch": 0.395274831243973, + "grad_norm": 6.2748613357543945, + "learning_rate": 3.308467816307252e-05, + "loss": 5.6042, + "step": 4099 + }, + { + "epoch": 0.39537126325940214, + "grad_norm": 6.142618656158447, + "learning_rate": 3.307751100173481e-05, + "loss": 5.412, + "step": 4100 + }, + { + "epoch": 0.3954676952748312, + "grad_norm": 4.409951210021973, + "learning_rate": 3.307034309905308e-05, + "loss": 5.087, + "step": 4101 + }, + { + "epoch": 0.39556412729026036, + "grad_norm": 3.426699638366699, + "learning_rate": 3.30631744556852e-05, + "loss": 5.0422, + "step": 4102 + }, + { + "epoch": 0.3956605593056895, + "grad_norm": 4.654464244842529, + "learning_rate": 3.305600507228908e-05, + "loss": 5.3737, + "step": 4103 + }, + { + "epoch": 0.39575699132111863, + "grad_norm": 5.62491512298584, + "learning_rate": 3.3048834949522735e-05, + "loss": 5.7145, + "step": 4104 + }, + { + "epoch": 0.3958534233365477, + "grad_norm": 5.842170238494873, + "learning_rate": 3.3041664088044224e-05, + "loss": 5.2832, + "step": 4105 + }, + { + "epoch": 0.39594985535197685, + "grad_norm": 3.8514389991760254, + "learning_rate": 3.303449248851168e-05, + "loss": 5.7152, + "step": 4106 + }, + { + "epoch": 0.396046287367406, + "grad_norm": 3.775634288787842, + "learning_rate": 3.3027320151583305e-05, + "loss": 5.8473, + "step": 4107 + }, + { + "epoch": 0.39614271938283513, + "grad_norm": 4.390894412994385, + "learning_rate": 3.302014707791737e-05, + "loss": 5.333, + "step": 4108 + }, + { + "epoch": 0.3962391513982642, + "grad_norm": 3.302591562271118, + "learning_rate": 3.3012973268172207e-05, + "loss": 5.5122, + "step": 4109 + }, + { + "epoch": 0.39633558341369335, + "grad_norm": 3.5514333248138428, + "learning_rate": 3.300579872300621e-05, + "loss": 5.2833, + "step": 4110 + }, + { + "epoch": 0.3964320154291225, + "grad_norm": 3.094224452972412, + "learning_rate": 3.299862344307787e-05, + "loss": 5.4647, + "step": 4111 + }, + { + "epoch": 0.39652844744455157, + "grad_norm": 3.3443524837493896, + "learning_rate": 3.299144742904571e-05, + "loss": 5.1561, + "step": 4112 + }, + { + "epoch": 0.3966248794599807, + "grad_norm": 3.994269371032715, + "learning_rate": 3.298427068156835e-05, + "loss": 4.9533, + "step": 4113 + }, + { + "epoch": 0.39672131147540984, + "grad_norm": 4.823012828826904, + "learning_rate": 3.297709320130444e-05, + "loss": 5.1225, + "step": 4114 + }, + { + "epoch": 0.396817743490839, + "grad_norm": 4.433574199676514, + "learning_rate": 3.2969914988912744e-05, + "loss": 5.5286, + "step": 4115 + }, + { + "epoch": 0.39691417550626806, + "grad_norm": 3.0547945499420166, + "learning_rate": 3.296273604505206e-05, + "loss": 5.5444, + "step": 4116 + }, + { + "epoch": 0.3970106075216972, + "grad_norm": 3.153932571411133, + "learning_rate": 3.2955556370381256e-05, + "loss": 5.835, + "step": 4117 + }, + { + "epoch": 0.39710703953712634, + "grad_norm": 4.208669185638428, + "learning_rate": 3.294837596555928e-05, + "loss": 5.8676, + "step": 4118 + }, + { + "epoch": 0.3972034715525555, + "grad_norm": 2.916933059692383, + "learning_rate": 3.294119483124514e-05, + "loss": 5.7724, + "step": 4119 + }, + { + "epoch": 0.39729990356798456, + "grad_norm": 2.938516616821289, + "learning_rate": 3.2934012968097916e-05, + "loss": 5.416, + "step": 4120 + }, + { + "epoch": 0.3973963355834137, + "grad_norm": 2.740096092224121, + "learning_rate": 3.292683037677675e-05, + "loss": 5.5601, + "step": 4121 + }, + { + "epoch": 0.39749276759884283, + "grad_norm": 3.5788567066192627, + "learning_rate": 3.291964705794085e-05, + "loss": 5.5237, + "step": 4122 + }, + { + "epoch": 0.3975891996142719, + "grad_norm": 2.724755048751831, + "learning_rate": 3.2912463012249485e-05, + "loss": 5.4639, + "step": 4123 + }, + { + "epoch": 0.39768563162970105, + "grad_norm": 2.742488384246826, + "learning_rate": 3.290527824036201e-05, + "loss": 5.7949, + "step": 4124 + }, + { + "epoch": 0.3977820636451302, + "grad_norm": 2.353809356689453, + "learning_rate": 3.2898092742937817e-05, + "loss": 5.8278, + "step": 4125 + }, + { + "epoch": 0.39787849566055933, + "grad_norm": 2.7456610202789307, + "learning_rate": 3.28909065206364e-05, + "loss": 5.9049, + "step": 4126 + }, + { + "epoch": 0.3979749276759884, + "grad_norm": 2.851208448410034, + "learning_rate": 3.288371957411729e-05, + "loss": 5.4451, + "step": 4127 + }, + { + "epoch": 0.39807135969141755, + "grad_norm": 2.867142915725708, + "learning_rate": 3.287653190404011e-05, + "loss": 5.5686, + "step": 4128 + }, + { + "epoch": 0.3981677917068467, + "grad_norm": 3.5883431434631348, + "learning_rate": 3.286934351106452e-05, + "loss": 4.8375, + "step": 4129 + }, + { + "epoch": 0.3982642237222758, + "grad_norm": 1.91608464717865, + "learning_rate": 3.286215439585026e-05, + "loss": 4.8433, + "step": 4130 + }, + { + "epoch": 0.3983606557377049, + "grad_norm": 3.2059521675109863, + "learning_rate": 3.285496455905715e-05, + "loss": 5.1802, + "step": 4131 + }, + { + "epoch": 0.39845708775313404, + "grad_norm": 4.7306952476501465, + "learning_rate": 3.284777400134507e-05, + "loss": 5.5099, + "step": 4132 + }, + { + "epoch": 0.3985535197685632, + "grad_norm": 4.0267791748046875, + "learning_rate": 3.2840582723373946e-05, + "loss": 5.584, + "step": 4133 + }, + { + "epoch": 0.39864995178399226, + "grad_norm": 3.1313064098358154, + "learning_rate": 3.283339072580379e-05, + "loss": 5.3771, + "step": 4134 + }, + { + "epoch": 0.3987463837994214, + "grad_norm": 3.371314764022827, + "learning_rate": 3.282619800929468e-05, + "loss": 5.7827, + "step": 4135 + }, + { + "epoch": 0.39884281581485054, + "grad_norm": 4.51704216003418, + "learning_rate": 3.281900457450674e-05, + "loss": 5.6613, + "step": 4136 + }, + { + "epoch": 0.3989392478302797, + "grad_norm": 4.328499794006348, + "learning_rate": 3.2811810422100184e-05, + "loss": 5.7842, + "step": 4137 + }, + { + "epoch": 0.39903567984570876, + "grad_norm": 4.460198402404785, + "learning_rate": 3.280461555273529e-05, + "loss": 5.6081, + "step": 4138 + }, + { + "epoch": 0.3991321118611379, + "grad_norm": 4.854811191558838, + "learning_rate": 3.279741996707239e-05, + "loss": 5.0857, + "step": 4139 + }, + { + "epoch": 0.39922854387656703, + "grad_norm": 3.8178799152374268, + "learning_rate": 3.279022366577187e-05, + "loss": 5.6742, + "step": 4140 + }, + { + "epoch": 0.39932497589199617, + "grad_norm": 5.385692119598389, + "learning_rate": 3.278302664949422e-05, + "loss": 5.519, + "step": 4141 + }, + { + "epoch": 0.39942140790742525, + "grad_norm": 4.95258092880249, + "learning_rate": 3.2775828918899954e-05, + "loss": 5.3214, + "step": 4142 + }, + { + "epoch": 0.3995178399228544, + "grad_norm": 2.2371537685394287, + "learning_rate": 3.276863047464969e-05, + "loss": 5.6167, + "step": 4143 + }, + { + "epoch": 0.3996142719382835, + "grad_norm": 3.271345853805542, + "learning_rate": 3.2761431317404075e-05, + "loss": 5.47, + "step": 4144 + }, + { + "epoch": 0.3997107039537126, + "grad_norm": 2.479498863220215, + "learning_rate": 3.275423144782385e-05, + "loss": 5.5591, + "step": 4145 + }, + { + "epoch": 0.39980713596914175, + "grad_norm": 3.8567564487457275, + "learning_rate": 3.274703086656981e-05, + "loss": 5.3203, + "step": 4146 + }, + { + "epoch": 0.3999035679845709, + "grad_norm": 2.41668438911438, + "learning_rate": 3.27398295743028e-05, + "loss": 5.2638, + "step": 4147 + }, + { + "epoch": 0.4, + "grad_norm": 4.396237850189209, + "learning_rate": 3.2732627571683766e-05, + "loss": 5.7049, + "step": 4148 + }, + { + "epoch": 0.4000964320154291, + "grad_norm": 6.406973361968994, + "learning_rate": 3.272542485937369e-05, + "loss": 5.1202, + "step": 4149 + }, + { + "epoch": 0.40019286403085824, + "grad_norm": 4.271142482757568, + "learning_rate": 3.271822143803363e-05, + "loss": 5.1045, + "step": 4150 + }, + { + "epoch": 0.4002892960462874, + "grad_norm": 4.520002365112305, + "learning_rate": 3.27110173083247e-05, + "loss": 5.6322, + "step": 4151 + }, + { + "epoch": 0.4003857280617165, + "grad_norm": 5.590881824493408, + "learning_rate": 3.27038124709081e-05, + "loss": 5.5017, + "step": 4152 + }, + { + "epoch": 0.4004821600771456, + "grad_norm": 4.346375942230225, + "learning_rate": 3.2696606926445076e-05, + "loss": 5.5876, + "step": 4153 + }, + { + "epoch": 0.40057859209257474, + "grad_norm": 2.383521318435669, + "learning_rate": 3.2689400675596934e-05, + "loss": 5.416, + "step": 4154 + }, + { + "epoch": 0.4006750241080039, + "grad_norm": 2.848355293273926, + "learning_rate": 3.268219371902506e-05, + "loss": 5.1805, + "step": 4155 + }, + { + "epoch": 0.40077145612343296, + "grad_norm": 3.204556465148926, + "learning_rate": 3.2674986057390903e-05, + "loss": 5.1781, + "step": 4156 + }, + { + "epoch": 0.4008678881388621, + "grad_norm": 5.277294635772705, + "learning_rate": 3.266777769135598e-05, + "loss": 5.7024, + "step": 4157 + }, + { + "epoch": 0.40096432015429123, + "grad_norm": 3.5135064125061035, + "learning_rate": 3.266056862158185e-05, + "loss": 5.4185, + "step": 4158 + }, + { + "epoch": 0.40106075216972037, + "grad_norm": 2.617922306060791, + "learning_rate": 3.2653358848730164e-05, + "loss": 5.5522, + "step": 4159 + }, + { + "epoch": 0.40115718418514945, + "grad_norm": 4.940831661224365, + "learning_rate": 3.2646148373462624e-05, + "loss": 5.5759, + "step": 4160 + }, + { + "epoch": 0.4012536162005786, + "grad_norm": 4.392367362976074, + "learning_rate": 3.263893719644099e-05, + "loss": 5.5932, + "step": 4161 + }, + { + "epoch": 0.4013500482160077, + "grad_norm": 3.6703693866729736, + "learning_rate": 3.2631725318327114e-05, + "loss": 5.3079, + "step": 4162 + }, + { + "epoch": 0.40144648023143686, + "grad_norm": 4.052944183349609, + "learning_rate": 3.262451273978287e-05, + "loss": 5.5489, + "step": 4163 + }, + { + "epoch": 0.40154291224686595, + "grad_norm": 3.286590576171875, + "learning_rate": 3.261729946147024e-05, + "loss": 5.7469, + "step": 4164 + }, + { + "epoch": 0.4016393442622951, + "grad_norm": 3.795409917831421, + "learning_rate": 3.2610085484051235e-05, + "loss": 5.4772, + "step": 4165 + }, + { + "epoch": 0.4017357762777242, + "grad_norm": 2.447014808654785, + "learning_rate": 3.260287080818795e-05, + "loss": 5.4823, + "step": 4166 + }, + { + "epoch": 0.4018322082931533, + "grad_norm": 2.398527145385742, + "learning_rate": 3.2595655434542546e-05, + "loss": 5.4147, + "step": 4167 + }, + { + "epoch": 0.40192864030858244, + "grad_norm": 2.586013078689575, + "learning_rate": 3.2588439363777227e-05, + "loss": 5.4473, + "step": 4168 + }, + { + "epoch": 0.4020250723240116, + "grad_norm": 2.2515900135040283, + "learning_rate": 3.258122259655429e-05, + "loss": 5.3682, + "step": 4169 + }, + { + "epoch": 0.4021215043394407, + "grad_norm": 2.520484209060669, + "learning_rate": 3.257400513353607e-05, + "loss": 5.4027, + "step": 4170 + }, + { + "epoch": 0.4022179363548698, + "grad_norm": 2.5606017112731934, + "learning_rate": 3.256678697538498e-05, + "loss": 5.5799, + "step": 4171 + }, + { + "epoch": 0.40231436837029894, + "grad_norm": 2.11391544342041, + "learning_rate": 3.255956812276349e-05, + "loss": 5.4622, + "step": 4172 + }, + { + "epoch": 0.4024108003857281, + "grad_norm": 3.5488219261169434, + "learning_rate": 3.2552348576334155e-05, + "loss": 5.5594, + "step": 4173 + }, + { + "epoch": 0.4025072324011572, + "grad_norm": 3.082155704498291, + "learning_rate": 3.2545128336759555e-05, + "loss": 5.0013, + "step": 4174 + }, + { + "epoch": 0.4026036644165863, + "grad_norm": 2.727973699569702, + "learning_rate": 3.253790740470236e-05, + "loss": 4.9564, + "step": 4175 + }, + { + "epoch": 0.40270009643201543, + "grad_norm": 3.0710434913635254, + "learning_rate": 3.253068578082531e-05, + "loss": 5.1276, + "step": 4176 + }, + { + "epoch": 0.40279652844744457, + "grad_norm": 6.045331954956055, + "learning_rate": 3.252346346579118e-05, + "loss": 5.1487, + "step": 4177 + }, + { + "epoch": 0.40289296046287365, + "grad_norm": 2.9831111431121826, + "learning_rate": 3.251624046026284e-05, + "loss": 5.6818, + "step": 4178 + }, + { + "epoch": 0.4029893924783028, + "grad_norm": 2.8024675846099854, + "learning_rate": 3.2509016764903197e-05, + "loss": 5.5932, + "step": 4179 + }, + { + "epoch": 0.4030858244937319, + "grad_norm": 4.1128830909729, + "learning_rate": 3.250179238037524e-05, + "loss": 5.3387, + "step": 4180 + }, + { + "epoch": 0.40318225650916106, + "grad_norm": 3.8220975399017334, + "learning_rate": 3.249456730734201e-05, + "loss": 5.4483, + "step": 4181 + }, + { + "epoch": 0.40327868852459015, + "grad_norm": 3.700718641281128, + "learning_rate": 3.2487341546466615e-05, + "loss": 5.2337, + "step": 4182 + }, + { + "epoch": 0.4033751205400193, + "grad_norm": 3.046687364578247, + "learning_rate": 3.2480115098412234e-05, + "loss": 4.9914, + "step": 4183 + }, + { + "epoch": 0.4034715525554484, + "grad_norm": 3.773714780807495, + "learning_rate": 3.2472887963842094e-05, + "loss": 5.4764, + "step": 4184 + }, + { + "epoch": 0.40356798457087756, + "grad_norm": 2.9352779388427734, + "learning_rate": 3.2465660143419484e-05, + "loss": 5.6967, + "step": 4185 + }, + { + "epoch": 0.40366441658630664, + "grad_norm": 2.7127814292907715, + "learning_rate": 3.2458431637807785e-05, + "loss": 5.8899, + "step": 4186 + }, + { + "epoch": 0.4037608486017358, + "grad_norm": 3.3588171005249023, + "learning_rate": 3.2451202447670415e-05, + "loss": 5.6931, + "step": 4187 + }, + { + "epoch": 0.4038572806171649, + "grad_norm": 5.991399765014648, + "learning_rate": 3.2443972573670844e-05, + "loss": 5.1954, + "step": 4188 + }, + { + "epoch": 0.403953712632594, + "grad_norm": 4.377965927124023, + "learning_rate": 3.243674201647263e-05, + "loss": 5.1025, + "step": 4189 + }, + { + "epoch": 0.40405014464802314, + "grad_norm": 5.155945301055908, + "learning_rate": 3.24295107767394e-05, + "loss": 5.5711, + "step": 4190 + }, + { + "epoch": 0.4041465766634523, + "grad_norm": 4.648390769958496, + "learning_rate": 3.242227885513481e-05, + "loss": 5.6438, + "step": 4191 + }, + { + "epoch": 0.4042430086788814, + "grad_norm": 3.693077325820923, + "learning_rate": 3.2415046252322604e-05, + "loss": 5.4469, + "step": 4192 + }, + { + "epoch": 0.4043394406943105, + "grad_norm": 3.1525750160217285, + "learning_rate": 3.240781296896656e-05, + "loss": 5.6519, + "step": 4193 + }, + { + "epoch": 0.40443587270973963, + "grad_norm": 3.353874921798706, + "learning_rate": 3.240057900573058e-05, + "loss": 5.6819, + "step": 4194 + }, + { + "epoch": 0.40453230472516877, + "grad_norm": 3.6890687942504883, + "learning_rate": 3.239334436327857e-05, + "loss": 5.3295, + "step": 4195 + }, + { + "epoch": 0.4046287367405979, + "grad_norm": 4.059730529785156, + "learning_rate": 3.2386109042274496e-05, + "loss": 5.6337, + "step": 4196 + }, + { + "epoch": 0.404725168756027, + "grad_norm": 6.037885665893555, + "learning_rate": 3.237887304338244e-05, + "loss": 5.0832, + "step": 4197 + }, + { + "epoch": 0.4048216007714561, + "grad_norm": 4.62246036529541, + "learning_rate": 3.237163636726649e-05, + "loss": 5.2329, + "step": 4198 + }, + { + "epoch": 0.40491803278688526, + "grad_norm": 3.51440167427063, + "learning_rate": 3.2364399014590836e-05, + "loss": 5.2629, + "step": 4199 + }, + { + "epoch": 0.40501446480231434, + "grad_norm": 4.263648986816406, + "learning_rate": 3.2357160986019697e-05, + "loss": 5.5454, + "step": 4200 + }, + { + "epoch": 0.4051108968177435, + "grad_norm": 5.277796268463135, + "learning_rate": 3.2349922282217385e-05, + "loss": 5.6681, + "step": 4201 + }, + { + "epoch": 0.4052073288331726, + "grad_norm": 4.958438873291016, + "learning_rate": 3.234268290384824e-05, + "loss": 5.5057, + "step": 4202 + }, + { + "epoch": 0.40530376084860176, + "grad_norm": 3.5174617767333984, + "learning_rate": 3.233544285157671e-05, + "loss": 5.3609, + "step": 4203 + }, + { + "epoch": 0.40540019286403084, + "grad_norm": 4.49488639831543, + "learning_rate": 3.232820212606725e-05, + "loss": 5.2246, + "step": 4204 + }, + { + "epoch": 0.40549662487946, + "grad_norm": 6.164755821228027, + "learning_rate": 3.232096072798443e-05, + "loss": 5.5562, + "step": 4205 + }, + { + "epoch": 0.4055930568948891, + "grad_norm": 5.2284932136535645, + "learning_rate": 3.2313718657992843e-05, + "loss": 5.8226, + "step": 4206 + }, + { + "epoch": 0.40568948891031825, + "grad_norm": 3.224945068359375, + "learning_rate": 3.2306475916757154e-05, + "loss": 5.529, + "step": 4207 + }, + { + "epoch": 0.40578592092574733, + "grad_norm": 2.561353921890259, + "learning_rate": 3.22992325049421e-05, + "loss": 5.5739, + "step": 4208 + }, + { + "epoch": 0.40588235294117647, + "grad_norm": 5.087508201599121, + "learning_rate": 3.229198842321247e-05, + "loss": 5.4377, + "step": 4209 + }, + { + "epoch": 0.4059787849566056, + "grad_norm": 4.269638538360596, + "learning_rate": 3.228474367223312e-05, + "loss": 5.3548, + "step": 4210 + }, + { + "epoch": 0.4060752169720347, + "grad_norm": 2.4782750606536865, + "learning_rate": 3.227749825266896e-05, + "loss": 5.3822, + "step": 4211 + }, + { + "epoch": 0.40617164898746383, + "grad_norm": 2.45158314704895, + "learning_rate": 3.2270252165184974e-05, + "loss": 5.7105, + "step": 4212 + }, + { + "epoch": 0.40626808100289297, + "grad_norm": 2.8439948558807373, + "learning_rate": 3.226300541044618e-05, + "loss": 5.6208, + "step": 4213 + }, + { + "epoch": 0.4063645130183221, + "grad_norm": 2.518712282180786, + "learning_rate": 3.2255757989117696e-05, + "loss": 5.7048, + "step": 4214 + }, + { + "epoch": 0.4064609450337512, + "grad_norm": 3.711195468902588, + "learning_rate": 3.224850990186467e-05, + "loss": 5.5982, + "step": 4215 + }, + { + "epoch": 0.4065573770491803, + "grad_norm": 2.794579029083252, + "learning_rate": 3.224126114935233e-05, + "loss": 5.2473, + "step": 4216 + }, + { + "epoch": 0.40665380906460946, + "grad_norm": 2.64958119392395, + "learning_rate": 3.223401173224595e-05, + "loss": 4.4655, + "step": 4217 + }, + { + "epoch": 0.4067502410800386, + "grad_norm": 2.241612434387207, + "learning_rate": 3.2226761651210884e-05, + "loss": 4.3979, + "step": 4218 + }, + { + "epoch": 0.4068466730954677, + "grad_norm": 2.1111207008361816, + "learning_rate": 3.221951090691252e-05, + "loss": 4.8313, + "step": 4219 + }, + { + "epoch": 0.4069431051108968, + "grad_norm": 3.1623101234436035, + "learning_rate": 3.221225950001635e-05, + "loss": 5.0835, + "step": 4220 + }, + { + "epoch": 0.40703953712632596, + "grad_norm": 2.3183226585388184, + "learning_rate": 3.2205007431187854e-05, + "loss": 5.1483, + "step": 4221 + }, + { + "epoch": 0.40713596914175504, + "grad_norm": 5.448138236999512, + "learning_rate": 3.219775470109266e-05, + "loss": 4.5601, + "step": 4222 + }, + { + "epoch": 0.4072324011571842, + "grad_norm": 3.518589735031128, + "learning_rate": 3.21905013103964e-05, + "loss": 4.7092, + "step": 4223 + }, + { + "epoch": 0.4073288331726133, + "grad_norm": 3.4481942653656006, + "learning_rate": 3.2183247259764773e-05, + "loss": 4.7029, + "step": 4224 + }, + { + "epoch": 0.40742526518804245, + "grad_norm": 4.025399684906006, + "learning_rate": 3.217599254986356e-05, + "loss": 5.6847, + "step": 4225 + }, + { + "epoch": 0.40752169720347153, + "grad_norm": 3.219653844833374, + "learning_rate": 3.216873718135857e-05, + "loss": 5.6792, + "step": 4226 + }, + { + "epoch": 0.40761812921890067, + "grad_norm": 3.1854097843170166, + "learning_rate": 3.2161481154915714e-05, + "loss": 5.7189, + "step": 4227 + }, + { + "epoch": 0.4077145612343298, + "grad_norm": 3.025808811187744, + "learning_rate": 3.215422447120093e-05, + "loss": 5.6542, + "step": 4228 + }, + { + "epoch": 0.40781099324975895, + "grad_norm": 2.5448803901672363, + "learning_rate": 3.214696713088023e-05, + "loss": 5.5856, + "step": 4229 + }, + { + "epoch": 0.40790742526518803, + "grad_norm": 3.2923340797424316, + "learning_rate": 3.2139709134619676e-05, + "loss": 5.5813, + "step": 4230 + }, + { + "epoch": 0.40800385728061717, + "grad_norm": 4.119852066040039, + "learning_rate": 3.213245048308542e-05, + "loss": 5.6742, + "step": 4231 + }, + { + "epoch": 0.4081002892960463, + "grad_norm": 3.6665592193603516, + "learning_rate": 3.212519117694363e-05, + "loss": 5.785, + "step": 4232 + }, + { + "epoch": 0.4081967213114754, + "grad_norm": 2.4189348220825195, + "learning_rate": 3.211793121686056e-05, + "loss": 5.791, + "step": 4233 + }, + { + "epoch": 0.4082931533269045, + "grad_norm": 2.8215436935424805, + "learning_rate": 3.211067060350253e-05, + "loss": 5.5442, + "step": 4234 + }, + { + "epoch": 0.40838958534233366, + "grad_norm": 2.3910162448883057, + "learning_rate": 3.21034093375359e-05, + "loss": 5.7158, + "step": 4235 + }, + { + "epoch": 0.4084860173577628, + "grad_norm": 7.084027290344238, + "learning_rate": 3.20961474196271e-05, + "loss": 4.6995, + "step": 4236 + }, + { + "epoch": 0.4085824493731919, + "grad_norm": 4.133822441101074, + "learning_rate": 3.208888485044263e-05, + "loss": 5.2712, + "step": 4237 + }, + { + "epoch": 0.408678881388621, + "grad_norm": 5.115276336669922, + "learning_rate": 3.208162163064903e-05, + "loss": 5.6962, + "step": 4238 + }, + { + "epoch": 0.40877531340405016, + "grad_norm": 4.135470867156982, + "learning_rate": 3.2074357760912913e-05, + "loss": 5.7687, + "step": 4239 + }, + { + "epoch": 0.4088717454194793, + "grad_norm": 6.1089982986450195, + "learning_rate": 3.2067093241900945e-05, + "loss": 5.9805, + "step": 4240 + }, + { + "epoch": 0.4089681774349084, + "grad_norm": 5.974060535430908, + "learning_rate": 3.205982807427986e-05, + "loss": 6.0356, + "step": 4241 + }, + { + "epoch": 0.4090646094503375, + "grad_norm": 4.218937873840332, + "learning_rate": 3.205256225871646e-05, + "loss": 5.7986, + "step": 4242 + }, + { + "epoch": 0.40916104146576665, + "grad_norm": 3.4704134464263916, + "learning_rate": 3.2045295795877554e-05, + "loss": 5.7637, + "step": 4243 + }, + { + "epoch": 0.40925747348119573, + "grad_norm": 4.478876113891602, + "learning_rate": 3.2038028686430077e-05, + "loss": 5.6304, + "step": 4244 + }, + { + "epoch": 0.40935390549662487, + "grad_norm": 4.409725189208984, + "learning_rate": 3.2030760931041e-05, + "loss": 5.2471, + "step": 4245 + }, + { + "epoch": 0.409450337512054, + "grad_norm": 3.7972779273986816, + "learning_rate": 3.2023492530377324e-05, + "loss": 4.8314, + "step": 4246 + }, + { + "epoch": 0.40954676952748315, + "grad_norm": 4.496339321136475, + "learning_rate": 3.201622348510615e-05, + "loss": 4.8851, + "step": 4247 + }, + { + "epoch": 0.4096432015429122, + "grad_norm": 5.500895977020264, + "learning_rate": 3.200895379589462e-05, + "loss": 4.8494, + "step": 4248 + }, + { + "epoch": 0.40973963355834137, + "grad_norm": 3.7674248218536377, + "learning_rate": 3.2001683463409946e-05, + "loss": 5.0124, + "step": 4249 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 3.4670794010162354, + "learning_rate": 3.199441248831938e-05, + "loss": 4.9298, + "step": 4250 + }, + { + "epoch": 0.40993249758919964, + "grad_norm": 4.617752552032471, + "learning_rate": 3.1987140871290236e-05, + "loss": 5.0329, + "step": 4251 + }, + { + "epoch": 0.4100289296046287, + "grad_norm": 3.9151837825775146, + "learning_rate": 3.197986861298991e-05, + "loss": 5.0172, + "step": 4252 + }, + { + "epoch": 0.41012536162005786, + "grad_norm": 3.6127705574035645, + "learning_rate": 3.197259571408583e-05, + "loss": 4.8112, + "step": 4253 + }, + { + "epoch": 0.410221793635487, + "grad_norm": 2.9122531414031982, + "learning_rate": 3.19653221752455e-05, + "loss": 4.7268, + "step": 4254 + }, + { + "epoch": 0.4103182256509161, + "grad_norm": 4.364292144775391, + "learning_rate": 3.195804799713647e-05, + "loss": 4.5525, + "step": 4255 + }, + { + "epoch": 0.4104146576663452, + "grad_norm": 2.9704244136810303, + "learning_rate": 3.1950773180426365e-05, + "loss": 4.6606, + "step": 4256 + }, + { + "epoch": 0.41051108968177435, + "grad_norm": 4.162683486938477, + "learning_rate": 3.194349772578285e-05, + "loss": 4.8958, + "step": 4257 + }, + { + "epoch": 0.4106075216972035, + "grad_norm": 6.057864189147949, + "learning_rate": 3.1936221633873655e-05, + "loss": 4.811, + "step": 4258 + }, + { + "epoch": 0.4107039537126326, + "grad_norm": 5.172679424285889, + "learning_rate": 3.1928944905366574e-05, + "loss": 5.165, + "step": 4259 + }, + { + "epoch": 0.4108003857280617, + "grad_norm": 4.574094772338867, + "learning_rate": 3.192166754092947e-05, + "loss": 5.8614, + "step": 4260 + }, + { + "epoch": 0.41089681774349085, + "grad_norm": 3.64286470413208, + "learning_rate": 3.1914389541230225e-05, + "loss": 5.7781, + "step": 4261 + }, + { + "epoch": 0.41099324975892, + "grad_norm": 4.581219673156738, + "learning_rate": 3.1907110906936824e-05, + "loss": 5.8601, + "step": 4262 + }, + { + "epoch": 0.41108968177434907, + "grad_norm": 3.956117630004883, + "learning_rate": 3.189983163871728e-05, + "loss": 6.0454, + "step": 4263 + }, + { + "epoch": 0.4111861137897782, + "grad_norm": 5.251752853393555, + "learning_rate": 3.189255173723969e-05, + "loss": 5.4128, + "step": 4264 + }, + { + "epoch": 0.41128254580520734, + "grad_norm": 3.591907262802124, + "learning_rate": 3.188527120317218e-05, + "loss": 5.4564, + "step": 4265 + }, + { + "epoch": 0.4113789778206364, + "grad_norm": 3.500206470489502, + "learning_rate": 3.187799003718295e-05, + "loss": 5.7209, + "step": 4266 + }, + { + "epoch": 0.41147540983606556, + "grad_norm": 3.512543201446533, + "learning_rate": 3.187070823994027e-05, + "loss": 5.6362, + "step": 4267 + }, + { + "epoch": 0.4115718418514947, + "grad_norm": 3.0614683628082275, + "learning_rate": 3.1863425812112434e-05, + "loss": 5.6462, + "step": 4268 + }, + { + "epoch": 0.41166827386692384, + "grad_norm": 2.8728010654449463, + "learning_rate": 3.185614275436783e-05, + "loss": 5.5095, + "step": 4269 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 4.172845363616943, + "learning_rate": 3.184885906737488e-05, + "loss": 5.2566, + "step": 4270 + }, + { + "epoch": 0.41186113789778206, + "grad_norm": 3.186784505844116, + "learning_rate": 3.1841574751802076e-05, + "loss": 5.675, + "step": 4271 + }, + { + "epoch": 0.4119575699132112, + "grad_norm": 3.1082704067230225, + "learning_rate": 3.183428980831796e-05, + "loss": 5.4139, + "step": 4272 + }, + { + "epoch": 0.41205400192864033, + "grad_norm": 2.4088308811187744, + "learning_rate": 3.182700423759114e-05, + "loss": 5.3544, + "step": 4273 + }, + { + "epoch": 0.4121504339440694, + "grad_norm": 4.547432899475098, + "learning_rate": 3.1819718040290266e-05, + "loss": 4.7723, + "step": 4274 + }, + { + "epoch": 0.41224686595949855, + "grad_norm": 4.366162300109863, + "learning_rate": 3.1812431217084074e-05, + "loss": 5.4476, + "step": 4275 + }, + { + "epoch": 0.4123432979749277, + "grad_norm": 3.2806732654571533, + "learning_rate": 3.1805143768641324e-05, + "loss": 4.937, + "step": 4276 + }, + { + "epoch": 0.4124397299903568, + "grad_norm": 4.640842914581299, + "learning_rate": 3.1797855695630856e-05, + "loss": 4.8644, + "step": 4277 + }, + { + "epoch": 0.4125361620057859, + "grad_norm": 2.6664583683013916, + "learning_rate": 3.179056699872156e-05, + "loss": 5.1475, + "step": 4278 + }, + { + "epoch": 0.41263259402121505, + "grad_norm": 4.551568984985352, + "learning_rate": 3.178327767858238e-05, + "loss": 5.5981, + "step": 4279 + }, + { + "epoch": 0.4127290260366442, + "grad_norm": 5.536022186279297, + "learning_rate": 3.177598773588232e-05, + "loss": 5.3403, + "step": 4280 + }, + { + "epoch": 0.41282545805207327, + "grad_norm": 4.471388816833496, + "learning_rate": 3.176869717129045e-05, + "loss": 4.9572, + "step": 4281 + }, + { + "epoch": 0.4129218900675024, + "grad_norm": 2.8834829330444336, + "learning_rate": 3.176140598547589e-05, + "loss": 5.3876, + "step": 4282 + }, + { + "epoch": 0.41301832208293154, + "grad_norm": 3.317505121231079, + "learning_rate": 3.1754114179107807e-05, + "loss": 5.7151, + "step": 4283 + }, + { + "epoch": 0.4131147540983607, + "grad_norm": 2.8127782344818115, + "learning_rate": 3.174682175285543e-05, + "loss": 5.5911, + "step": 4284 + }, + { + "epoch": 0.41321118611378976, + "grad_norm": 4.075017929077148, + "learning_rate": 3.1739528707388066e-05, + "loss": 5.6947, + "step": 4285 + }, + { + "epoch": 0.4133076181292189, + "grad_norm": 3.921351432800293, + "learning_rate": 3.173223504337505e-05, + "loss": 5.721, + "step": 4286 + }, + { + "epoch": 0.41340405014464804, + "grad_norm": 4.319389820098877, + "learning_rate": 3.1724940761485785e-05, + "loss": 5.735, + "step": 4287 + }, + { + "epoch": 0.4135004821600771, + "grad_norm": 2.8354413509368896, + "learning_rate": 3.171764586238974e-05, + "loss": 5.6926, + "step": 4288 + }, + { + "epoch": 0.41359691417550626, + "grad_norm": 4.2629547119140625, + "learning_rate": 3.171035034675642e-05, + "loss": 5.8638, + "step": 4289 + }, + { + "epoch": 0.4136933461909354, + "grad_norm": 4.065459728240967, + "learning_rate": 3.170305421525541e-05, + "loss": 5.7482, + "step": 4290 + }, + { + "epoch": 0.41378977820636453, + "grad_norm": 3.5548853874206543, + "learning_rate": 3.169575746855633e-05, + "loss": 5.8312, + "step": 4291 + }, + { + "epoch": 0.4138862102217936, + "grad_norm": 3.9152605533599854, + "learning_rate": 3.168846010732886e-05, + "loss": 5.6852, + "step": 4292 + }, + { + "epoch": 0.41398264223722275, + "grad_norm": 3.643127679824829, + "learning_rate": 3.1681162132242775e-05, + "loss": 5.6202, + "step": 4293 + }, + { + "epoch": 0.4140790742526519, + "grad_norm": 2.898481845855713, + "learning_rate": 3.167386354396784e-05, + "loss": 5.5646, + "step": 4294 + }, + { + "epoch": 0.41417550626808103, + "grad_norm": 4.449687957763672, + "learning_rate": 3.166656434317393e-05, + "loss": 5.5952, + "step": 4295 + }, + { + "epoch": 0.4142719382835101, + "grad_norm": 4.292853832244873, + "learning_rate": 3.165926453053094e-05, + "loss": 5.5798, + "step": 4296 + }, + { + "epoch": 0.41436837029893925, + "grad_norm": 4.409642219543457, + "learning_rate": 3.165196410670886e-05, + "loss": 5.7223, + "step": 4297 + }, + { + "epoch": 0.4144648023143684, + "grad_norm": 3.4861390590667725, + "learning_rate": 3.164466307237769e-05, + "loss": 5.5996, + "step": 4298 + }, + { + "epoch": 0.41456123432979747, + "grad_norm": 3.180835008621216, + "learning_rate": 3.163736142820752e-05, + "loss": 5.5947, + "step": 4299 + }, + { + "epoch": 0.4146576663452266, + "grad_norm": 3.5834155082702637, + "learning_rate": 3.16300591748685e-05, + "loss": 5.65, + "step": 4300 + }, + { + "epoch": 0.41475409836065574, + "grad_norm": 3.294375419616699, + "learning_rate": 3.16227563130308e-05, + "loss": 5.6457, + "step": 4301 + }, + { + "epoch": 0.4148505303760849, + "grad_norm": 5.211737632751465, + "learning_rate": 3.1615452843364676e-05, + "loss": 4.8109, + "step": 4302 + }, + { + "epoch": 0.41494696239151396, + "grad_norm": 3.278352975845337, + "learning_rate": 3.1608148766540434e-05, + "loss": 4.8347, + "step": 4303 + }, + { + "epoch": 0.4150433944069431, + "grad_norm": 5.05134916305542, + "learning_rate": 3.160084408322844e-05, + "loss": 4.7102, + "step": 4304 + }, + { + "epoch": 0.41513982642237224, + "grad_norm": 4.744969844818115, + "learning_rate": 3.1593538794099086e-05, + "loss": 5.1978, + "step": 4305 + }, + { + "epoch": 0.4152362584378014, + "grad_norm": 5.185204029083252, + "learning_rate": 3.1586232899822866e-05, + "loss": 5.7678, + "step": 4306 + }, + { + "epoch": 0.41533269045323046, + "grad_norm": 3.8359525203704834, + "learning_rate": 3.157892640107029e-05, + "loss": 5.6192, + "step": 4307 + }, + { + "epoch": 0.4154291224686596, + "grad_norm": 3.4026172161102295, + "learning_rate": 3.1571619298511954e-05, + "loss": 5.6203, + "step": 4308 + }, + { + "epoch": 0.41552555448408873, + "grad_norm": 3.324382781982422, + "learning_rate": 3.156431159281849e-05, + "loss": 5.5618, + "step": 4309 + }, + { + "epoch": 0.4156219864995178, + "grad_norm": 2.8930153846740723, + "learning_rate": 3.155700328466058e-05, + "loss": 5.6974, + "step": 4310 + }, + { + "epoch": 0.41571841851494695, + "grad_norm": 3.8558878898620605, + "learning_rate": 3.154969437470898e-05, + "loss": 5.5441, + "step": 4311 + }, + { + "epoch": 0.4158148505303761, + "grad_norm": 4.422835826873779, + "learning_rate": 3.1542384863634496e-05, + "loss": 5.8655, + "step": 4312 + }, + { + "epoch": 0.4159112825458052, + "grad_norm": 3.4491701126098633, + "learning_rate": 3.1535074752107975e-05, + "loss": 5.8124, + "step": 4313 + }, + { + "epoch": 0.4160077145612343, + "grad_norm": 4.584539890289307, + "learning_rate": 3.152776404080033e-05, + "loss": 5.5533, + "step": 4314 + }, + { + "epoch": 0.41610414657666345, + "grad_norm": 4.644968509674072, + "learning_rate": 3.152045273038256e-05, + "loss": 5.6708, + "step": 4315 + }, + { + "epoch": 0.4162005785920926, + "grad_norm": 3.2213714122772217, + "learning_rate": 3.1513140821525645e-05, + "loss": 5.7517, + "step": 4316 + }, + { + "epoch": 0.4162970106075217, + "grad_norm": 2.5693702697753906, + "learning_rate": 3.150582831490068e-05, + "loss": 5.7393, + "step": 4317 + }, + { + "epoch": 0.4163934426229508, + "grad_norm": 3.0089588165283203, + "learning_rate": 3.149851521117881e-05, + "loss": 5.6613, + "step": 4318 + }, + { + "epoch": 0.41648987463837994, + "grad_norm": 3.9708852767944336, + "learning_rate": 3.149120151103121e-05, + "loss": 5.6496, + "step": 4319 + }, + { + "epoch": 0.4165863066538091, + "grad_norm": 2.901902198791504, + "learning_rate": 3.148388721512912e-05, + "loss": 5.3511, + "step": 4320 + }, + { + "epoch": 0.41668273866923816, + "grad_norm": 4.901095867156982, + "learning_rate": 3.147657232414384e-05, + "loss": 5.079, + "step": 4321 + }, + { + "epoch": 0.4167791706846673, + "grad_norm": 4.664535999298096, + "learning_rate": 3.146925683874673e-05, + "loss": 4.9891, + "step": 4322 + }, + { + "epoch": 0.41687560270009644, + "grad_norm": 3.4113681316375732, + "learning_rate": 3.1461940759609187e-05, + "loss": 4.9761, + "step": 4323 + }, + { + "epoch": 0.4169720347155256, + "grad_norm": 3.108948230743408, + "learning_rate": 3.1454624087402675e-05, + "loss": 5.518, + "step": 4324 + }, + { + "epoch": 0.41706846673095466, + "grad_norm": 3.53255033493042, + "learning_rate": 3.144730682279871e-05, + "loss": 5.5523, + "step": 4325 + }, + { + "epoch": 0.4171648987463838, + "grad_norm": 5.551013469696045, + "learning_rate": 3.143998896646886e-05, + "loss": 5.2644, + "step": 4326 + }, + { + "epoch": 0.41726133076181293, + "grad_norm": 4.595366954803467, + "learning_rate": 3.143267051908474e-05, + "loss": 5.5699, + "step": 4327 + }, + { + "epoch": 0.41735776277724207, + "grad_norm": 4.313508033752441, + "learning_rate": 3.142535148131804e-05, + "loss": 5.8241, + "step": 4328 + }, + { + "epoch": 0.41745419479267115, + "grad_norm": 3.127732515335083, + "learning_rate": 3.14180318538405e-05, + "loss": 5.5987, + "step": 4329 + }, + { + "epoch": 0.4175506268081003, + "grad_norm": 2.2853617668151855, + "learning_rate": 3.141071163732389e-05, + "loss": 5.2279, + "step": 4330 + }, + { + "epoch": 0.4176470588235294, + "grad_norm": 5.02640438079834, + "learning_rate": 3.140339083244006e-05, + "loss": 5.2445, + "step": 4331 + }, + { + "epoch": 0.4177434908389585, + "grad_norm": 7.752425670623779, + "learning_rate": 3.1396069439860894e-05, + "loss": 5.6842, + "step": 4332 + }, + { + "epoch": 0.41783992285438765, + "grad_norm": 10.403379440307617, + "learning_rate": 3.138874746025835e-05, + "loss": 5.5453, + "step": 4333 + }, + { + "epoch": 0.4179363548698168, + "grad_norm": 7.125958442687988, + "learning_rate": 3.138142489430443e-05, + "loss": 5.3812, + "step": 4334 + }, + { + "epoch": 0.4180327868852459, + "grad_norm": 3.545464038848877, + "learning_rate": 3.137410174267118e-05, + "loss": 5.3317, + "step": 4335 + }, + { + "epoch": 0.418129218900675, + "grad_norm": 6.177037239074707, + "learning_rate": 3.136677800603072e-05, + "loss": 5.8811, + "step": 4336 + }, + { + "epoch": 0.41822565091610414, + "grad_norm": 7.0778961181640625, + "learning_rate": 3.135945368505522e-05, + "loss": 6.1124, + "step": 4337 + }, + { + "epoch": 0.4183220829315333, + "grad_norm": 8.396934509277344, + "learning_rate": 3.135212878041688e-05, + "loss": 5.6781, + "step": 4338 + }, + { + "epoch": 0.4184185149469624, + "grad_norm": 6.212292194366455, + "learning_rate": 3.134480329278797e-05, + "loss": 5.5652, + "step": 4339 + }, + { + "epoch": 0.4185149469623915, + "grad_norm": 4.009922027587891, + "learning_rate": 3.1337477222840836e-05, + "loss": 5.6272, + "step": 4340 + }, + { + "epoch": 0.41861137897782064, + "grad_norm": 4.893865585327148, + "learning_rate": 3.133015057124784e-05, + "loss": 5.6765, + "step": 4341 + }, + { + "epoch": 0.4187078109932498, + "grad_norm": 5.461091041564941, + "learning_rate": 3.1322823338681415e-05, + "loss": 5.742, + "step": 4342 + }, + { + "epoch": 0.41880424300867886, + "grad_norm": 4.6347551345825195, + "learning_rate": 3.131549552581404e-05, + "loss": 4.9866, + "step": 4343 + }, + { + "epoch": 0.418900675024108, + "grad_norm": 3.1955292224884033, + "learning_rate": 3.1308167133318265e-05, + "loss": 4.8673, + "step": 4344 + }, + { + "epoch": 0.41899710703953713, + "grad_norm": 2.5966200828552246, + "learning_rate": 3.1300838161866676e-05, + "loss": 5.1799, + "step": 4345 + }, + { + "epoch": 0.41909353905496627, + "grad_norm": 2.59587025642395, + "learning_rate": 3.129350861213191e-05, + "loss": 5.546, + "step": 4346 + }, + { + "epoch": 0.41918997107039535, + "grad_norm": 2.3206992149353027, + "learning_rate": 3.1286178484786675e-05, + "loss": 5.5534, + "step": 4347 + }, + { + "epoch": 0.4192864030858245, + "grad_norm": 3.0534849166870117, + "learning_rate": 3.127884778050372e-05, + "loss": 5.6202, + "step": 4348 + }, + { + "epoch": 0.4193828351012536, + "grad_norm": 2.8081185817718506, + "learning_rate": 3.1271516499955835e-05, + "loss": 5.5282, + "step": 4349 + }, + { + "epoch": 0.41947926711668276, + "grad_norm": 2.811516761779785, + "learning_rate": 3.1264184643815887e-05, + "loss": 5.7663, + "step": 4350 + }, + { + "epoch": 0.41957569913211185, + "grad_norm": 3.622166156768799, + "learning_rate": 3.1256852212756785e-05, + "loss": 5.7752, + "step": 4351 + }, + { + "epoch": 0.419672131147541, + "grad_norm": 2.341688394546509, + "learning_rate": 3.1249519207451486e-05, + "loss": 5.2032, + "step": 4352 + }, + { + "epoch": 0.4197685631629701, + "grad_norm": 2.6926515102386475, + "learning_rate": 3.1242185628573e-05, + "loss": 5.3201, + "step": 4353 + }, + { + "epoch": 0.4198649951783992, + "grad_norm": 2.3428027629852295, + "learning_rate": 3.1234851476794406e-05, + "loss": 5.7713, + "step": 4354 + }, + { + "epoch": 0.41996142719382834, + "grad_norm": 2.201723575592041, + "learning_rate": 3.122751675278882e-05, + "loss": 5.7964, + "step": 4355 + }, + { + "epoch": 0.4200578592092575, + "grad_norm": 3.5069358348846436, + "learning_rate": 3.1220181457229406e-05, + "loss": 5.8241, + "step": 4356 + }, + { + "epoch": 0.4201542912246866, + "grad_norm": 3.331982135772705, + "learning_rate": 3.12128455907894e-05, + "loss": 5.9384, + "step": 4357 + }, + { + "epoch": 0.4202507232401157, + "grad_norm": 3.1413936614990234, + "learning_rate": 3.120550915414206e-05, + "loss": 5.8666, + "step": 4358 + }, + { + "epoch": 0.42034715525554484, + "grad_norm": 2.3794281482696533, + "learning_rate": 3.1198172147960744e-05, + "loss": 5.5111, + "step": 4359 + }, + { + "epoch": 0.420443587270974, + "grad_norm": 3.9358391761779785, + "learning_rate": 3.119083457291881e-05, + "loss": 4.7105, + "step": 4360 + }, + { + "epoch": 0.4205400192864031, + "grad_norm": 4.454640865325928, + "learning_rate": 3.11834964296897e-05, + "loss": 4.9305, + "step": 4361 + }, + { + "epoch": 0.4206364513018322, + "grad_norm": 3.4538638591766357, + "learning_rate": 3.1176157718946885e-05, + "loss": 4.7849, + "step": 4362 + }, + { + "epoch": 0.42073288331726133, + "grad_norm": 5.616940975189209, + "learning_rate": 3.1168818441363934e-05, + "loss": 5.3538, + "step": 4363 + }, + { + "epoch": 0.42082931533269047, + "grad_norm": 5.751980781555176, + "learning_rate": 3.116147859761441e-05, + "loss": 5.2113, + "step": 4364 + }, + { + "epoch": 0.42092574734811955, + "grad_norm": 3.8540194034576416, + "learning_rate": 3.115413818837196e-05, + "loss": 5.6593, + "step": 4365 + }, + { + "epoch": 0.4210221793635487, + "grad_norm": 4.291193008422852, + "learning_rate": 3.114679721431029e-05, + "loss": 5.5035, + "step": 4366 + }, + { + "epoch": 0.4211186113789778, + "grad_norm": 4.824197769165039, + "learning_rate": 3.1139455676103134e-05, + "loss": 5.1155, + "step": 4367 + }, + { + "epoch": 0.42121504339440696, + "grad_norm": 3.580972194671631, + "learning_rate": 3.113211357442429e-05, + "loss": 5.6992, + "step": 4368 + }, + { + "epoch": 0.42131147540983604, + "grad_norm": 3.040675640106201, + "learning_rate": 3.112477090994761e-05, + "loss": 5.9218, + "step": 4369 + }, + { + "epoch": 0.4214079074252652, + "grad_norm": 3.725537061691284, + "learning_rate": 3.1117427683347e-05, + "loss": 6.1514, + "step": 4370 + }, + { + "epoch": 0.4215043394406943, + "grad_norm": 3.773148536682129, + "learning_rate": 3.11100838952964e-05, + "loss": 6.0275, + "step": 4371 + }, + { + "epoch": 0.42160077145612346, + "grad_norm": 4.032921314239502, + "learning_rate": 3.110273954646983e-05, + "loss": 6.2439, + "step": 4372 + }, + { + "epoch": 0.42169720347155254, + "grad_norm": 4.488618850708008, + "learning_rate": 3.109539463754133e-05, + "loss": 6.3286, + "step": 4373 + }, + { + "epoch": 0.4217936354869817, + "grad_norm": 3.4174468517303467, + "learning_rate": 3.108804916918501e-05, + "loss": 6.1873, + "step": 4374 + }, + { + "epoch": 0.4218900675024108, + "grad_norm": 3.1257057189941406, + "learning_rate": 3.108070314207503e-05, + "loss": 6.1601, + "step": 4375 + }, + { + "epoch": 0.4219864995178399, + "grad_norm": 2.4066531658172607, + "learning_rate": 3.1073356556885605e-05, + "loss": 6.3188, + "step": 4376 + }, + { + "epoch": 0.42208293153326903, + "grad_norm": 2.1198043823242188, + "learning_rate": 3.1066009414290985e-05, + "loss": 6.1814, + "step": 4377 + }, + { + "epoch": 0.42217936354869817, + "grad_norm": 3.2345993518829346, + "learning_rate": 3.105866171496549e-05, + "loss": 6.1083, + "step": 4378 + }, + { + "epoch": 0.4222757955641273, + "grad_norm": 3.0995092391967773, + "learning_rate": 3.105131345958348e-05, + "loss": 6.044, + "step": 4379 + }, + { + "epoch": 0.4223722275795564, + "grad_norm": 2.0474205017089844, + "learning_rate": 3.104396464881936e-05, + "loss": 6.1383, + "step": 4380 + }, + { + "epoch": 0.42246865959498553, + "grad_norm": 2.630143880844116, + "learning_rate": 3.103661528334762e-05, + "loss": 6.2926, + "step": 4381 + }, + { + "epoch": 0.42256509161041467, + "grad_norm": 2.690675735473633, + "learning_rate": 3.102926536384275e-05, + "loss": 6.3585, + "step": 4382 + }, + { + "epoch": 0.4226615236258438, + "grad_norm": 3.0712292194366455, + "learning_rate": 3.102191489097933e-05, + "loss": 6.126, + "step": 4383 + }, + { + "epoch": 0.4227579556412729, + "grad_norm": 2.7615716457366943, + "learning_rate": 3.1014563865431965e-05, + "loss": 6.1586, + "step": 4384 + }, + { + "epoch": 0.422854387656702, + "grad_norm": 2.472567558288574, + "learning_rate": 3.100721228787533e-05, + "loss": 6.229, + "step": 4385 + }, + { + "epoch": 0.42295081967213116, + "grad_norm": 2.7784323692321777, + "learning_rate": 3.099986015898415e-05, + "loss": 6.0608, + "step": 4386 + }, + { + "epoch": 0.42304725168756024, + "grad_norm": 2.7623391151428223, + "learning_rate": 3.099250747943319e-05, + "loss": 6.033, + "step": 4387 + }, + { + "epoch": 0.4231436837029894, + "grad_norm": 2.4268386363983154, + "learning_rate": 3.098515424989727e-05, + "loss": 6.1371, + "step": 4388 + }, + { + "epoch": 0.4232401157184185, + "grad_norm": 2.914217948913574, + "learning_rate": 3.097780047105126e-05, + "loss": 6.0676, + "step": 4389 + }, + { + "epoch": 0.42333654773384766, + "grad_norm": 2.816890001296997, + "learning_rate": 3.0970446143570075e-05, + "loss": 5.9925, + "step": 4390 + }, + { + "epoch": 0.42343297974927674, + "grad_norm": 2.3451240062713623, + "learning_rate": 3.096309126812869e-05, + "loss": 6.1396, + "step": 4391 + }, + { + "epoch": 0.4235294117647059, + "grad_norm": 2.3612167835235596, + "learning_rate": 3.095573584540213e-05, + "loss": 5.3923, + "step": 4392 + }, + { + "epoch": 0.423625843780135, + "grad_norm": 3.6404101848602295, + "learning_rate": 3.094837987606547e-05, + "loss": 6.1513, + "step": 4393 + }, + { + "epoch": 0.42372227579556415, + "grad_norm": 3.6208133697509766, + "learning_rate": 3.094102336079382e-05, + "loss": 6.214, + "step": 4394 + }, + { + "epoch": 0.42381870781099323, + "grad_norm": 2.6462461948394775, + "learning_rate": 3.0933666300262363e-05, + "loss": 6.2498, + "step": 4395 + }, + { + "epoch": 0.42391513982642237, + "grad_norm": 2.7120420932769775, + "learning_rate": 3.092630869514632e-05, + "loss": 6.0292, + "step": 4396 + }, + { + "epoch": 0.4240115718418515, + "grad_norm": 3.255413055419922, + "learning_rate": 3.091895054612095e-05, + "loss": 6.0856, + "step": 4397 + }, + { + "epoch": 0.4241080038572806, + "grad_norm": 2.8234059810638428, + "learning_rate": 3.091159185386159e-05, + "loss": 6.0659, + "step": 4398 + }, + { + "epoch": 0.42420443587270973, + "grad_norm": 2.1785881519317627, + "learning_rate": 3.09042326190436e-05, + "loss": 5.9609, + "step": 4399 + }, + { + "epoch": 0.42430086788813887, + "grad_norm": 1.9983094930648804, + "learning_rate": 3.089687284234241e-05, + "loss": 6.032, + "step": 4400 + }, + { + "epoch": 0.424397299903568, + "grad_norm": 2.8947391510009766, + "learning_rate": 3.088951252443349e-05, + "loss": 6.0158, + "step": 4401 + }, + { + "epoch": 0.4244937319189971, + "grad_norm": 2.8418667316436768, + "learning_rate": 3.088215166599235e-05, + "loss": 6.0675, + "step": 4402 + }, + { + "epoch": 0.4245901639344262, + "grad_norm": 2.118518114089966, + "learning_rate": 3.087479026769459e-05, + "loss": 6.0447, + "step": 4403 + }, + { + "epoch": 0.42468659594985536, + "grad_norm": 2.156247615814209, + "learning_rate": 3.086742833021579e-05, + "loss": 6.0348, + "step": 4404 + }, + { + "epoch": 0.4247830279652845, + "grad_norm": 2.7349579334259033, + "learning_rate": 3.086006585423165e-05, + "loss": 5.9683, + "step": 4405 + }, + { + "epoch": 0.4248794599807136, + "grad_norm": 2.164454460144043, + "learning_rate": 3.085270284041787e-05, + "loss": 6.0498, + "step": 4406 + }, + { + "epoch": 0.4249758919961427, + "grad_norm": 2.575610876083374, + "learning_rate": 3.0845339289450234e-05, + "loss": 6.1428, + "step": 4407 + }, + { + "epoch": 0.42507232401157186, + "grad_norm": 2.0492475032806396, + "learning_rate": 3.083797520200455e-05, + "loss": 5.9076, + "step": 4408 + }, + { + "epoch": 0.42516875602700094, + "grad_norm": 1.6269171237945557, + "learning_rate": 3.0830610578756684e-05, + "loss": 6.0594, + "step": 4409 + }, + { + "epoch": 0.4252651880424301, + "grad_norm": 2.3574559688568115, + "learning_rate": 3.082324542038256e-05, + "loss": 6.035, + "step": 4410 + }, + { + "epoch": 0.4253616200578592, + "grad_norm": 2.369328737258911, + "learning_rate": 3.081587972755814e-05, + "loss": 6.0604, + "step": 4411 + }, + { + "epoch": 0.42545805207328835, + "grad_norm": 1.8043686151504517, + "learning_rate": 3.080851350095943e-05, + "loss": 6.1385, + "step": 4412 + }, + { + "epoch": 0.42555448408871743, + "grad_norm": 1.6586300134658813, + "learning_rate": 3.080114674126251e-05, + "loss": 5.9737, + "step": 4413 + }, + { + "epoch": 0.42565091610414657, + "grad_norm": 1.8654643297195435, + "learning_rate": 3.079377944914348e-05, + "loss": 5.9617, + "step": 4414 + }, + { + "epoch": 0.4257473481195757, + "grad_norm": 1.6123952865600586, + "learning_rate": 3.07864116252785e-05, + "loss": 6.0089, + "step": 4415 + }, + { + "epoch": 0.42584378013500485, + "grad_norm": 1.9785805940628052, + "learning_rate": 3.0779043270343774e-05, + "loss": 6.0999, + "step": 4416 + }, + { + "epoch": 0.42594021215043393, + "grad_norm": 1.8903260231018066, + "learning_rate": 3.077167438501558e-05, + "loss": 6.0066, + "step": 4417 + }, + { + "epoch": 0.42603664416586307, + "grad_norm": 2.0598058700561523, + "learning_rate": 3.076430496997022e-05, + "loss": 6.0406, + "step": 4418 + }, + { + "epoch": 0.4261330761812922, + "grad_norm": 1.5507771968841553, + "learning_rate": 3.075693502588404e-05, + "loss": 6.121, + "step": 4419 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 1.7562925815582275, + "learning_rate": 3.074956455343344e-05, + "loss": 6.0117, + "step": 4420 + }, + { + "epoch": 0.4263259402121504, + "grad_norm": 2.071751594543457, + "learning_rate": 3.07421935532949e-05, + "loss": 6.0997, + "step": 4421 + }, + { + "epoch": 0.42642237222757956, + "grad_norm": 1.9922246932983398, + "learning_rate": 3.073482202614488e-05, + "loss": 6.0066, + "step": 4422 + }, + { + "epoch": 0.4265188042430087, + "grad_norm": 1.6551204919815063, + "learning_rate": 3.072744997265997e-05, + "loss": 6.0495, + "step": 4423 + }, + { + "epoch": 0.4266152362584378, + "grad_norm": 1.9730483293533325, + "learning_rate": 3.072007739351674e-05, + "loss": 5.9552, + "step": 4424 + }, + { + "epoch": 0.4267116682738669, + "grad_norm": 1.9793986082077026, + "learning_rate": 3.0712704289391855e-05, + "loss": 6.006, + "step": 4425 + }, + { + "epoch": 0.42680810028929606, + "grad_norm": 1.888657808303833, + "learning_rate": 3.0705330660962e-05, + "loss": 6.0229, + "step": 4426 + }, + { + "epoch": 0.4269045323047252, + "grad_norm": 1.1180925369262695, + "learning_rate": 3.0697956508903916e-05, + "loss": 6.0196, + "step": 4427 + }, + { + "epoch": 0.4270009643201543, + "grad_norm": 1.8313255310058594, + "learning_rate": 3.06905818338944e-05, + "loss": 5.9359, + "step": 4428 + }, + { + "epoch": 0.4270973963355834, + "grad_norm": 2.045642614364624, + "learning_rate": 3.0683206636610286e-05, + "loss": 6.0319, + "step": 4429 + }, + { + "epoch": 0.42719382835101255, + "grad_norm": 1.322089672088623, + "learning_rate": 3.067583091772846e-05, + "loss": 5.9734, + "step": 4430 + }, + { + "epoch": 0.42729026036644163, + "grad_norm": 1.8459985256195068, + "learning_rate": 3.0668454677925855e-05, + "loss": 5.9807, + "step": 4431 + }, + { + "epoch": 0.42738669238187077, + "grad_norm": 2.3729968070983887, + "learning_rate": 3.066107791787947e-05, + "loss": 5.9584, + "step": 4432 + }, + { + "epoch": 0.4274831243972999, + "grad_norm": 1.377791166305542, + "learning_rate": 3.065370063826631e-05, + "loss": 5.9682, + "step": 4433 + }, + { + "epoch": 0.42757955641272904, + "grad_norm": 1.7881115674972534, + "learning_rate": 3.0646322839763465e-05, + "loss": 5.9496, + "step": 4434 + }, + { + "epoch": 0.4276759884281581, + "grad_norm": 1.9791338443756104, + "learning_rate": 3.063894452304806e-05, + "loss": 6.0356, + "step": 4435 + }, + { + "epoch": 0.42777242044358726, + "grad_norm": 3.794464111328125, + "learning_rate": 3.063156568879727e-05, + "loss": 5.9538, + "step": 4436 + }, + { + "epoch": 0.4278688524590164, + "grad_norm": 1.9020928144454956, + "learning_rate": 3.062418633768831e-05, + "loss": 5.9441, + "step": 4437 + }, + { + "epoch": 0.42796528447444554, + "grad_norm": 2.1376140117645264, + "learning_rate": 3.061680647039845e-05, + "loss": 5.8663, + "step": 4438 + }, + { + "epoch": 0.4280617164898746, + "grad_norm": 2.044980049133301, + "learning_rate": 3.0609426087605e-05, + "loss": 5.8145, + "step": 4439 + }, + { + "epoch": 0.42815814850530376, + "grad_norm": 2.1787548065185547, + "learning_rate": 3.060204518998534e-05, + "loss": 5.8847, + "step": 4440 + }, + { + "epoch": 0.4282545805207329, + "grad_norm": 1.8018205165863037, + "learning_rate": 3.0594663778216866e-05, + "loss": 5.8981, + "step": 4441 + }, + { + "epoch": 0.428351012536162, + "grad_norm": 2.767059564590454, + "learning_rate": 3.058728185297703e-05, + "loss": 5.963, + "step": 4442 + }, + { + "epoch": 0.4284474445515911, + "grad_norm": 2.3508951663970947, + "learning_rate": 3.057989941494335e-05, + "loss": 6.0059, + "step": 4443 + }, + { + "epoch": 0.42854387656702025, + "grad_norm": 1.5310100317001343, + "learning_rate": 3.0572516464793354e-05, + "loss": 6.0079, + "step": 4444 + }, + { + "epoch": 0.4286403085824494, + "grad_norm": 1.9829349517822266, + "learning_rate": 3.0565133003204665e-05, + "loss": 5.9388, + "step": 4445 + }, + { + "epoch": 0.4287367405978785, + "grad_norm": 1.6321916580200195, + "learning_rate": 3.055774903085492e-05, + "loss": 5.8991, + "step": 4446 + }, + { + "epoch": 0.4288331726133076, + "grad_norm": 2.252790689468384, + "learning_rate": 3.055036454842181e-05, + "loss": 6.0042, + "step": 4447 + }, + { + "epoch": 0.42892960462873675, + "grad_norm": 2.3205156326293945, + "learning_rate": 3.054297955658307e-05, + "loss": 6.0475, + "step": 4448 + }, + { + "epoch": 0.4290260366441659, + "grad_norm": 3.1712841987609863, + "learning_rate": 3.053559405601649e-05, + "loss": 5.9221, + "step": 4449 + }, + { + "epoch": 0.42912246865959497, + "grad_norm": 4.352069854736328, + "learning_rate": 3.0528208047399895e-05, + "loss": 5.7522, + "step": 4450 + }, + { + "epoch": 0.4292189006750241, + "grad_norm": 3.1827523708343506, + "learning_rate": 3.0520821531411176e-05, + "loss": 5.6902, + "step": 4451 + }, + { + "epoch": 0.42931533269045324, + "grad_norm": 2.9556596279144287, + "learning_rate": 3.051343450872825e-05, + "loss": 5.6583, + "step": 4452 + }, + { + "epoch": 0.4294117647058823, + "grad_norm": 3.7073776721954346, + "learning_rate": 3.0506046980029092e-05, + "loss": 5.6196, + "step": 4453 + }, + { + "epoch": 0.42950819672131146, + "grad_norm": 2.9140772819519043, + "learning_rate": 3.049865894599172e-05, + "loss": 5.6079, + "step": 4454 + }, + { + "epoch": 0.4296046287367406, + "grad_norm": 3.8415629863739014, + "learning_rate": 3.0491270407294192e-05, + "loss": 6.0251, + "step": 4455 + }, + { + "epoch": 0.42970106075216974, + "grad_norm": 3.049053430557251, + "learning_rate": 3.0483881364614632e-05, + "loss": 6.1455, + "step": 4456 + }, + { + "epoch": 0.4297974927675988, + "grad_norm": 3.318434476852417, + "learning_rate": 3.047649181863118e-05, + "loss": 5.9971, + "step": 4457 + }, + { + "epoch": 0.42989392478302796, + "grad_norm": 1.8702200651168823, + "learning_rate": 3.0469101770022062e-05, + "loss": 5.9925, + "step": 4458 + }, + { + "epoch": 0.4299903567984571, + "grad_norm": 2.952230215072632, + "learning_rate": 3.0461711219465512e-05, + "loss": 6.0746, + "step": 4459 + }, + { + "epoch": 0.43008678881388623, + "grad_norm": 2.0051352977752686, + "learning_rate": 3.0454320167639823e-05, + "loss": 6.1272, + "step": 4460 + }, + { + "epoch": 0.4301832208293153, + "grad_norm": 2.1203091144561768, + "learning_rate": 3.044692861522334e-05, + "loss": 5.9764, + "step": 4461 + }, + { + "epoch": 0.43027965284474445, + "grad_norm": 1.9102866649627686, + "learning_rate": 3.043953656289446e-05, + "loss": 5.9854, + "step": 4462 + }, + { + "epoch": 0.4303760848601736, + "grad_norm": 1.9572067260742188, + "learning_rate": 3.043214401133161e-05, + "loss": 6.0698, + "step": 4463 + }, + { + "epoch": 0.4304725168756027, + "grad_norm": 2.617720365524292, + "learning_rate": 3.042475096121326e-05, + "loss": 5.9461, + "step": 4464 + }, + { + "epoch": 0.4305689488910318, + "grad_norm": 2.710683584213257, + "learning_rate": 3.0417357413217956e-05, + "loss": 5.8772, + "step": 4465 + }, + { + "epoch": 0.43066538090646095, + "grad_norm": 2.9239540100097656, + "learning_rate": 3.0409963368024246e-05, + "loss": 5.9017, + "step": 4466 + }, + { + "epoch": 0.4307618129218901, + "grad_norm": 2.2989535331726074, + "learning_rate": 3.0402568826310756e-05, + "loss": 6.0467, + "step": 4467 + }, + { + "epoch": 0.43085824493731917, + "grad_norm": 2.538550853729248, + "learning_rate": 3.0395173788756155e-05, + "loss": 5.9057, + "step": 4468 + }, + { + "epoch": 0.4309546769527483, + "grad_norm": 2.809208869934082, + "learning_rate": 3.0387778256039145e-05, + "loss": 6.0409, + "step": 4469 + }, + { + "epoch": 0.43105110896817744, + "grad_norm": 1.9321829080581665, + "learning_rate": 3.0380382228838472e-05, + "loss": 5.9948, + "step": 4470 + }, + { + "epoch": 0.4311475409836066, + "grad_norm": 2.482245445251465, + "learning_rate": 3.037298570783294e-05, + "loss": 5.9154, + "step": 4471 + }, + { + "epoch": 0.43124397299903566, + "grad_norm": 2.2414376735687256, + "learning_rate": 3.0365588693701396e-05, + "loss": 6.0412, + "step": 4472 + }, + { + "epoch": 0.4313404050144648, + "grad_norm": 2.1842308044433594, + "learning_rate": 3.0358191187122726e-05, + "loss": 6.0086, + "step": 4473 + }, + { + "epoch": 0.43143683702989394, + "grad_norm": 3.074014186859131, + "learning_rate": 3.035079318877586e-05, + "loss": 5.9406, + "step": 4474 + }, + { + "epoch": 0.431533269045323, + "grad_norm": 1.8939110040664673, + "learning_rate": 3.0343394699339785e-05, + "loss": 6.222, + "step": 4475 + }, + { + "epoch": 0.43162970106075216, + "grad_norm": 1.5995899438858032, + "learning_rate": 3.033599571949352e-05, + "loss": 6.041, + "step": 4476 + }, + { + "epoch": 0.4317261330761813, + "grad_norm": 1.9957300424575806, + "learning_rate": 3.0328596249916135e-05, + "loss": 5.9492, + "step": 4477 + }, + { + "epoch": 0.43182256509161043, + "grad_norm": 1.9891420602798462, + "learning_rate": 3.0321196291286742e-05, + "loss": 5.945, + "step": 4478 + }, + { + "epoch": 0.4319189971070395, + "grad_norm": 1.9223721027374268, + "learning_rate": 3.0313795844284502e-05, + "loss": 5.8015, + "step": 4479 + }, + { + "epoch": 0.43201542912246865, + "grad_norm": 1.8288617134094238, + "learning_rate": 3.030639490958863e-05, + "loss": 5.978, + "step": 4480 + }, + { + "epoch": 0.4321118611378978, + "grad_norm": 2.1895554065704346, + "learning_rate": 3.0298993487878357e-05, + "loss": 6.0019, + "step": 4481 + }, + { + "epoch": 0.43220829315332693, + "grad_norm": 1.942736029624939, + "learning_rate": 3.0291591579832977e-05, + "loss": 6.0019, + "step": 4482 + }, + { + "epoch": 0.432304725168756, + "grad_norm": 3.137251615524292, + "learning_rate": 3.0284189186131845e-05, + "loss": 5.7455, + "step": 4483 + }, + { + "epoch": 0.43240115718418515, + "grad_norm": 1.9539223909378052, + "learning_rate": 3.0276786307454335e-05, + "loss": 5.9207, + "step": 4484 + }, + { + "epoch": 0.4324975891996143, + "grad_norm": 1.9083729982376099, + "learning_rate": 3.0269382944479862e-05, + "loss": 5.9499, + "step": 4485 + }, + { + "epoch": 0.43259402121504337, + "grad_norm": 2.252807140350342, + "learning_rate": 3.0261979097887917e-05, + "loss": 6.1001, + "step": 4486 + }, + { + "epoch": 0.4326904532304725, + "grad_norm": 1.8412070274353027, + "learning_rate": 3.025457476835801e-05, + "loss": 5.8592, + "step": 4487 + }, + { + "epoch": 0.43278688524590164, + "grad_norm": 1.5838052034378052, + "learning_rate": 3.024716995656969e-05, + "loss": 5.9008, + "step": 4488 + }, + { + "epoch": 0.4328833172613308, + "grad_norm": 1.7486854791641235, + "learning_rate": 3.0239764663202562e-05, + "loss": 5.9942, + "step": 4489 + }, + { + "epoch": 0.43297974927675986, + "grad_norm": 1.7742631435394287, + "learning_rate": 3.023235888893629e-05, + "loss": 5.9587, + "step": 4490 + }, + { + "epoch": 0.433076181292189, + "grad_norm": 2.0345559120178223, + "learning_rate": 3.022495263445056e-05, + "loss": 5.8807, + "step": 4491 + }, + { + "epoch": 0.43317261330761814, + "grad_norm": 2.0831000804901123, + "learning_rate": 3.021754590042511e-05, + "loss": 5.9648, + "step": 4492 + }, + { + "epoch": 0.4332690453230473, + "grad_norm": 2.1899194717407227, + "learning_rate": 3.0210138687539718e-05, + "loss": 5.8115, + "step": 4493 + }, + { + "epoch": 0.43336547733847636, + "grad_norm": 1.5249698162078857, + "learning_rate": 3.020273099647421e-05, + "loss": 5.9171, + "step": 4494 + }, + { + "epoch": 0.4334619093539055, + "grad_norm": 1.8304593563079834, + "learning_rate": 3.019532282790845e-05, + "loss": 5.9503, + "step": 4495 + }, + { + "epoch": 0.43355834136933463, + "grad_norm": 2.037752628326416, + "learning_rate": 3.018791418252236e-05, + "loss": 5.9747, + "step": 4496 + }, + { + "epoch": 0.4336547733847637, + "grad_norm": 1.5959174633026123, + "learning_rate": 3.018050506099589e-05, + "loss": 6.0601, + "step": 4497 + }, + { + "epoch": 0.43375120540019285, + "grad_norm": 1.5971338748931885, + "learning_rate": 3.017309546400905e-05, + "loss": 5.9252, + "step": 4498 + }, + { + "epoch": 0.433847637415622, + "grad_norm": 1.8264375925064087, + "learning_rate": 3.0165685392241866e-05, + "loss": 5.9515, + "step": 4499 + }, + { + "epoch": 0.4339440694310511, + "grad_norm": 1.9429938793182373, + "learning_rate": 3.0158274846374434e-05, + "loss": 5.899, + "step": 4500 + }, + { + "epoch": 0.4340405014464802, + "grad_norm": 2.230384111404419, + "learning_rate": 3.0150863827086885e-05, + "loss": 6.0656, + "step": 4501 + }, + { + "epoch": 0.43413693346190935, + "grad_norm": 1.8838225603103638, + "learning_rate": 3.0143452335059402e-05, + "loss": 5.9167, + "step": 4502 + }, + { + "epoch": 0.4342333654773385, + "grad_norm": 2.8294992446899414, + "learning_rate": 3.0136040370972184e-05, + "loss": 5.8376, + "step": 4503 + }, + { + "epoch": 0.4343297974927676, + "grad_norm": 2.7601897716522217, + "learning_rate": 3.012862793550551e-05, + "loss": 5.8555, + "step": 4504 + }, + { + "epoch": 0.4344262295081967, + "grad_norm": 1.4290733337402344, + "learning_rate": 3.0121215029339673e-05, + "loss": 5.8915, + "step": 4505 + }, + { + "epoch": 0.43452266152362584, + "grad_norm": 2.090113639831543, + "learning_rate": 3.0113801653155026e-05, + "loss": 5.795, + "step": 4506 + }, + { + "epoch": 0.434619093539055, + "grad_norm": 2.41654634475708, + "learning_rate": 3.0106387807631954e-05, + "loss": 5.8018, + "step": 4507 + }, + { + "epoch": 0.43471552555448406, + "grad_norm": 1.5799481868743896, + "learning_rate": 3.0098973493450892e-05, + "loss": 5.8659, + "step": 4508 + }, + { + "epoch": 0.4348119575699132, + "grad_norm": 1.7984704971313477, + "learning_rate": 3.009155871129233e-05, + "loss": 5.7608, + "step": 4509 + }, + { + "epoch": 0.43490838958534234, + "grad_norm": 2.2773373126983643, + "learning_rate": 3.0084143461836766e-05, + "loss": 5.8313, + "step": 4510 + }, + { + "epoch": 0.4350048216007715, + "grad_norm": 1.668891191482544, + "learning_rate": 3.007672774576477e-05, + "loss": 5.8782, + "step": 4511 + }, + { + "epoch": 0.43510125361620056, + "grad_norm": 2.3589398860931396, + "learning_rate": 3.0069311563756958e-05, + "loss": 5.9407, + "step": 4512 + }, + { + "epoch": 0.4351976856316297, + "grad_norm": 2.5026803016662598, + "learning_rate": 3.0061894916493967e-05, + "loss": 5.8752, + "step": 4513 + }, + { + "epoch": 0.43529411764705883, + "grad_norm": 3.2212181091308594, + "learning_rate": 3.0054477804656484e-05, + "loss": 5.4871, + "step": 4514 + }, + { + "epoch": 0.43539054966248797, + "grad_norm": 2.3020856380462646, + "learning_rate": 3.0047060228925256e-05, + "loss": 5.8836, + "step": 4515 + }, + { + "epoch": 0.43548698167791705, + "grad_norm": 1.9583982229232788, + "learning_rate": 3.0039642189981048e-05, + "loss": 5.9577, + "step": 4516 + }, + { + "epoch": 0.4355834136933462, + "grad_norm": 2.115659713745117, + "learning_rate": 3.003222368850469e-05, + "loss": 5.8271, + "step": 4517 + }, + { + "epoch": 0.4356798457087753, + "grad_norm": 2.3606715202331543, + "learning_rate": 3.0024804725177034e-05, + "loss": 5.9703, + "step": 4518 + }, + { + "epoch": 0.4357762777242044, + "grad_norm": 1.8169665336608887, + "learning_rate": 3.001738530067898e-05, + "loss": 5.9272, + "step": 4519 + }, + { + "epoch": 0.43587270973963355, + "grad_norm": 1.83182692527771, + "learning_rate": 3.0009965415691488e-05, + "loss": 5.9383, + "step": 4520 + }, + { + "epoch": 0.4359691417550627, + "grad_norm": 2.3630592823028564, + "learning_rate": 3.0002545070895525e-05, + "loss": 5.8602, + "step": 4521 + }, + { + "epoch": 0.4360655737704918, + "grad_norm": 2.1545557975769043, + "learning_rate": 2.9995124266972136e-05, + "loss": 5.9654, + "step": 4522 + }, + { + "epoch": 0.4361620057859209, + "grad_norm": 2.275834560394287, + "learning_rate": 2.998770300460239e-05, + "loss": 5.8956, + "step": 4523 + }, + { + "epoch": 0.43625843780135004, + "grad_norm": 2.4795751571655273, + "learning_rate": 2.998028128446741e-05, + "loss": 5.7955, + "step": 4524 + }, + { + "epoch": 0.4363548698167792, + "grad_norm": 1.9089329242706299, + "learning_rate": 2.997285910724833e-05, + "loss": 5.9109, + "step": 4525 + }, + { + "epoch": 0.4364513018322083, + "grad_norm": 1.7523109912872314, + "learning_rate": 2.9965436473626368e-05, + "loss": 5.6736, + "step": 4526 + }, + { + "epoch": 0.4365477338476374, + "grad_norm": 2.1900861263275146, + "learning_rate": 2.9958013384282757e-05, + "loss": 5.7824, + "step": 4527 + }, + { + "epoch": 0.43664416586306654, + "grad_norm": 1.8568546772003174, + "learning_rate": 2.995058983989878e-05, + "loss": 5.9531, + "step": 4528 + }, + { + "epoch": 0.4367405978784957, + "grad_norm": 2.269458293914795, + "learning_rate": 2.994316584115576e-05, + "loss": 5.9366, + "step": 4529 + }, + { + "epoch": 0.43683702989392476, + "grad_norm": 3.3308162689208984, + "learning_rate": 2.9935741388735062e-05, + "loss": 5.7579, + "step": 4530 + }, + { + "epoch": 0.4369334619093539, + "grad_norm": 2.4188337326049805, + "learning_rate": 2.9928316483318104e-05, + "loss": 5.7192, + "step": 4531 + }, + { + "epoch": 0.43702989392478303, + "grad_norm": 2.4807748794555664, + "learning_rate": 2.9920891125586315e-05, + "loss": 5.7457, + "step": 4532 + }, + { + "epoch": 0.43712632594021217, + "grad_norm": 3.4951632022857666, + "learning_rate": 2.9913465316221196e-05, + "loss": 5.9697, + "step": 4533 + }, + { + "epoch": 0.43722275795564125, + "grad_norm": 2.797546148300171, + "learning_rate": 2.9906039055904278e-05, + "loss": 5.881, + "step": 4534 + }, + { + "epoch": 0.4373191899710704, + "grad_norm": 2.4035441875457764, + "learning_rate": 2.9898612345317133e-05, + "loss": 5.8749, + "step": 4535 + }, + { + "epoch": 0.4374156219864995, + "grad_norm": 2.3370091915130615, + "learning_rate": 2.9891185185141373e-05, + "loss": 5.8787, + "step": 4536 + }, + { + "epoch": 0.43751205400192866, + "grad_norm": 3.2917826175689697, + "learning_rate": 2.9883757576058663e-05, + "loss": 5.9191, + "step": 4537 + }, + { + "epoch": 0.43760848601735775, + "grad_norm": 2.602221727371216, + "learning_rate": 2.987632951875069e-05, + "loss": 5.8062, + "step": 4538 + }, + { + "epoch": 0.4377049180327869, + "grad_norm": 1.6402456760406494, + "learning_rate": 2.98689010138992e-05, + "loss": 5.8311, + "step": 4539 + }, + { + "epoch": 0.437801350048216, + "grad_norm": 2.738534688949585, + "learning_rate": 2.986147206218597e-05, + "loss": 5.748, + "step": 4540 + }, + { + "epoch": 0.43789778206364516, + "grad_norm": 3.229624032974243, + "learning_rate": 2.985404266429281e-05, + "loss": 5.8692, + "step": 4541 + }, + { + "epoch": 0.43799421407907424, + "grad_norm": 1.9406849145889282, + "learning_rate": 2.9846612820901587e-05, + "loss": 5.8389, + "step": 4542 + }, + { + "epoch": 0.4380906460945034, + "grad_norm": 1.9155915975570679, + "learning_rate": 2.9839182532694214e-05, + "loss": 5.9074, + "step": 4543 + }, + { + "epoch": 0.4381870781099325, + "grad_norm": 3.0142741203308105, + "learning_rate": 2.983175180035262e-05, + "loss": 5.9038, + "step": 4544 + }, + { + "epoch": 0.4382835101253616, + "grad_norm": 1.8488296270370483, + "learning_rate": 2.9824320624558788e-05, + "loss": 5.9581, + "step": 4545 + }, + { + "epoch": 0.43837994214079073, + "grad_norm": 2.036637306213379, + "learning_rate": 2.981688900599476e-05, + "loss": 5.7115, + "step": 4546 + }, + { + "epoch": 0.4384763741562199, + "grad_norm": 1.990240216255188, + "learning_rate": 2.9809456945342572e-05, + "loss": 5.777, + "step": 4547 + }, + { + "epoch": 0.438572806171649, + "grad_norm": 1.4908528327941895, + "learning_rate": 2.9802024443284355e-05, + "loss": 5.7613, + "step": 4548 + }, + { + "epoch": 0.4386692381870781, + "grad_norm": 1.802048921585083, + "learning_rate": 2.979459150050224e-05, + "loss": 5.7428, + "step": 4549 + }, + { + "epoch": 0.43876567020250723, + "grad_norm": 1.6029032468795776, + "learning_rate": 2.978715811767842e-05, + "loss": 5.7179, + "step": 4550 + }, + { + "epoch": 0.43886210221793637, + "grad_norm": 1.47156822681427, + "learning_rate": 2.977972429549512e-05, + "loss": 5.7062, + "step": 4551 + }, + { + "epoch": 0.4389585342333655, + "grad_norm": 1.8351643085479736, + "learning_rate": 2.977229003463461e-05, + "loss": 5.7177, + "step": 4552 + }, + { + "epoch": 0.4390549662487946, + "grad_norm": 1.7320375442504883, + "learning_rate": 2.9764855335779185e-05, + "loss": 5.7412, + "step": 4553 + }, + { + "epoch": 0.4391513982642237, + "grad_norm": 1.9940160512924194, + "learning_rate": 2.9757420199611214e-05, + "loss": 5.9293, + "step": 4554 + }, + { + "epoch": 0.43924783027965286, + "grad_norm": 2.1081297397613525, + "learning_rate": 2.9749984626813066e-05, + "loss": 5.5378, + "step": 4555 + }, + { + "epoch": 0.43934426229508194, + "grad_norm": 2.2861688137054443, + "learning_rate": 2.974254861806717e-05, + "loss": 5.8048, + "step": 4556 + }, + { + "epoch": 0.4394406943105111, + "grad_norm": 2.217698812484741, + "learning_rate": 2.9735112174056006e-05, + "loss": 5.9519, + "step": 4557 + }, + { + "epoch": 0.4395371263259402, + "grad_norm": 2.6088693141937256, + "learning_rate": 2.972767529546207e-05, + "loss": 5.9928, + "step": 4558 + }, + { + "epoch": 0.43963355834136936, + "grad_norm": 1.5854642391204834, + "learning_rate": 2.972023798296792e-05, + "loss": 5.9762, + "step": 4559 + }, + { + "epoch": 0.43972999035679844, + "grad_norm": 2.4940972328186035, + "learning_rate": 2.9712800237256132e-05, + "loss": 5.9032, + "step": 4560 + }, + { + "epoch": 0.4398264223722276, + "grad_norm": 2.5709633827209473, + "learning_rate": 2.9705362059009344e-05, + "loss": 5.9679, + "step": 4561 + }, + { + "epoch": 0.4399228543876567, + "grad_norm": 1.6961593627929688, + "learning_rate": 2.9697923448910215e-05, + "loss": 5.8977, + "step": 4562 + }, + { + "epoch": 0.44001928640308585, + "grad_norm": 2.371084213256836, + "learning_rate": 2.9690484407641456e-05, + "loss": 5.807, + "step": 4563 + }, + { + "epoch": 0.44011571841851493, + "grad_norm": 3.859161853790283, + "learning_rate": 2.9683044935885806e-05, + "loss": 6.1426, + "step": 4564 + }, + { + "epoch": 0.44021215043394407, + "grad_norm": 2.611618757247925, + "learning_rate": 2.9675605034326066e-05, + "loss": 5.8201, + "step": 4565 + }, + { + "epoch": 0.4403085824493732, + "grad_norm": 1.5887856483459473, + "learning_rate": 2.966816470364504e-05, + "loss": 5.7723, + "step": 4566 + }, + { + "epoch": 0.4404050144648023, + "grad_norm": 3.0172994136810303, + "learning_rate": 2.9660723944525608e-05, + "loss": 5.78, + "step": 4567 + }, + { + "epoch": 0.44050144648023143, + "grad_norm": 2.8755948543548584, + "learning_rate": 2.9653282757650673e-05, + "loss": 5.6835, + "step": 4568 + }, + { + "epoch": 0.44059787849566057, + "grad_norm": 2.5783846378326416, + "learning_rate": 2.9645841143703167e-05, + "loss": 5.1668, + "step": 4569 + }, + { + "epoch": 0.4406943105110897, + "grad_norm": 2.371410369873047, + "learning_rate": 2.9638399103366078e-05, + "loss": 5.8069, + "step": 4570 + }, + { + "epoch": 0.4407907425265188, + "grad_norm": 3.0300605297088623, + "learning_rate": 2.9630956637322434e-05, + "loss": 5.7856, + "step": 4571 + }, + { + "epoch": 0.4408871745419479, + "grad_norm": 2.296347141265869, + "learning_rate": 2.962351374625529e-05, + "loss": 5.8088, + "step": 4572 + }, + { + "epoch": 0.44098360655737706, + "grad_norm": 1.9445009231567383, + "learning_rate": 2.9616070430847737e-05, + "loss": 6.0402, + "step": 4573 + }, + { + "epoch": 0.4410800385728062, + "grad_norm": 2.3200364112854004, + "learning_rate": 2.9608626691782925e-05, + "loss": 5.801, + "step": 4574 + }, + { + "epoch": 0.4411764705882353, + "grad_norm": 2.2970495223999023, + "learning_rate": 2.9601182529744025e-05, + "loss": 5.7258, + "step": 4575 + }, + { + "epoch": 0.4412729026036644, + "grad_norm": 2.429584264755249, + "learning_rate": 2.9593737945414264e-05, + "loss": 5.7779, + "step": 4576 + }, + { + "epoch": 0.44136933461909356, + "grad_norm": 2.8027942180633545, + "learning_rate": 2.9586292939476883e-05, + "loss": 5.8786, + "step": 4577 + }, + { + "epoch": 0.44146576663452264, + "grad_norm": 1.812525749206543, + "learning_rate": 2.9578847512615182e-05, + "loss": 5.9151, + "step": 4578 + }, + { + "epoch": 0.4415621986499518, + "grad_norm": 1.7554272413253784, + "learning_rate": 2.9571401665512488e-05, + "loss": 5.8628, + "step": 4579 + }, + { + "epoch": 0.4416586306653809, + "grad_norm": 3.144944190979004, + "learning_rate": 2.956395539885218e-05, + "loss": 5.1797, + "step": 4580 + }, + { + "epoch": 0.44175506268081005, + "grad_norm": 2.2045302391052246, + "learning_rate": 2.9556508713317656e-05, + "loss": 5.7353, + "step": 4581 + }, + { + "epoch": 0.44185149469623913, + "grad_norm": 2.0153114795684814, + "learning_rate": 2.9549061609592378e-05, + "loss": 5.8583, + "step": 4582 + }, + { + "epoch": 0.44194792671166827, + "grad_norm": 1.9690853357315063, + "learning_rate": 2.9541614088359824e-05, + "loss": 5.8866, + "step": 4583 + }, + { + "epoch": 0.4420443587270974, + "grad_norm": 1.9340693950653076, + "learning_rate": 2.9534166150303516e-05, + "loss": 5.8405, + "step": 4584 + }, + { + "epoch": 0.44214079074252655, + "grad_norm": 1.557584524154663, + "learning_rate": 2.952671779610702e-05, + "loss": 5.9691, + "step": 4585 + }, + { + "epoch": 0.44223722275795563, + "grad_norm": 2.0061187744140625, + "learning_rate": 2.9519269026453932e-05, + "loss": 5.729, + "step": 4586 + }, + { + "epoch": 0.44233365477338477, + "grad_norm": 2.6512608528137207, + "learning_rate": 2.9511819842027904e-05, + "loss": 5.956, + "step": 4587 + }, + { + "epoch": 0.4424300867888139, + "grad_norm": 1.621010184288025, + "learning_rate": 2.9504370243512604e-05, + "loss": 5.8866, + "step": 4588 + }, + { + "epoch": 0.442526518804243, + "grad_norm": 1.5499478578567505, + "learning_rate": 2.9496920231591734e-05, + "loss": 5.7306, + "step": 4589 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 2.1905367374420166, + "learning_rate": 2.9489469806949076e-05, + "loss": 5.7995, + "step": 4590 + }, + { + "epoch": 0.44271938283510126, + "grad_norm": 2.082796096801758, + "learning_rate": 2.9482018970268393e-05, + "loss": 5.8047, + "step": 4591 + }, + { + "epoch": 0.4428158148505304, + "grad_norm": 2.158353328704834, + "learning_rate": 2.9474567722233532e-05, + "loss": 5.8851, + "step": 4592 + }, + { + "epoch": 0.4429122468659595, + "grad_norm": 2.4962551593780518, + "learning_rate": 2.946711606352835e-05, + "loss": 5.9931, + "step": 4593 + }, + { + "epoch": 0.4430086788813886, + "grad_norm": 2.0054197311401367, + "learning_rate": 2.9459663994836755e-05, + "loss": 5.8179, + "step": 4594 + }, + { + "epoch": 0.44310511089681776, + "grad_norm": 2.411684036254883, + "learning_rate": 2.9452211516842688e-05, + "loss": 5.7011, + "step": 4595 + }, + { + "epoch": 0.4432015429122469, + "grad_norm": 2.901601791381836, + "learning_rate": 2.9444758630230123e-05, + "loss": 5.9642, + "step": 4596 + }, + { + "epoch": 0.443297974927676, + "grad_norm": 1.8848720788955688, + "learning_rate": 2.9437305335683084e-05, + "loss": 5.9135, + "step": 4597 + }, + { + "epoch": 0.4433944069431051, + "grad_norm": 1.7850160598754883, + "learning_rate": 2.942985163388563e-05, + "loss": 5.8285, + "step": 4598 + }, + { + "epoch": 0.44349083895853425, + "grad_norm": 2.0231168270111084, + "learning_rate": 2.942239752552184e-05, + "loss": 5.893, + "step": 4599 + }, + { + "epoch": 0.44358727097396333, + "grad_norm": 1.8281818628311157, + "learning_rate": 2.941494301127584e-05, + "loss": 5.8283, + "step": 4600 + }, + { + "epoch": 0.44368370298939247, + "grad_norm": 1.611522912979126, + "learning_rate": 2.9407488091831815e-05, + "loss": 5.8434, + "step": 4601 + }, + { + "epoch": 0.4437801350048216, + "grad_norm": 2.1444191932678223, + "learning_rate": 2.940003276787395e-05, + "loss": 5.8, + "step": 4602 + }, + { + "epoch": 0.44387656702025075, + "grad_norm": 1.6226991415023804, + "learning_rate": 2.93925770400865e-05, + "loss": 5.9021, + "step": 4603 + }, + { + "epoch": 0.4439729990356798, + "grad_norm": 2.200347661972046, + "learning_rate": 2.938512090915373e-05, + "loss": 5.8334, + "step": 4604 + }, + { + "epoch": 0.44406943105110896, + "grad_norm": 2.1070663928985596, + "learning_rate": 2.9377664375759966e-05, + "loss": 5.7324, + "step": 4605 + }, + { + "epoch": 0.4441658630665381, + "grad_norm": 1.3590587377548218, + "learning_rate": 2.9370207440589552e-05, + "loss": 5.7334, + "step": 4606 + }, + { + "epoch": 0.44426229508196724, + "grad_norm": 2.2486162185668945, + "learning_rate": 2.9362750104326876e-05, + "loss": 5.9336, + "step": 4607 + }, + { + "epoch": 0.4443587270973963, + "grad_norm": 1.9003005027770996, + "learning_rate": 2.9355292367656363e-05, + "loss": 5.8807, + "step": 4608 + }, + { + "epoch": 0.44445515911282546, + "grad_norm": 2.282255172729492, + "learning_rate": 2.9347834231262482e-05, + "loss": 5.706, + "step": 4609 + }, + { + "epoch": 0.4445515911282546, + "grad_norm": 2.7261857986450195, + "learning_rate": 2.9340375695829725e-05, + "loss": 5.6991, + "step": 4610 + }, + { + "epoch": 0.4446480231436837, + "grad_norm": 1.9450734853744507, + "learning_rate": 2.933291676204263e-05, + "loss": 5.8051, + "step": 4611 + }, + { + "epoch": 0.4447444551591128, + "grad_norm": 3.0124855041503906, + "learning_rate": 2.9325457430585768e-05, + "loss": 5.9329, + "step": 4612 + }, + { + "epoch": 0.44484088717454195, + "grad_norm": 3.3660237789154053, + "learning_rate": 2.9317997702143745e-05, + "loss": 5.8734, + "step": 4613 + }, + { + "epoch": 0.4449373191899711, + "grad_norm": 2.5709264278411865, + "learning_rate": 2.9310537577401202e-05, + "loss": 5.8382, + "step": 4614 + }, + { + "epoch": 0.4450337512054002, + "grad_norm": 2.5703423023223877, + "learning_rate": 2.9303077057042832e-05, + "loss": 5.8654, + "step": 4615 + }, + { + "epoch": 0.4451301832208293, + "grad_norm": 3.301961898803711, + "learning_rate": 2.9295616141753347e-05, + "loss": 5.7791, + "step": 4616 + }, + { + "epoch": 0.44522661523625845, + "grad_norm": 2.331519603729248, + "learning_rate": 2.9288154832217497e-05, + "loss": 5.8383, + "step": 4617 + }, + { + "epoch": 0.4453230472516876, + "grad_norm": 1.8945338726043701, + "learning_rate": 2.9280693129120073e-05, + "loss": 5.8747, + "step": 4618 + }, + { + "epoch": 0.44541947926711667, + "grad_norm": 2.275346517562866, + "learning_rate": 2.9273231033145905e-05, + "loss": 5.8979, + "step": 4619 + }, + { + "epoch": 0.4455159112825458, + "grad_norm": 2.1735920906066895, + "learning_rate": 2.926576854497985e-05, + "loss": 5.9876, + "step": 4620 + }, + { + "epoch": 0.44561234329797494, + "grad_norm": 1.8182151317596436, + "learning_rate": 2.9258305665306807e-05, + "loss": 5.9873, + "step": 4621 + }, + { + "epoch": 0.445708775313404, + "grad_norm": 1.7091598510742188, + "learning_rate": 2.9250842394811712e-05, + "loss": 6.0082, + "step": 4622 + }, + { + "epoch": 0.44580520732883316, + "grad_norm": 1.730838656425476, + "learning_rate": 2.9243378734179537e-05, + "loss": 5.9142, + "step": 4623 + }, + { + "epoch": 0.4459016393442623, + "grad_norm": 1.8392918109893799, + "learning_rate": 2.9235914684095285e-05, + "loss": 5.8525, + "step": 4624 + }, + { + "epoch": 0.44599807135969144, + "grad_norm": 2.6786978244781494, + "learning_rate": 2.9228450245243993e-05, + "loss": 5.937, + "step": 4625 + }, + { + "epoch": 0.4460945033751205, + "grad_norm": 3.006803512573242, + "learning_rate": 2.9220985418310744e-05, + "loss": 5.8281, + "step": 4626 + }, + { + "epoch": 0.44619093539054966, + "grad_norm": 2.444725513458252, + "learning_rate": 2.921352020398065e-05, + "loss": 5.8882, + "step": 4627 + }, + { + "epoch": 0.4462873674059788, + "grad_norm": 2.4853272438049316, + "learning_rate": 2.9206054602938854e-05, + "loss": 5.8367, + "step": 4628 + }, + { + "epoch": 0.44638379942140793, + "grad_norm": 2.354816436767578, + "learning_rate": 2.9198588615870543e-05, + "loss": 5.8074, + "step": 4629 + }, + { + "epoch": 0.446480231436837, + "grad_norm": 2.8387019634246826, + "learning_rate": 2.9191122243460938e-05, + "loss": 5.7781, + "step": 4630 + }, + { + "epoch": 0.44657666345226615, + "grad_norm": 4.749989986419678, + "learning_rate": 2.91836554863953e-05, + "loss": 5.5711, + "step": 4631 + }, + { + "epoch": 0.4466730954676953, + "grad_norm": 2.958576202392578, + "learning_rate": 2.91761883453589e-05, + "loss": 5.5998, + "step": 4632 + }, + { + "epoch": 0.4467695274831244, + "grad_norm": 3.591557502746582, + "learning_rate": 2.9168720821037078e-05, + "loss": 5.4626, + "step": 4633 + }, + { + "epoch": 0.4468659594985535, + "grad_norm": 5.253914833068848, + "learning_rate": 2.916125291411519e-05, + "loss": 5.4952, + "step": 4634 + }, + { + "epoch": 0.44696239151398265, + "grad_norm": 4.399539947509766, + "learning_rate": 2.915378462527863e-05, + "loss": 5.7808, + "step": 4635 + }, + { + "epoch": 0.4470588235294118, + "grad_norm": 3.66593599319458, + "learning_rate": 2.9146315955212828e-05, + "loss": 5.5274, + "step": 4636 + }, + { + "epoch": 0.44715525554484087, + "grad_norm": 2.3668367862701416, + "learning_rate": 2.913884690460325e-05, + "loss": 5.8834, + "step": 4637 + }, + { + "epoch": 0.44725168756027, + "grad_norm": 2.5268101692199707, + "learning_rate": 2.9131377474135407e-05, + "loss": 5.8974, + "step": 4638 + }, + { + "epoch": 0.44734811957569914, + "grad_norm": 2.7343485355377197, + "learning_rate": 2.9123907664494816e-05, + "loss": 5.804, + "step": 4639 + }, + { + "epoch": 0.4474445515911283, + "grad_norm": 2.2282440662384033, + "learning_rate": 2.9116437476367053e-05, + "loss": 5.891, + "step": 4640 + }, + { + "epoch": 0.44754098360655736, + "grad_norm": 2.5252158641815186, + "learning_rate": 2.910896691043773e-05, + "loss": 5.8654, + "step": 4641 + }, + { + "epoch": 0.4476374156219865, + "grad_norm": 2.4338667392730713, + "learning_rate": 2.910149596739248e-05, + "loss": 5.8249, + "step": 4642 + }, + { + "epoch": 0.44773384763741564, + "grad_norm": 1.7756558656692505, + "learning_rate": 2.9094024647916974e-05, + "loss": 5.8166, + "step": 4643 + }, + { + "epoch": 0.4478302796528447, + "grad_norm": 2.1830875873565674, + "learning_rate": 2.908655295269693e-05, + "loss": 5.8103, + "step": 4644 + }, + { + "epoch": 0.44792671166827386, + "grad_norm": 1.8099989891052246, + "learning_rate": 2.9079080882418087e-05, + "loss": 5.9353, + "step": 4645 + }, + { + "epoch": 0.448023143683703, + "grad_norm": 1.8860974311828613, + "learning_rate": 2.9071608437766214e-05, + "loss": 5.8505, + "step": 4646 + }, + { + "epoch": 0.44811957569913213, + "grad_norm": 1.8058443069458008, + "learning_rate": 2.9064135619427135e-05, + "loss": 6.0086, + "step": 4647 + }, + { + "epoch": 0.4482160077145612, + "grad_norm": 1.808807134628296, + "learning_rate": 2.905666242808669e-05, + "loss": 5.7846, + "step": 4648 + }, + { + "epoch": 0.44831243972999035, + "grad_norm": 1.9766637086868286, + "learning_rate": 2.904918886443076e-05, + "loss": 5.8146, + "step": 4649 + }, + { + "epoch": 0.4484088717454195, + "grad_norm": 1.8138622045516968, + "learning_rate": 2.9041714929145258e-05, + "loss": 5.8587, + "step": 4650 + }, + { + "epoch": 0.44850530376084863, + "grad_norm": 2.0055594444274902, + "learning_rate": 2.9034240622916137e-05, + "loss": 5.7351, + "step": 4651 + }, + { + "epoch": 0.4486017357762777, + "grad_norm": 2.292171001434326, + "learning_rate": 2.902676594642938e-05, + "loss": 5.9399, + "step": 4652 + }, + { + "epoch": 0.44869816779170685, + "grad_norm": 2.1090292930603027, + "learning_rate": 2.9019290900371e-05, + "loss": 5.763, + "step": 4653 + }, + { + "epoch": 0.448794599807136, + "grad_norm": 3.6754508018493652, + "learning_rate": 2.901181548542704e-05, + "loss": 4.4218, + "step": 4654 + }, + { + "epoch": 0.44889103182256507, + "grad_norm": 3.418210506439209, + "learning_rate": 2.90043397022836e-05, + "loss": 5.8386, + "step": 4655 + }, + { + "epoch": 0.4489874638379942, + "grad_norm": 4.154155731201172, + "learning_rate": 2.899686355162679e-05, + "loss": 5.8079, + "step": 4656 + }, + { + "epoch": 0.44908389585342334, + "grad_norm": 2.5191256999969482, + "learning_rate": 2.898938703414276e-05, + "loss": 5.9263, + "step": 4657 + }, + { + "epoch": 0.4491803278688525, + "grad_norm": 2.0748422145843506, + "learning_rate": 2.89819101505177e-05, + "loss": 5.5448, + "step": 4658 + }, + { + "epoch": 0.44927675988428156, + "grad_norm": 2.193122625350952, + "learning_rate": 2.8974432901437825e-05, + "loss": 5.8885, + "step": 4659 + }, + { + "epoch": 0.4493731918997107, + "grad_norm": 1.9791052341461182, + "learning_rate": 2.89669552875894e-05, + "loss": 5.8766, + "step": 4660 + }, + { + "epoch": 0.44946962391513984, + "grad_norm": 2.3131604194641113, + "learning_rate": 2.8959477309658688e-05, + "loss": 5.7297, + "step": 4661 + }, + { + "epoch": 0.449566055930569, + "grad_norm": 2.6877224445343018, + "learning_rate": 2.8951998968332024e-05, + "loss": 5.8748, + "step": 4662 + }, + { + "epoch": 0.44966248794599806, + "grad_norm": 2.709595203399658, + "learning_rate": 2.894452026429576e-05, + "loss": 5.9538, + "step": 4663 + }, + { + "epoch": 0.4497589199614272, + "grad_norm": 2.0835022926330566, + "learning_rate": 2.893704119823628e-05, + "loss": 5.8246, + "step": 4664 + }, + { + "epoch": 0.44985535197685633, + "grad_norm": 2.407561779022217, + "learning_rate": 2.8929561770840008e-05, + "loss": 5.8343, + "step": 4665 + }, + { + "epoch": 0.4499517839922854, + "grad_norm": 2.4021382331848145, + "learning_rate": 2.8922081982793393e-05, + "loss": 5.8706, + "step": 4666 + }, + { + "epoch": 0.45004821600771455, + "grad_norm": 1.9559773206710815, + "learning_rate": 2.891460183478292e-05, + "loss": 5.8576, + "step": 4667 + }, + { + "epoch": 0.4501446480231437, + "grad_norm": 1.7836878299713135, + "learning_rate": 2.890712132749511e-05, + "loss": 5.857, + "step": 4668 + }, + { + "epoch": 0.4502410800385728, + "grad_norm": 2.1524600982666016, + "learning_rate": 2.8899640461616513e-05, + "loss": 5.6877, + "step": 4669 + }, + { + "epoch": 0.4503375120540019, + "grad_norm": 2.0902535915374756, + "learning_rate": 2.889215923783371e-05, + "loss": 5.8778, + "step": 4670 + }, + { + "epoch": 0.45043394406943105, + "grad_norm": 2.2855844497680664, + "learning_rate": 2.888467765683333e-05, + "loss": 5.7908, + "step": 4671 + }, + { + "epoch": 0.4505303760848602, + "grad_norm": 1.7070457935333252, + "learning_rate": 2.8877195719302007e-05, + "loss": 5.8513, + "step": 4672 + }, + { + "epoch": 0.4506268081002893, + "grad_norm": 1.7732973098754883, + "learning_rate": 2.8869713425926443e-05, + "loss": 5.7406, + "step": 4673 + }, + { + "epoch": 0.4507232401157184, + "grad_norm": 2.1472666263580322, + "learning_rate": 2.8862230777393336e-05, + "loss": 5.813, + "step": 4674 + }, + { + "epoch": 0.45081967213114754, + "grad_norm": 2.1760098934173584, + "learning_rate": 2.8854747774389445e-05, + "loss": 5.8166, + "step": 4675 + }, + { + "epoch": 0.4509161041465767, + "grad_norm": 1.5988688468933105, + "learning_rate": 2.884726441760155e-05, + "loss": 5.9075, + "step": 4676 + }, + { + "epoch": 0.45101253616200576, + "grad_norm": 1.812314748764038, + "learning_rate": 2.8839780707716458e-05, + "loss": 5.8995, + "step": 4677 + }, + { + "epoch": 0.4511089681774349, + "grad_norm": 2.7489213943481445, + "learning_rate": 2.883229664542102e-05, + "loss": 5.7399, + "step": 4678 + }, + { + "epoch": 0.45120540019286404, + "grad_norm": 2.326446771621704, + "learning_rate": 2.8824812231402115e-05, + "loss": 5.909, + "step": 4679 + }, + { + "epoch": 0.4513018322082932, + "grad_norm": 1.5426793098449707, + "learning_rate": 2.8817327466346644e-05, + "loss": 5.6632, + "step": 4680 + }, + { + "epoch": 0.45139826422372226, + "grad_norm": 2.1868057250976562, + "learning_rate": 2.8809842350941564e-05, + "loss": 5.9367, + "step": 4681 + }, + { + "epoch": 0.4514946962391514, + "grad_norm": 2.278282642364502, + "learning_rate": 2.8802356885873836e-05, + "loss": 5.8734, + "step": 4682 + }, + { + "epoch": 0.45159112825458053, + "grad_norm": 1.7848446369171143, + "learning_rate": 2.8794871071830476e-05, + "loss": 5.8076, + "step": 4683 + }, + { + "epoch": 0.45168756027000967, + "grad_norm": 1.757720947265625, + "learning_rate": 2.878738490949852e-05, + "loss": 5.7531, + "step": 4684 + }, + { + "epoch": 0.45178399228543875, + "grad_norm": 2.274102210998535, + "learning_rate": 2.8779898399565035e-05, + "loss": 5.336, + "step": 4685 + }, + { + "epoch": 0.4518804243008679, + "grad_norm": 2.345217227935791, + "learning_rate": 2.8772411542717133e-05, + "loss": 5.3347, + "step": 4686 + }, + { + "epoch": 0.451976856316297, + "grad_norm": 2.770570755004883, + "learning_rate": 2.8764924339641935e-05, + "loss": 5.3235, + "step": 4687 + }, + { + "epoch": 0.4520732883317261, + "grad_norm": 2.2423970699310303, + "learning_rate": 2.875743679102662e-05, + "loss": 5.6339, + "step": 4688 + }, + { + "epoch": 0.45216972034715525, + "grad_norm": 3.097867727279663, + "learning_rate": 2.8749948897558387e-05, + "loss": 5.938, + "step": 4689 + }, + { + "epoch": 0.4522661523625844, + "grad_norm": 2.187344789505005, + "learning_rate": 2.8742460659924452e-05, + "loss": 5.7719, + "step": 4690 + }, + { + "epoch": 0.4523625843780135, + "grad_norm": 2.7780544757843018, + "learning_rate": 2.8734972078812084e-05, + "loss": 5.837, + "step": 4691 + }, + { + "epoch": 0.4524590163934426, + "grad_norm": 2.0603082180023193, + "learning_rate": 2.8727483154908573e-05, + "loss": 5.7868, + "step": 4692 + }, + { + "epoch": 0.45255544840887174, + "grad_norm": 1.927586555480957, + "learning_rate": 2.8719993888901258e-05, + "loss": 5.9425, + "step": 4693 + }, + { + "epoch": 0.4526518804243009, + "grad_norm": 2.107124090194702, + "learning_rate": 2.871250428147747e-05, + "loss": 5.8865, + "step": 4694 + }, + { + "epoch": 0.45274831243973, + "grad_norm": 2.0062255859375, + "learning_rate": 2.8705014333324614e-05, + "loss": 5.8212, + "step": 4695 + }, + { + "epoch": 0.4528447444551591, + "grad_norm": 2.2701683044433594, + "learning_rate": 2.8697524045130098e-05, + "loss": 5.7963, + "step": 4696 + }, + { + "epoch": 0.45294117647058824, + "grad_norm": 2.571741819381714, + "learning_rate": 2.8690033417581387e-05, + "loss": 5.7326, + "step": 4697 + }, + { + "epoch": 0.4530376084860174, + "grad_norm": 2.484849452972412, + "learning_rate": 2.868254245136594e-05, + "loss": 5.91, + "step": 4698 + }, + { + "epoch": 0.45313404050144646, + "grad_norm": 3.2133710384368896, + "learning_rate": 2.8675051147171282e-05, + "loss": 5.6334, + "step": 4699 + }, + { + "epoch": 0.4532304725168756, + "grad_norm": 3.2327818870544434, + "learning_rate": 2.866755950568496e-05, + "loss": 5.7764, + "step": 4700 + }, + { + "epoch": 0.45332690453230473, + "grad_norm": 2.0023624897003174, + "learning_rate": 2.8660067527594532e-05, + "loss": 5.8252, + "step": 4701 + }, + { + "epoch": 0.45342333654773387, + "grad_norm": 2.321347713470459, + "learning_rate": 2.8652575213587614e-05, + "loss": 5.9673, + "step": 4702 + }, + { + "epoch": 0.45351976856316295, + "grad_norm": 2.9234094619750977, + "learning_rate": 2.8645082564351834e-05, + "loss": 5.8398, + "step": 4703 + }, + { + "epoch": 0.4536162005785921, + "grad_norm": 2.141359567642212, + "learning_rate": 2.863758958057487e-05, + "loss": 5.5788, + "step": 4704 + }, + { + "epoch": 0.4537126325940212, + "grad_norm": 1.7428804636001587, + "learning_rate": 2.8630096262944405e-05, + "loss": 5.6335, + "step": 4705 + }, + { + "epoch": 0.45380906460945036, + "grad_norm": 1.9988096952438354, + "learning_rate": 2.8622602612148175e-05, + "loss": 5.6776, + "step": 4706 + }, + { + "epoch": 0.45390549662487945, + "grad_norm": 3.7731900215148926, + "learning_rate": 2.861510862887393e-05, + "loss": 4.8111, + "step": 4707 + }, + { + "epoch": 0.4540019286403086, + "grad_norm": 2.595158576965332, + "learning_rate": 2.8607614313809473e-05, + "loss": 4.7296, + "step": 4708 + }, + { + "epoch": 0.4540983606557377, + "grad_norm": 2.2221720218658447, + "learning_rate": 2.860011966764261e-05, + "loss": 4.6495, + "step": 4709 + }, + { + "epoch": 0.4541947926711668, + "grad_norm": 2.7087152004241943, + "learning_rate": 2.85926246910612e-05, + "loss": 5.1068, + "step": 4710 + }, + { + "epoch": 0.45429122468659594, + "grad_norm": 3.0588295459747314, + "learning_rate": 2.8585129384753116e-05, + "loss": 5.8735, + "step": 4711 + }, + { + "epoch": 0.4543876567020251, + "grad_norm": 2.413994789123535, + "learning_rate": 2.857763374940627e-05, + "loss": 5.8736, + "step": 4712 + }, + { + "epoch": 0.4544840887174542, + "grad_norm": 2.269702434539795, + "learning_rate": 2.85701377857086e-05, + "loss": 5.8131, + "step": 4713 + }, + { + "epoch": 0.4545805207328833, + "grad_norm": 2.3553318977355957, + "learning_rate": 2.8562641494348075e-05, + "loss": 5.8318, + "step": 4714 + }, + { + "epoch": 0.45467695274831244, + "grad_norm": 2.4416897296905518, + "learning_rate": 2.8555144876012706e-05, + "loss": 5.8268, + "step": 4715 + }, + { + "epoch": 0.4547733847637416, + "grad_norm": 2.1913602352142334, + "learning_rate": 2.8547647931390513e-05, + "loss": 5.8426, + "step": 4716 + }, + { + "epoch": 0.4548698167791707, + "grad_norm": 3.5880849361419678, + "learning_rate": 2.8540150661169556e-05, + "loss": 5.7582, + "step": 4717 + }, + { + "epoch": 0.4549662487945998, + "grad_norm": 3.056279420852661, + "learning_rate": 2.8532653066037928e-05, + "loss": 5.7436, + "step": 4718 + }, + { + "epoch": 0.45506268081002893, + "grad_norm": 2.055722951889038, + "learning_rate": 2.8525155146683758e-05, + "loss": 5.4999, + "step": 4719 + }, + { + "epoch": 0.45515911282545807, + "grad_norm": 2.255115032196045, + "learning_rate": 2.8517656903795182e-05, + "loss": 5.7916, + "step": 4720 + }, + { + "epoch": 0.45525554484088715, + "grad_norm": 3.080104112625122, + "learning_rate": 2.8510158338060382e-05, + "loss": 5.8602, + "step": 4721 + }, + { + "epoch": 0.4553519768563163, + "grad_norm": 2.4441797733306885, + "learning_rate": 2.8502659450167578e-05, + "loss": 5.7003, + "step": 4722 + }, + { + "epoch": 0.4554484088717454, + "grad_norm": 1.9865018129348755, + "learning_rate": 2.8495160240804997e-05, + "loss": 5.7173, + "step": 4723 + }, + { + "epoch": 0.45554484088717456, + "grad_norm": 1.663092017173767, + "learning_rate": 2.8487660710660906e-05, + "loss": 5.6933, + "step": 4724 + }, + { + "epoch": 0.45564127290260364, + "grad_norm": 1.6325337886810303, + "learning_rate": 2.848016086042361e-05, + "loss": 5.6638, + "step": 4725 + }, + { + "epoch": 0.4557377049180328, + "grad_norm": 1.6470849514007568, + "learning_rate": 2.847266069078144e-05, + "loss": 5.6794, + "step": 4726 + }, + { + "epoch": 0.4558341369334619, + "grad_norm": 1.3176090717315674, + "learning_rate": 2.8465160202422735e-05, + "loss": 5.8012, + "step": 4727 + }, + { + "epoch": 0.45593056894889106, + "grad_norm": 1.7954401969909668, + "learning_rate": 2.84576593960359e-05, + "loss": 5.7823, + "step": 4728 + }, + { + "epoch": 0.45602700096432014, + "grad_norm": 1.4240567684173584, + "learning_rate": 2.8450158272309342e-05, + "loss": 5.6891, + "step": 4729 + }, + { + "epoch": 0.4561234329797493, + "grad_norm": 1.722692847251892, + "learning_rate": 2.8442656831931496e-05, + "loss": 5.7053, + "step": 4730 + }, + { + "epoch": 0.4562198649951784, + "grad_norm": 1.8091626167297363, + "learning_rate": 2.843515507559085e-05, + "loss": 5.7828, + "step": 4731 + }, + { + "epoch": 0.4563162970106075, + "grad_norm": 1.2625105381011963, + "learning_rate": 2.8427653003975895e-05, + "loss": 5.7557, + "step": 4732 + }, + { + "epoch": 0.45641272902603663, + "grad_norm": 2.7441179752349854, + "learning_rate": 2.842015061777517e-05, + "loss": 6.0414, + "step": 4733 + }, + { + "epoch": 0.45650916104146577, + "grad_norm": 2.6744043827056885, + "learning_rate": 2.8412647917677228e-05, + "loss": 5.7925, + "step": 4734 + }, + { + "epoch": 0.4566055930568949, + "grad_norm": 2.0178139209747314, + "learning_rate": 2.840514490437066e-05, + "loss": 5.8252, + "step": 4735 + }, + { + "epoch": 0.456702025072324, + "grad_norm": 2.1815876960754395, + "learning_rate": 2.839764157854409e-05, + "loss": 5.767, + "step": 4736 + }, + { + "epoch": 0.45679845708775313, + "grad_norm": 2.848132848739624, + "learning_rate": 2.8390137940886152e-05, + "loss": 5.7125, + "step": 4737 + }, + { + "epoch": 0.45689488910318227, + "grad_norm": 1.7868553400039673, + "learning_rate": 2.838263399208553e-05, + "loss": 5.8159, + "step": 4738 + }, + { + "epoch": 0.4569913211186114, + "grad_norm": 2.509906530380249, + "learning_rate": 2.837512973283092e-05, + "loss": 5.7606, + "step": 4739 + }, + { + "epoch": 0.4570877531340405, + "grad_norm": 2.0475308895111084, + "learning_rate": 2.836762516381105e-05, + "loss": 5.7708, + "step": 4740 + }, + { + "epoch": 0.4571841851494696, + "grad_norm": 2.0303306579589844, + "learning_rate": 2.8360120285714704e-05, + "loss": 5.9593, + "step": 4741 + }, + { + "epoch": 0.45728061716489876, + "grad_norm": 2.005223512649536, + "learning_rate": 2.8352615099230646e-05, + "loss": 5.6915, + "step": 4742 + }, + { + "epoch": 0.45737704918032784, + "grad_norm": 1.7376737594604492, + "learning_rate": 2.834510960504769e-05, + "loss": 5.9715, + "step": 4743 + }, + { + "epoch": 0.457473481195757, + "grad_norm": 1.86961829662323, + "learning_rate": 2.8337603803854714e-05, + "loss": 5.9042, + "step": 4744 + }, + { + "epoch": 0.4575699132111861, + "grad_norm": 1.934991717338562, + "learning_rate": 2.833009769634055e-05, + "loss": 5.8863, + "step": 4745 + }, + { + "epoch": 0.45766634522661526, + "grad_norm": 1.240513801574707, + "learning_rate": 2.8322591283194127e-05, + "loss": 5.7822, + "step": 4746 + }, + { + "epoch": 0.45776277724204434, + "grad_norm": 1.5076336860656738, + "learning_rate": 2.8315084565104362e-05, + "loss": 5.8202, + "step": 4747 + }, + { + "epoch": 0.4578592092574735, + "grad_norm": 1.432185411453247, + "learning_rate": 2.830757754276021e-05, + "loss": 5.7929, + "step": 4748 + }, + { + "epoch": 0.4579556412729026, + "grad_norm": 1.8941324949264526, + "learning_rate": 2.830007021685067e-05, + "loss": 5.8564, + "step": 4749 + }, + { + "epoch": 0.45805207328833175, + "grad_norm": 1.6765645742416382, + "learning_rate": 2.8292562588064747e-05, + "loss": 5.8138, + "step": 4750 + }, + { + "epoch": 0.45814850530376083, + "grad_norm": 1.5499763488769531, + "learning_rate": 2.828505465709148e-05, + "loss": 5.7584, + "step": 4751 + }, + { + "epoch": 0.45824493731918997, + "grad_norm": 1.9114488363265991, + "learning_rate": 2.8277546424619945e-05, + "loss": 5.8203, + "step": 4752 + }, + { + "epoch": 0.4583413693346191, + "grad_norm": 2.265543222427368, + "learning_rate": 2.8270037891339228e-05, + "loss": 5.7127, + "step": 4753 + }, + { + "epoch": 0.4584378013500482, + "grad_norm": 2.3316962718963623, + "learning_rate": 2.8262529057938458e-05, + "loss": 5.8621, + "step": 4754 + }, + { + "epoch": 0.45853423336547733, + "grad_norm": 1.540927767753601, + "learning_rate": 2.825501992510679e-05, + "loss": 5.706, + "step": 4755 + }, + { + "epoch": 0.45863066538090647, + "grad_norm": 1.6489063501358032, + "learning_rate": 2.8247510493533397e-05, + "loss": 5.6929, + "step": 4756 + }, + { + "epoch": 0.4587270973963356, + "grad_norm": 2.022960901260376, + "learning_rate": 2.8240000763907486e-05, + "loss": 5.9167, + "step": 4757 + }, + { + "epoch": 0.4588235294117647, + "grad_norm": 1.8711786270141602, + "learning_rate": 2.8232490736918292e-05, + "loss": 5.448, + "step": 4758 + }, + { + "epoch": 0.4589199614271938, + "grad_norm": 2.3429031372070312, + "learning_rate": 2.8224980413255086e-05, + "loss": 5.7245, + "step": 4759 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 2.719675302505493, + "learning_rate": 2.8217469793607144e-05, + "loss": 5.7554, + "step": 4760 + }, + { + "epoch": 0.4591128254580521, + "grad_norm": 1.8104208707809448, + "learning_rate": 2.8209958878663778e-05, + "loss": 5.6574, + "step": 4761 + }, + { + "epoch": 0.4592092574734812, + "grad_norm": 2.163839817047119, + "learning_rate": 2.8202447669114335e-05, + "loss": 5.8008, + "step": 4762 + }, + { + "epoch": 0.4593056894889103, + "grad_norm": 1.907239317893982, + "learning_rate": 2.81949361656482e-05, + "loss": 5.8751, + "step": 4763 + }, + { + "epoch": 0.45940212150433946, + "grad_norm": 1.7337584495544434, + "learning_rate": 2.8187424368954744e-05, + "loss": 5.7098, + "step": 4764 + }, + { + "epoch": 0.45949855351976854, + "grad_norm": 2.065861463546753, + "learning_rate": 2.8179912279723402e-05, + "loss": 5.8103, + "step": 4765 + }, + { + "epoch": 0.4595949855351977, + "grad_norm": 1.8224432468414307, + "learning_rate": 2.8172399898643636e-05, + "loss": 5.7706, + "step": 4766 + }, + { + "epoch": 0.4596914175506268, + "grad_norm": 2.2831220626831055, + "learning_rate": 2.81648872264049e-05, + "loss": 5.7757, + "step": 4767 + }, + { + "epoch": 0.45978784956605595, + "grad_norm": 2.7121479511260986, + "learning_rate": 2.8157374263696716e-05, + "loss": 5.8029, + "step": 4768 + }, + { + "epoch": 0.45988428158148503, + "grad_norm": 1.4833263158798218, + "learning_rate": 2.8149861011208607e-05, + "loss": 5.6618, + "step": 4769 + }, + { + "epoch": 0.45998071359691417, + "grad_norm": 2.1180760860443115, + "learning_rate": 2.8142347469630125e-05, + "loss": 5.8887, + "step": 4770 + }, + { + "epoch": 0.4600771456123433, + "grad_norm": 2.1869094371795654, + "learning_rate": 2.813483363965087e-05, + "loss": 5.6396, + "step": 4771 + }, + { + "epoch": 0.46017357762777245, + "grad_norm": 1.9953789710998535, + "learning_rate": 2.812731952196043e-05, + "loss": 5.8719, + "step": 4772 + }, + { + "epoch": 0.4602700096432015, + "grad_norm": 2.2557051181793213, + "learning_rate": 2.8119805117248464e-05, + "loss": 5.7484, + "step": 4773 + }, + { + "epoch": 0.46036644165863067, + "grad_norm": 2.248455762863159, + "learning_rate": 2.8112290426204625e-05, + "loss": 5.8601, + "step": 4774 + }, + { + "epoch": 0.4604628736740598, + "grad_norm": 2.2678310871124268, + "learning_rate": 2.8104775449518593e-05, + "loss": 5.8342, + "step": 4775 + }, + { + "epoch": 0.4605593056894889, + "grad_norm": 1.8909990787506104, + "learning_rate": 2.8097260187880097e-05, + "loss": 5.845, + "step": 4776 + }, + { + "epoch": 0.460655737704918, + "grad_norm": 1.3679527044296265, + "learning_rate": 2.8089744641978878e-05, + "loss": 5.8639, + "step": 4777 + }, + { + "epoch": 0.46075216972034716, + "grad_norm": 2.4141883850097656, + "learning_rate": 2.808222881250469e-05, + "loss": 5.6171, + "step": 4778 + }, + { + "epoch": 0.4608486017357763, + "grad_norm": 2.2877745628356934, + "learning_rate": 2.8074712700147338e-05, + "loss": 5.7937, + "step": 4779 + }, + { + "epoch": 0.4609450337512054, + "grad_norm": 1.771065592765808, + "learning_rate": 2.8067196305596636e-05, + "loss": 5.7659, + "step": 4780 + }, + { + "epoch": 0.4610414657666345, + "grad_norm": 2.4201972484588623, + "learning_rate": 2.805967962954244e-05, + "loss": 5.7895, + "step": 4781 + }, + { + "epoch": 0.46113789778206365, + "grad_norm": 2.2788779735565186, + "learning_rate": 2.8052162672674608e-05, + "loss": 5.7685, + "step": 4782 + }, + { + "epoch": 0.4612343297974928, + "grad_norm": 2.060575246810913, + "learning_rate": 2.8044645435683044e-05, + "loss": 5.6019, + "step": 4783 + }, + { + "epoch": 0.4613307618129219, + "grad_norm": 2.144623279571533, + "learning_rate": 2.8037127919257668e-05, + "loss": 5.8552, + "step": 4784 + }, + { + "epoch": 0.461427193828351, + "grad_norm": 1.9190665483474731, + "learning_rate": 2.802961012408843e-05, + "loss": 5.8216, + "step": 4785 + }, + { + "epoch": 0.46152362584378015, + "grad_norm": 1.7979888916015625, + "learning_rate": 2.80220920508653e-05, + "loss": 5.8977, + "step": 4786 + }, + { + "epoch": 0.46162005785920923, + "grad_norm": 2.122087240219116, + "learning_rate": 2.8014573700278286e-05, + "loss": 5.8413, + "step": 4787 + }, + { + "epoch": 0.46171648987463837, + "grad_norm": 1.8667973279953003, + "learning_rate": 2.8007055073017406e-05, + "loss": 5.774, + "step": 4788 + }, + { + "epoch": 0.4618129218900675, + "grad_norm": 1.526321291923523, + "learning_rate": 2.7999536169772718e-05, + "loss": 5.8484, + "step": 4789 + }, + { + "epoch": 0.46190935390549664, + "grad_norm": 1.3425339460372925, + "learning_rate": 2.7992016991234282e-05, + "loss": 5.8802, + "step": 4790 + }, + { + "epoch": 0.4620057859209257, + "grad_norm": 1.4589790105819702, + "learning_rate": 2.7984497538092213e-05, + "loss": 5.7115, + "step": 4791 + }, + { + "epoch": 0.46210221793635486, + "grad_norm": 1.4184534549713135, + "learning_rate": 2.7976977811036636e-05, + "loss": 5.7428, + "step": 4792 + }, + { + "epoch": 0.462198649951784, + "grad_norm": 1.5459994077682495, + "learning_rate": 2.79694578107577e-05, + "loss": 5.7071, + "step": 4793 + }, + { + "epoch": 0.46229508196721314, + "grad_norm": 1.5070366859436035, + "learning_rate": 2.7961937537945575e-05, + "loss": 5.8124, + "step": 4794 + }, + { + "epoch": 0.4623915139826422, + "grad_norm": 1.8703711032867432, + "learning_rate": 2.7954416993290474e-05, + "loss": 5.7963, + "step": 4795 + }, + { + "epoch": 0.46248794599807136, + "grad_norm": 1.7606383562088013, + "learning_rate": 2.794689617748262e-05, + "loss": 5.7999, + "step": 4796 + }, + { + "epoch": 0.4625843780135005, + "grad_norm": 1.733479619026184, + "learning_rate": 2.7939375091212256e-05, + "loss": 5.7997, + "step": 4797 + }, + { + "epoch": 0.4626808100289296, + "grad_norm": 2.0469532012939453, + "learning_rate": 2.793185373516966e-05, + "loss": 5.8008, + "step": 4798 + }, + { + "epoch": 0.4627772420443587, + "grad_norm": 1.6468122005462646, + "learning_rate": 2.792433211004515e-05, + "loss": 5.8402, + "step": 4799 + }, + { + "epoch": 0.46287367405978785, + "grad_norm": 2.4923951625823975, + "learning_rate": 2.7916810216529025e-05, + "loss": 5.7639, + "step": 4800 + }, + { + "epoch": 0.462970106075217, + "grad_norm": 2.452643394470215, + "learning_rate": 2.7909288055311656e-05, + "loss": 5.6472, + "step": 4801 + }, + { + "epoch": 0.4630665380906461, + "grad_norm": 1.6488170623779297, + "learning_rate": 2.790176562708341e-05, + "loss": 5.7757, + "step": 4802 + }, + { + "epoch": 0.4631629701060752, + "grad_norm": 2.3927478790283203, + "learning_rate": 2.7894242932534685e-05, + "loss": 5.727, + "step": 4803 + }, + { + "epoch": 0.46325940212150435, + "grad_norm": 1.7629001140594482, + "learning_rate": 2.7886719972355907e-05, + "loss": 5.7132, + "step": 4804 + }, + { + "epoch": 0.4633558341369335, + "grad_norm": 1.9718427658081055, + "learning_rate": 2.787919674723752e-05, + "loss": 5.6085, + "step": 4805 + }, + { + "epoch": 0.46345226615236257, + "grad_norm": 2.3805346488952637, + "learning_rate": 2.787167325787e-05, + "loss": 5.5804, + "step": 4806 + }, + { + "epoch": 0.4635486981677917, + "grad_norm": 1.6047688722610474, + "learning_rate": 2.786414950494385e-05, + "loss": 5.6266, + "step": 4807 + }, + { + "epoch": 0.46364513018322084, + "grad_norm": 1.7821903228759766, + "learning_rate": 2.7856625489149572e-05, + "loss": 5.7324, + "step": 4808 + }, + { + "epoch": 0.4637415621986499, + "grad_norm": 2.1285791397094727, + "learning_rate": 2.7849101211177725e-05, + "loss": 5.723, + "step": 4809 + }, + { + "epoch": 0.46383799421407906, + "grad_norm": 1.937894582748413, + "learning_rate": 2.7841576671718885e-05, + "loss": 5.6439, + "step": 4810 + }, + { + "epoch": 0.4639344262295082, + "grad_norm": 1.6487010717391968, + "learning_rate": 2.7834051871463625e-05, + "loss": 5.701, + "step": 4811 + }, + { + "epoch": 0.46403085824493734, + "grad_norm": 1.9541196823120117, + "learning_rate": 2.7826526811102575e-05, + "loss": 5.6964, + "step": 4812 + }, + { + "epoch": 0.4641272902603664, + "grad_norm": 1.3516870737075806, + "learning_rate": 2.7819001491326376e-05, + "loss": 5.6427, + "step": 4813 + }, + { + "epoch": 0.46422372227579556, + "grad_norm": 1.6618553400039673, + "learning_rate": 2.7811475912825684e-05, + "loss": 5.6226, + "step": 4814 + }, + { + "epoch": 0.4643201542912247, + "grad_norm": 1.9664322137832642, + "learning_rate": 2.7803950076291197e-05, + "loss": 5.7452, + "step": 4815 + }, + { + "epoch": 0.46441658630665383, + "grad_norm": 1.4131815433502197, + "learning_rate": 2.7796423982413623e-05, + "loss": 5.7678, + "step": 4816 + }, + { + "epoch": 0.4645130183220829, + "grad_norm": 1.6159921884536743, + "learning_rate": 2.778889763188369e-05, + "loss": 5.7632, + "step": 4817 + }, + { + "epoch": 0.46460945033751205, + "grad_norm": 1.8134403228759766, + "learning_rate": 2.7781371025392183e-05, + "loss": 5.8095, + "step": 4818 + }, + { + "epoch": 0.4647058823529412, + "grad_norm": 1.62057363986969, + "learning_rate": 2.7773844163629854e-05, + "loss": 5.6936, + "step": 4819 + }, + { + "epoch": 0.4648023143683703, + "grad_norm": 1.5423678159713745, + "learning_rate": 2.776631704728752e-05, + "loss": 5.7093, + "step": 4820 + }, + { + "epoch": 0.4648987463837994, + "grad_norm": 2.223388195037842, + "learning_rate": 2.7758789677056023e-05, + "loss": 5.9011, + "step": 4821 + }, + { + "epoch": 0.46499517839922855, + "grad_norm": 1.764089822769165, + "learning_rate": 2.7751262053626197e-05, + "loss": 5.6126, + "step": 4822 + }, + { + "epoch": 0.4650916104146577, + "grad_norm": 1.4490567445755005, + "learning_rate": 2.7743734177688925e-05, + "loss": 5.7452, + "step": 4823 + }, + { + "epoch": 0.46518804243008677, + "grad_norm": 2.06386137008667, + "learning_rate": 2.773620604993511e-05, + "loss": 5.6643, + "step": 4824 + }, + { + "epoch": 0.4652844744455159, + "grad_norm": 2.1833691596984863, + "learning_rate": 2.7728677671055674e-05, + "loss": 5.6558, + "step": 4825 + }, + { + "epoch": 0.46538090646094504, + "grad_norm": 2.087975025177002, + "learning_rate": 2.7721149041741555e-05, + "loss": 5.5859, + "step": 4826 + }, + { + "epoch": 0.4654773384763742, + "grad_norm": 1.9582570791244507, + "learning_rate": 2.7713620162683723e-05, + "loss": 5.6164, + "step": 4827 + }, + { + "epoch": 0.46557377049180326, + "grad_norm": 1.8664404153823853, + "learning_rate": 2.770609103457318e-05, + "loss": 5.7761, + "step": 4828 + }, + { + "epoch": 0.4656702025072324, + "grad_norm": 1.9699987173080444, + "learning_rate": 2.769856165810093e-05, + "loss": 5.7143, + "step": 4829 + }, + { + "epoch": 0.46576663452266154, + "grad_norm": 1.784536361694336, + "learning_rate": 2.7691032033958016e-05, + "loss": 5.7618, + "step": 4830 + }, + { + "epoch": 0.4658630665380906, + "grad_norm": 1.6232472658157349, + "learning_rate": 2.7683502162835483e-05, + "loss": 5.8468, + "step": 4831 + }, + { + "epoch": 0.46595949855351976, + "grad_norm": 1.599867820739746, + "learning_rate": 2.767597204542443e-05, + "loss": 5.7692, + "step": 4832 + }, + { + "epoch": 0.4660559305689489, + "grad_norm": 1.7154910564422607, + "learning_rate": 2.7668441682415963e-05, + "loss": 5.8463, + "step": 4833 + }, + { + "epoch": 0.46615236258437803, + "grad_norm": 1.3809905052185059, + "learning_rate": 2.76609110745012e-05, + "loss": 5.8323, + "step": 4834 + }, + { + "epoch": 0.4662487945998071, + "grad_norm": 1.5741825103759766, + "learning_rate": 2.765338022237129e-05, + "loss": 5.7917, + "step": 4835 + }, + { + "epoch": 0.46634522661523625, + "grad_norm": 2.005873203277588, + "learning_rate": 2.7645849126717406e-05, + "loss": 5.8623, + "step": 4836 + }, + { + "epoch": 0.4664416586306654, + "grad_norm": 2.0261600017547607, + "learning_rate": 2.763831778823075e-05, + "loss": 5.7945, + "step": 4837 + }, + { + "epoch": 0.4665380906460945, + "grad_norm": 1.990602970123291, + "learning_rate": 2.7630786207602532e-05, + "loss": 5.7046, + "step": 4838 + }, + { + "epoch": 0.4666345226615236, + "grad_norm": 1.8545225858688354, + "learning_rate": 2.7623254385523994e-05, + "loss": 5.7974, + "step": 4839 + }, + { + "epoch": 0.46673095467695275, + "grad_norm": 2.0035717487335205, + "learning_rate": 2.7615722322686404e-05, + "loss": 5.6687, + "step": 4840 + }, + { + "epoch": 0.4668273866923819, + "grad_norm": 1.90520179271698, + "learning_rate": 2.7608190019781032e-05, + "loss": 5.7572, + "step": 4841 + }, + { + "epoch": 0.46692381870781097, + "grad_norm": 1.6737351417541504, + "learning_rate": 2.7600657477499192e-05, + "loss": 5.6497, + "step": 4842 + }, + { + "epoch": 0.4670202507232401, + "grad_norm": 1.838908314704895, + "learning_rate": 2.7593124696532212e-05, + "loss": 5.6931, + "step": 4843 + }, + { + "epoch": 0.46711668273866924, + "grad_norm": 1.8747793436050415, + "learning_rate": 2.7585591677571437e-05, + "loss": 5.7659, + "step": 4844 + }, + { + "epoch": 0.4672131147540984, + "grad_norm": 1.7211650609970093, + "learning_rate": 2.757805842130824e-05, + "loss": 5.7634, + "step": 4845 + }, + { + "epoch": 0.46730954676952746, + "grad_norm": 2.1007604598999023, + "learning_rate": 2.757052492843401e-05, + "loss": 5.8444, + "step": 4846 + }, + { + "epoch": 0.4674059787849566, + "grad_norm": 2.2642738819122314, + "learning_rate": 2.7562991199640176e-05, + "loss": 5.7327, + "step": 4847 + }, + { + "epoch": 0.46750241080038574, + "grad_norm": 1.8395092487335205, + "learning_rate": 2.7555457235618155e-05, + "loss": 5.7361, + "step": 4848 + }, + { + "epoch": 0.4675988428158149, + "grad_norm": 1.9908980131149292, + "learning_rate": 2.754792303705942e-05, + "loss": 5.7052, + "step": 4849 + }, + { + "epoch": 0.46769527483124396, + "grad_norm": 2.3746869564056396, + "learning_rate": 2.7540388604655437e-05, + "loss": 5.8116, + "step": 4850 + }, + { + "epoch": 0.4677917068466731, + "grad_norm": 1.979102611541748, + "learning_rate": 2.7532853939097724e-05, + "loss": 5.8113, + "step": 4851 + }, + { + "epoch": 0.46788813886210223, + "grad_norm": 2.5219039916992188, + "learning_rate": 2.7525319041077792e-05, + "loss": 5.7187, + "step": 4852 + }, + { + "epoch": 0.4679845708775313, + "grad_norm": 2.4100916385650635, + "learning_rate": 2.7517783911287182e-05, + "loss": 5.8664, + "step": 4853 + }, + { + "epoch": 0.46808100289296045, + "grad_norm": 1.8374701738357544, + "learning_rate": 2.751024855041746e-05, + "loss": 5.7868, + "step": 4854 + }, + { + "epoch": 0.4681774349083896, + "grad_norm": 2.7892227172851562, + "learning_rate": 2.750271295916022e-05, + "loss": 5.6804, + "step": 4855 + }, + { + "epoch": 0.4682738669238187, + "grad_norm": 2.7163126468658447, + "learning_rate": 2.7495177138207067e-05, + "loss": 5.7406, + "step": 4856 + }, + { + "epoch": 0.4683702989392478, + "grad_norm": 1.5884915590286255, + "learning_rate": 2.7487641088249628e-05, + "loss": 5.7692, + "step": 4857 + }, + { + "epoch": 0.46846673095467695, + "grad_norm": 3.571331739425659, + "learning_rate": 2.748010480997955e-05, + "loss": 5.5093, + "step": 4858 + }, + { + "epoch": 0.4685631629701061, + "grad_norm": 3.3724939823150635, + "learning_rate": 2.7472568304088502e-05, + "loss": 5.822, + "step": 4859 + }, + { + "epoch": 0.4686595949855352, + "grad_norm": 2.9486286640167236, + "learning_rate": 2.7465031571268185e-05, + "loss": 5.6501, + "step": 4860 + }, + { + "epoch": 0.4687560270009643, + "grad_norm": 2.1603493690490723, + "learning_rate": 2.74574946122103e-05, + "loss": 5.6344, + "step": 4861 + }, + { + "epoch": 0.46885245901639344, + "grad_norm": 2.8758633136749268, + "learning_rate": 2.7449957427606588e-05, + "loss": 5.8419, + "step": 4862 + }, + { + "epoch": 0.4689488910318226, + "grad_norm": 3.499213695526123, + "learning_rate": 2.7442420018148797e-05, + "loss": 5.7113, + "step": 4863 + }, + { + "epoch": 0.46904532304725166, + "grad_norm": 2.7057011127471924, + "learning_rate": 2.7434882384528697e-05, + "loss": 5.5974, + "step": 4864 + }, + { + "epoch": 0.4691417550626808, + "grad_norm": 2.040910005569458, + "learning_rate": 2.7427344527438097e-05, + "loss": 5.8588, + "step": 4865 + }, + { + "epoch": 0.46923818707810994, + "grad_norm": 1.8344119787216187, + "learning_rate": 2.7419806447568795e-05, + "loss": 5.6663, + "step": 4866 + }, + { + "epoch": 0.4693346190935391, + "grad_norm": 2.0097503662109375, + "learning_rate": 2.7412268145612642e-05, + "loss": 5.6317, + "step": 4867 + }, + { + "epoch": 0.46943105110896816, + "grad_norm": 1.9308228492736816, + "learning_rate": 2.7404729622261487e-05, + "loss": 5.5693, + "step": 4868 + }, + { + "epoch": 0.4695274831243973, + "grad_norm": 1.531241536140442, + "learning_rate": 2.7397190878207207e-05, + "loss": 5.5247, + "step": 4869 + }, + { + "epoch": 0.46962391513982643, + "grad_norm": 1.2958364486694336, + "learning_rate": 2.73896519141417e-05, + "loss": 5.8209, + "step": 4870 + }, + { + "epoch": 0.46972034715525557, + "grad_norm": 1.4828696250915527, + "learning_rate": 2.7382112730756875e-05, + "loss": 5.8509, + "step": 4871 + }, + { + "epoch": 0.46981677917068465, + "grad_norm": 1.6637134552001953, + "learning_rate": 2.7374573328744678e-05, + "loss": 5.784, + "step": 4872 + }, + { + "epoch": 0.4699132111861138, + "grad_norm": 1.548252820968628, + "learning_rate": 2.7367033708797073e-05, + "loss": 5.7435, + "step": 4873 + }, + { + "epoch": 0.4700096432015429, + "grad_norm": 1.9600919485092163, + "learning_rate": 2.735949387160602e-05, + "loss": 5.81, + "step": 4874 + }, + { + "epoch": 0.470106075216972, + "grad_norm": 1.8701512813568115, + "learning_rate": 2.7351953817863518e-05, + "loss": 5.75, + "step": 4875 + }, + { + "epoch": 0.47020250723240115, + "grad_norm": 1.8608952760696411, + "learning_rate": 2.7344413548261595e-05, + "loss": 5.754, + "step": 4876 + }, + { + "epoch": 0.4702989392478303, + "grad_norm": 2.0801727771759033, + "learning_rate": 2.733687306349228e-05, + "loss": 5.6978, + "step": 4877 + }, + { + "epoch": 0.4703953712632594, + "grad_norm": 1.5802890062332153, + "learning_rate": 2.7329332364247634e-05, + "loss": 5.689, + "step": 4878 + }, + { + "epoch": 0.4704918032786885, + "grad_norm": 2.2211477756500244, + "learning_rate": 2.7321791451219725e-05, + "loss": 5.6883, + "step": 4879 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 2.621126413345337, + "learning_rate": 2.7314250325100664e-05, + "loss": 5.6189, + "step": 4880 + }, + { + "epoch": 0.4706846673095468, + "grad_norm": 1.680493712425232, + "learning_rate": 2.7306708986582553e-05, + "loss": 5.7166, + "step": 4881 + }, + { + "epoch": 0.4707810993249759, + "grad_norm": 1.713272213935852, + "learning_rate": 2.7299167436357524e-05, + "loss": 5.6267, + "step": 4882 + }, + { + "epoch": 0.470877531340405, + "grad_norm": 1.9174120426177979, + "learning_rate": 2.7291625675117744e-05, + "loss": 5.6578, + "step": 4883 + }, + { + "epoch": 0.47097396335583414, + "grad_norm": 1.750230073928833, + "learning_rate": 2.728408370355538e-05, + "loss": 5.5416, + "step": 4884 + }, + { + "epoch": 0.4710703953712633, + "grad_norm": 1.5479539632797241, + "learning_rate": 2.7276541522362624e-05, + "loss": 5.7207, + "step": 4885 + }, + { + "epoch": 0.47116682738669236, + "grad_norm": 1.8800245523452759, + "learning_rate": 2.726899913223169e-05, + "loss": 5.7786, + "step": 4886 + }, + { + "epoch": 0.4712632594021215, + "grad_norm": 1.9958713054656982, + "learning_rate": 2.7261456533854807e-05, + "loss": 5.6218, + "step": 4887 + }, + { + "epoch": 0.47135969141755063, + "grad_norm": 1.6197346448898315, + "learning_rate": 2.725391372792423e-05, + "loss": 5.6216, + "step": 4888 + }, + { + "epoch": 0.47145612343297977, + "grad_norm": 1.7074816226959229, + "learning_rate": 2.724637071513222e-05, + "loss": 5.641, + "step": 4889 + }, + { + "epoch": 0.47155255544840885, + "grad_norm": 1.9855444431304932, + "learning_rate": 2.723882749617107e-05, + "loss": 5.6716, + "step": 4890 + }, + { + "epoch": 0.471648987463838, + "grad_norm": 1.5248613357543945, + "learning_rate": 2.7231284071733093e-05, + "loss": 5.7411, + "step": 4891 + }, + { + "epoch": 0.4717454194792671, + "grad_norm": 1.3223220109939575, + "learning_rate": 2.7223740442510602e-05, + "loss": 5.7529, + "step": 4892 + }, + { + "epoch": 0.47184185149469626, + "grad_norm": 1.7580558061599731, + "learning_rate": 2.721619660919596e-05, + "loss": 5.778, + "step": 4893 + }, + { + "epoch": 0.47193828351012534, + "grad_norm": 1.5793923139572144, + "learning_rate": 2.7208652572481513e-05, + "loss": 5.7989, + "step": 4894 + }, + { + "epoch": 0.4720347155255545, + "grad_norm": 3.055870771408081, + "learning_rate": 2.720110833305966e-05, + "loss": 5.6088, + "step": 4895 + }, + { + "epoch": 0.4721311475409836, + "grad_norm": 3.195138454437256, + "learning_rate": 2.7193563891622786e-05, + "loss": 5.7481, + "step": 4896 + }, + { + "epoch": 0.4722275795564127, + "grad_norm": 3.7741734981536865, + "learning_rate": 2.718601924886332e-05, + "loss": 5.7944, + "step": 4897 + }, + { + "epoch": 0.47232401157184184, + "grad_norm": 3.0421841144561768, + "learning_rate": 2.7178474405473696e-05, + "loss": 6.0575, + "step": 4898 + }, + { + "epoch": 0.472420443587271, + "grad_norm": 2.2138164043426514, + "learning_rate": 2.7170929362146376e-05, + "loss": 6.253, + "step": 4899 + }, + { + "epoch": 0.4725168756027001, + "grad_norm": 2.7082407474517822, + "learning_rate": 2.7163384119573826e-05, + "loss": 6.1282, + "step": 4900 + }, + { + "epoch": 0.4726133076181292, + "grad_norm": 3.9512364864349365, + "learning_rate": 2.7155838678448548e-05, + "loss": 5.781, + "step": 4901 + }, + { + "epoch": 0.47270973963355833, + "grad_norm": 2.815291404724121, + "learning_rate": 2.714829303946305e-05, + "loss": 5.8269, + "step": 4902 + }, + { + "epoch": 0.47280617164898747, + "grad_norm": 2.04531192779541, + "learning_rate": 2.7140747203309857e-05, + "loss": 5.7875, + "step": 4903 + }, + { + "epoch": 0.4729026036644166, + "grad_norm": 2.7039270401000977, + "learning_rate": 2.7133201170681522e-05, + "loss": 5.725, + "step": 4904 + }, + { + "epoch": 0.4729990356798457, + "grad_norm": 2.379145860671997, + "learning_rate": 2.7125654942270612e-05, + "loss": 5.7755, + "step": 4905 + }, + { + "epoch": 0.47309546769527483, + "grad_norm": 1.9841159582138062, + "learning_rate": 2.711810851876971e-05, + "loss": 5.8276, + "step": 4906 + }, + { + "epoch": 0.47319189971070397, + "grad_norm": 1.7407050132751465, + "learning_rate": 2.7110561900871413e-05, + "loss": 5.8267, + "step": 4907 + }, + { + "epoch": 0.47328833172613305, + "grad_norm": 1.558061957359314, + "learning_rate": 2.7103015089268337e-05, + "loss": 5.8571, + "step": 4908 + }, + { + "epoch": 0.4733847637415622, + "grad_norm": 1.550650715827942, + "learning_rate": 2.709546808465313e-05, + "loss": 5.8545, + "step": 4909 + }, + { + "epoch": 0.4734811957569913, + "grad_norm": 1.6654208898544312, + "learning_rate": 2.7087920887718444e-05, + "loss": 5.7673, + "step": 4910 + }, + { + "epoch": 0.47357762777242046, + "grad_norm": 1.8038803339004517, + "learning_rate": 2.708037349915694e-05, + "loss": 5.686, + "step": 4911 + }, + { + "epoch": 0.47367405978784954, + "grad_norm": 1.5459140539169312, + "learning_rate": 2.707282591966132e-05, + "loss": 5.7618, + "step": 4912 + }, + { + "epoch": 0.4737704918032787, + "grad_norm": 1.6822502613067627, + "learning_rate": 2.7065278149924296e-05, + "loss": 5.7643, + "step": 4913 + }, + { + "epoch": 0.4738669238187078, + "grad_norm": 1.7573872804641724, + "learning_rate": 2.7057730190638574e-05, + "loss": 5.7545, + "step": 4914 + }, + { + "epoch": 0.47396335583413696, + "grad_norm": 1.0686627626419067, + "learning_rate": 2.705018204249691e-05, + "loss": 5.7545, + "step": 4915 + }, + { + "epoch": 0.47405978784956604, + "grad_norm": 1.8351482152938843, + "learning_rate": 2.7042633706192067e-05, + "loss": 5.7143, + "step": 4916 + }, + { + "epoch": 0.4741562198649952, + "grad_norm": 2.1328985691070557, + "learning_rate": 2.7035085182416813e-05, + "loss": 5.7897, + "step": 4917 + }, + { + "epoch": 0.4742526518804243, + "grad_norm": 1.8131377696990967, + "learning_rate": 2.7027536471863945e-05, + "loss": 5.9233, + "step": 4918 + }, + { + "epoch": 0.4743490838958534, + "grad_norm": 1.603441596031189, + "learning_rate": 2.701998757522628e-05, + "loss": 5.7093, + "step": 4919 + }, + { + "epoch": 0.47444551591128253, + "grad_norm": 2.4346892833709717, + "learning_rate": 2.7012438493196636e-05, + "loss": 5.8217, + "step": 4920 + }, + { + "epoch": 0.47454194792671167, + "grad_norm": 2.202004909515381, + "learning_rate": 2.700488922646786e-05, + "loss": 5.8613, + "step": 4921 + }, + { + "epoch": 0.4746383799421408, + "grad_norm": 1.5551191568374634, + "learning_rate": 2.6997339775732827e-05, + "loss": 5.7471, + "step": 4922 + }, + { + "epoch": 0.4747348119575699, + "grad_norm": 1.8719854354858398, + "learning_rate": 2.69897901416844e-05, + "loss": 5.7447, + "step": 4923 + }, + { + "epoch": 0.47483124397299903, + "grad_norm": 1.9486395120620728, + "learning_rate": 2.6982240325015497e-05, + "loss": 5.7338, + "step": 4924 + }, + { + "epoch": 0.47492767598842817, + "grad_norm": 1.907949447631836, + "learning_rate": 2.6974690326419007e-05, + "loss": 5.7494, + "step": 4925 + }, + { + "epoch": 0.4750241080038573, + "grad_norm": 1.833457350730896, + "learning_rate": 2.6967140146587865e-05, + "loss": 5.7796, + "step": 4926 + }, + { + "epoch": 0.4751205400192864, + "grad_norm": 2.4033830165863037, + "learning_rate": 2.695958978621502e-05, + "loss": 5.6407, + "step": 4927 + }, + { + "epoch": 0.4752169720347155, + "grad_norm": 2.464876413345337, + "learning_rate": 2.6952039245993454e-05, + "loss": 5.563, + "step": 4928 + }, + { + "epoch": 0.47531340405014466, + "grad_norm": 2.5031180381774902, + "learning_rate": 2.6944488526616114e-05, + "loss": 5.7681, + "step": 4929 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 2.6134719848632812, + "learning_rate": 2.693693762877601e-05, + "loss": 5.7701, + "step": 4930 + }, + { + "epoch": 0.4755062680810029, + "grad_norm": 2.9847075939178467, + "learning_rate": 2.6929386553166164e-05, + "loss": 5.7626, + "step": 4931 + }, + { + "epoch": 0.475602700096432, + "grad_norm": 2.5493452548980713, + "learning_rate": 2.6921835300479588e-05, + "loss": 5.6584, + "step": 4932 + }, + { + "epoch": 0.47569913211186116, + "grad_norm": 2.451726198196411, + "learning_rate": 2.6914283871409333e-05, + "loss": 5.7544, + "step": 4933 + }, + { + "epoch": 0.47579556412729024, + "grad_norm": 3.009620428085327, + "learning_rate": 2.690673226664846e-05, + "loss": 5.8015, + "step": 4934 + }, + { + "epoch": 0.4758919961427194, + "grad_norm": 3.676215887069702, + "learning_rate": 2.6899180486890053e-05, + "loss": 5.683, + "step": 4935 + }, + { + "epoch": 0.4759884281581485, + "grad_norm": 3.483433961868286, + "learning_rate": 2.689162853282719e-05, + "loss": 5.6985, + "step": 4936 + }, + { + "epoch": 0.47608486017357765, + "grad_norm": 2.878925085067749, + "learning_rate": 2.6884076405152992e-05, + "loss": 5.7872, + "step": 4937 + }, + { + "epoch": 0.47618129218900673, + "grad_norm": 2.977109432220459, + "learning_rate": 2.687652410456058e-05, + "loss": 5.6737, + "step": 4938 + }, + { + "epoch": 0.47627772420443587, + "grad_norm": 2.643752098083496, + "learning_rate": 2.6868971631743102e-05, + "loss": 5.7092, + "step": 4939 + }, + { + "epoch": 0.476374156219865, + "grad_norm": 2.4497697353363037, + "learning_rate": 2.68614189873937e-05, + "loss": 5.8431, + "step": 4940 + }, + { + "epoch": 0.4764705882352941, + "grad_norm": 2.946505308151245, + "learning_rate": 2.6853866172205556e-05, + "loss": 5.6967, + "step": 4941 + }, + { + "epoch": 0.47656702025072323, + "grad_norm": 3.491062641143799, + "learning_rate": 2.6846313186871853e-05, + "loss": 5.7401, + "step": 4942 + }, + { + "epoch": 0.47666345226615237, + "grad_norm": 2.0085761547088623, + "learning_rate": 2.6838760032085796e-05, + "loss": 5.7184, + "step": 4943 + }, + { + "epoch": 0.4767598842815815, + "grad_norm": 2.007800340652466, + "learning_rate": 2.6831206708540612e-05, + "loss": 5.5742, + "step": 4944 + }, + { + "epoch": 0.4768563162970106, + "grad_norm": 2.505274772644043, + "learning_rate": 2.6823653216929536e-05, + "loss": 5.4875, + "step": 4945 + }, + { + "epoch": 0.4769527483124397, + "grad_norm": 2.1497929096221924, + "learning_rate": 2.6816099557945806e-05, + "loss": 5.7031, + "step": 4946 + }, + { + "epoch": 0.47704918032786886, + "grad_norm": 1.7304277420043945, + "learning_rate": 2.6808545732282693e-05, + "loss": 5.7611, + "step": 4947 + }, + { + "epoch": 0.477145612343298, + "grad_norm": 1.7527779340744019, + "learning_rate": 2.680099174063348e-05, + "loss": 5.7279, + "step": 4948 + }, + { + "epoch": 0.4772420443587271, + "grad_norm": 1.9652087688446045, + "learning_rate": 2.6793437583691462e-05, + "loss": 5.7917, + "step": 4949 + }, + { + "epoch": 0.4773384763741562, + "grad_norm": 1.9624031782150269, + "learning_rate": 2.6785883262149952e-05, + "loss": 5.5955, + "step": 4950 + }, + { + "epoch": 0.47743490838958536, + "grad_norm": 1.8287314176559448, + "learning_rate": 2.677832877670227e-05, + "loss": 5.688, + "step": 4951 + }, + { + "epoch": 0.47753134040501444, + "grad_norm": 1.9583722352981567, + "learning_rate": 2.6770774128041766e-05, + "loss": 5.76, + "step": 4952 + }, + { + "epoch": 0.4776277724204436, + "grad_norm": 1.9552634954452515, + "learning_rate": 2.6763219316861788e-05, + "loss": 5.6579, + "step": 4953 + }, + { + "epoch": 0.4777242044358727, + "grad_norm": 1.6642484664916992, + "learning_rate": 2.6755664343855712e-05, + "loss": 5.7425, + "step": 4954 + }, + { + "epoch": 0.47782063645130185, + "grad_norm": 1.8286492824554443, + "learning_rate": 2.6748109209716925e-05, + "loss": 5.7882, + "step": 4955 + }, + { + "epoch": 0.47791706846673093, + "grad_norm": 1.8020702600479126, + "learning_rate": 2.6740553915138822e-05, + "loss": 5.7651, + "step": 4956 + }, + { + "epoch": 0.47801350048216007, + "grad_norm": 1.728614330291748, + "learning_rate": 2.6732998460814835e-05, + "loss": 5.7739, + "step": 4957 + }, + { + "epoch": 0.4781099324975892, + "grad_norm": 1.4404003620147705, + "learning_rate": 2.672544284743837e-05, + "loss": 5.7573, + "step": 4958 + }, + { + "epoch": 0.47820636451301834, + "grad_norm": 1.694737195968628, + "learning_rate": 2.6717887075702887e-05, + "loss": 5.8295, + "step": 4959 + }, + { + "epoch": 0.4783027965284474, + "grad_norm": 2.171433448791504, + "learning_rate": 2.6710331146301844e-05, + "loss": 5.5508, + "step": 4960 + }, + { + "epoch": 0.47839922854387656, + "grad_norm": 1.5765132904052734, + "learning_rate": 2.670277505992871e-05, + "loss": 5.5504, + "step": 4961 + }, + { + "epoch": 0.4784956605593057, + "grad_norm": 1.4346520900726318, + "learning_rate": 2.669521881727698e-05, + "loss": 5.6773, + "step": 4962 + }, + { + "epoch": 0.4785920925747348, + "grad_norm": 1.6297937631607056, + "learning_rate": 2.6687662419040153e-05, + "loss": 5.681, + "step": 4963 + }, + { + "epoch": 0.4786885245901639, + "grad_norm": 1.6210566759109497, + "learning_rate": 2.668010586591175e-05, + "loss": 5.6, + "step": 4964 + }, + { + "epoch": 0.47878495660559306, + "grad_norm": 1.636872410774231, + "learning_rate": 2.6672549158585293e-05, + "loss": 5.6496, + "step": 4965 + }, + { + "epoch": 0.4788813886210222, + "grad_norm": 1.349872350692749, + "learning_rate": 2.6664992297754332e-05, + "loss": 5.764, + "step": 4966 + }, + { + "epoch": 0.4789778206364513, + "grad_norm": 1.3485430479049683, + "learning_rate": 2.6657435284112424e-05, + "loss": 5.8034, + "step": 4967 + }, + { + "epoch": 0.4790742526518804, + "grad_norm": 1.6763310432434082, + "learning_rate": 2.664987811835316e-05, + "loss": 5.7678, + "step": 4968 + }, + { + "epoch": 0.47917068466730955, + "grad_norm": 1.4603441953659058, + "learning_rate": 2.66423208011701e-05, + "loss": 5.6696, + "step": 4969 + }, + { + "epoch": 0.4792671166827387, + "grad_norm": 1.2345088720321655, + "learning_rate": 2.6634763333256863e-05, + "loss": 5.7198, + "step": 4970 + }, + { + "epoch": 0.4793635486981678, + "grad_norm": 1.5188935995101929, + "learning_rate": 2.6627205715307057e-05, + "loss": 5.7353, + "step": 4971 + }, + { + "epoch": 0.4794599807135969, + "grad_norm": 1.606921911239624, + "learning_rate": 2.661964794801432e-05, + "loss": 5.6306, + "step": 4972 + }, + { + "epoch": 0.47955641272902605, + "grad_norm": 2.17566180229187, + "learning_rate": 2.6612090032072284e-05, + "loss": 5.7285, + "step": 4973 + }, + { + "epoch": 0.4796528447444552, + "grad_norm": 1.5922815799713135, + "learning_rate": 2.6604531968174607e-05, + "loss": 5.7802, + "step": 4974 + }, + { + "epoch": 0.47974927675988427, + "grad_norm": 2.4782638549804688, + "learning_rate": 2.659697375701496e-05, + "loss": 5.7164, + "step": 4975 + }, + { + "epoch": 0.4798457087753134, + "grad_norm": 1.6793845891952515, + "learning_rate": 2.6589415399287033e-05, + "loss": 5.7984, + "step": 4976 + }, + { + "epoch": 0.47994214079074254, + "grad_norm": 1.7010207176208496, + "learning_rate": 2.658185689568451e-05, + "loss": 5.7911, + "step": 4977 + }, + { + "epoch": 0.4800385728061716, + "grad_norm": 3.0203630924224854, + "learning_rate": 2.6574298246901107e-05, + "loss": 5.8888, + "step": 4978 + }, + { + "epoch": 0.48013500482160076, + "grad_norm": 1.9130791425704956, + "learning_rate": 2.6566739453630556e-05, + "loss": 5.6777, + "step": 4979 + }, + { + "epoch": 0.4802314368370299, + "grad_norm": 1.9607347249984741, + "learning_rate": 2.6559180516566584e-05, + "loss": 5.6999, + "step": 4980 + }, + { + "epoch": 0.48032786885245904, + "grad_norm": 1.6009639501571655, + "learning_rate": 2.655162143640294e-05, + "loss": 5.7008, + "step": 4981 + }, + { + "epoch": 0.4804243008678881, + "grad_norm": 1.400827169418335, + "learning_rate": 2.654406221383339e-05, + "loss": 5.8473, + "step": 4982 + }, + { + "epoch": 0.48052073288331726, + "grad_norm": 1.655078411102295, + "learning_rate": 2.6536502849551713e-05, + "loss": 5.7807, + "step": 4983 + }, + { + "epoch": 0.4806171648987464, + "grad_norm": 2.018625497817993, + "learning_rate": 2.6528943344251693e-05, + "loss": 5.7288, + "step": 4984 + }, + { + "epoch": 0.48071359691417553, + "grad_norm": 2.178401470184326, + "learning_rate": 2.652138369862714e-05, + "loss": 5.7283, + "step": 4985 + }, + { + "epoch": 0.4808100289296046, + "grad_norm": 2.5138113498687744, + "learning_rate": 2.651382391337186e-05, + "loss": 5.7497, + "step": 4986 + }, + { + "epoch": 0.48090646094503375, + "grad_norm": 2.453620672225952, + "learning_rate": 2.6506263989179685e-05, + "loss": 5.6425, + "step": 4987 + }, + { + "epoch": 0.4810028929604629, + "grad_norm": 1.8234479427337646, + "learning_rate": 2.6498703926744456e-05, + "loss": 5.6771, + "step": 4988 + }, + { + "epoch": 0.481099324975892, + "grad_norm": 1.7518612146377563, + "learning_rate": 2.649114372676002e-05, + "loss": 5.7324, + "step": 4989 + }, + { + "epoch": 0.4811957569913211, + "grad_norm": 2.1065306663513184, + "learning_rate": 2.648358338992026e-05, + "loss": 5.8135, + "step": 4990 + }, + { + "epoch": 0.48129218900675025, + "grad_norm": 3.534090042114258, + "learning_rate": 2.6476022916919036e-05, + "loss": 5.663, + "step": 4991 + }, + { + "epoch": 0.4813886210221794, + "grad_norm": 2.2772722244262695, + "learning_rate": 2.646846230845024e-05, + "loss": 5.625, + "step": 4992 + }, + { + "epoch": 0.48148505303760847, + "grad_norm": 2.5039610862731934, + "learning_rate": 2.6460901565207785e-05, + "loss": 5.7488, + "step": 4993 + }, + { + "epoch": 0.4815814850530376, + "grad_norm": 2.5844907760620117, + "learning_rate": 2.6453340687885586e-05, + "loss": 5.7418, + "step": 4994 + }, + { + "epoch": 0.48167791706846674, + "grad_norm": 2.5385990142822266, + "learning_rate": 2.6445779677177562e-05, + "loss": 5.6759, + "step": 4995 + }, + { + "epoch": 0.4817743490838959, + "grad_norm": 2.1848161220550537, + "learning_rate": 2.643821853377766e-05, + "loss": 5.7978, + "step": 4996 + }, + { + "epoch": 0.48187078109932496, + "grad_norm": 2.6788618564605713, + "learning_rate": 2.643065725837983e-05, + "loss": 5.683, + "step": 4997 + }, + { + "epoch": 0.4819672131147541, + "grad_norm": 2.2895946502685547, + "learning_rate": 2.6423095851678043e-05, + "loss": 5.7728, + "step": 4998 + }, + { + "epoch": 0.48206364513018324, + "grad_norm": 1.8980505466461182, + "learning_rate": 2.6415534314366264e-05, + "loss": 5.7927, + "step": 4999 + }, + { + "epoch": 0.4821600771456123, + "grad_norm": 1.9490234851837158, + "learning_rate": 2.640797264713849e-05, + "loss": 5.6447, + "step": 5000 + }, + { + "epoch": 0.48225650916104146, + "grad_norm": 1.9799362421035767, + "learning_rate": 2.6400410850688724e-05, + "loss": 5.7736, + "step": 5001 + }, + { + "epoch": 0.4823529411764706, + "grad_norm": 2.3801631927490234, + "learning_rate": 2.639284892571097e-05, + "loss": 5.4808, + "step": 5002 + }, + { + "epoch": 0.48244937319189973, + "grad_norm": 2.7565908432006836, + "learning_rate": 2.638528687289925e-05, + "loss": 5.7098, + "step": 5003 + }, + { + "epoch": 0.4825458052073288, + "grad_norm": 2.5897960662841797, + "learning_rate": 2.6377724692947614e-05, + "loss": 5.6419, + "step": 5004 + }, + { + "epoch": 0.48264223722275795, + "grad_norm": 1.7183594703674316, + "learning_rate": 2.6370162386550095e-05, + "loss": 5.8744, + "step": 5005 + }, + { + "epoch": 0.4827386692381871, + "grad_norm": 2.0131847858428955, + "learning_rate": 2.6362599954400765e-05, + "loss": 5.5119, + "step": 5006 + }, + { + "epoch": 0.48283510125361623, + "grad_norm": 2.599799394607544, + "learning_rate": 2.6355037397193684e-05, + "loss": 5.459, + "step": 5007 + }, + { + "epoch": 0.4829315332690453, + "grad_norm": 2.2636542320251465, + "learning_rate": 2.6347474715622938e-05, + "loss": 5.6497, + "step": 5008 + }, + { + "epoch": 0.48302796528447445, + "grad_norm": 1.9286701679229736, + "learning_rate": 2.6339911910382624e-05, + "loss": 5.7434, + "step": 5009 + }, + { + "epoch": 0.4831243972999036, + "grad_norm": 2.3529317378997803, + "learning_rate": 2.6332348982166842e-05, + "loss": 5.4713, + "step": 5010 + }, + { + "epoch": 0.48322082931533267, + "grad_norm": 2.3228938579559326, + "learning_rate": 2.632478593166971e-05, + "loss": 5.5125, + "step": 5011 + }, + { + "epoch": 0.4833172613307618, + "grad_norm": 2.0926706790924072, + "learning_rate": 2.6317222759585364e-05, + "loss": 5.3228, + "step": 5012 + }, + { + "epoch": 0.48341369334619094, + "grad_norm": 1.956241250038147, + "learning_rate": 2.6309659466607927e-05, + "loss": 5.3903, + "step": 5013 + }, + { + "epoch": 0.4835101253616201, + "grad_norm": 1.88369619846344, + "learning_rate": 2.6302096053431552e-05, + "loss": 5.3323, + "step": 5014 + }, + { + "epoch": 0.48360655737704916, + "grad_norm": 2.6565351486206055, + "learning_rate": 2.629453252075041e-05, + "loss": 5.1326, + "step": 5015 + }, + { + "epoch": 0.4837029893924783, + "grad_norm": 2.1028239727020264, + "learning_rate": 2.6286968869258665e-05, + "loss": 5.2822, + "step": 5016 + }, + { + "epoch": 0.48379942140790744, + "grad_norm": 2.076768159866333, + "learning_rate": 2.62794050996505e-05, + "loss": 5.2335, + "step": 5017 + }, + { + "epoch": 0.4838958534233366, + "grad_norm": 2.4755523204803467, + "learning_rate": 2.6271841212620113e-05, + "loss": 5.2794, + "step": 5018 + }, + { + "epoch": 0.48399228543876566, + "grad_norm": 2.79244327545166, + "learning_rate": 2.6264277208861698e-05, + "loss": 5.4507, + "step": 5019 + }, + { + "epoch": 0.4840887174541948, + "grad_norm": 1.944574236869812, + "learning_rate": 2.625671308906949e-05, + "loss": 5.4458, + "step": 5020 + }, + { + "epoch": 0.48418514946962393, + "grad_norm": 2.9266984462738037, + "learning_rate": 2.624914885393769e-05, + "loss": 5.1852, + "step": 5021 + }, + { + "epoch": 0.484281581485053, + "grad_norm": 3.7801620960235596, + "learning_rate": 2.624158450416055e-05, + "loss": 5.2927, + "step": 5022 + }, + { + "epoch": 0.48437801350048215, + "grad_norm": 2.376615047454834, + "learning_rate": 2.6234020040432313e-05, + "loss": 5.3034, + "step": 5023 + }, + { + "epoch": 0.4844744455159113, + "grad_norm": 2.439450979232788, + "learning_rate": 2.622645546344723e-05, + "loss": 5.4697, + "step": 5024 + }, + { + "epoch": 0.4845708775313404, + "grad_norm": 3.408612012863159, + "learning_rate": 2.621889077389958e-05, + "loss": 5.8224, + "step": 5025 + }, + { + "epoch": 0.4846673095467695, + "grad_norm": 3.1510438919067383, + "learning_rate": 2.621132597248363e-05, + "loss": 4.9531, + "step": 5026 + }, + { + "epoch": 0.48476374156219865, + "grad_norm": 2.2659780979156494, + "learning_rate": 2.620376105989367e-05, + "loss": 5.3831, + "step": 5027 + }, + { + "epoch": 0.4848601735776278, + "grad_norm": 2.3410232067108154, + "learning_rate": 2.6196196036824013e-05, + "loss": 5.5219, + "step": 5028 + }, + { + "epoch": 0.4849566055930569, + "grad_norm": 2.9918057918548584, + "learning_rate": 2.6188630903968948e-05, + "loss": 5.5352, + "step": 5029 + }, + { + "epoch": 0.485053037608486, + "grad_norm": 2.3633885383605957, + "learning_rate": 2.61810656620228e-05, + "loss": 5.497, + "step": 5030 + }, + { + "epoch": 0.48514946962391514, + "grad_norm": 3.8001556396484375, + "learning_rate": 2.6173500311679904e-05, + "loss": 5.0775, + "step": 5031 + }, + { + "epoch": 0.4852459016393443, + "grad_norm": 2.2924964427948, + "learning_rate": 2.6165934853634593e-05, + "loss": 5.3991, + "step": 5032 + }, + { + "epoch": 0.48534233365477336, + "grad_norm": 2.932002067565918, + "learning_rate": 2.615836928858122e-05, + "loss": 5.4149, + "step": 5033 + }, + { + "epoch": 0.4854387656702025, + "grad_norm": 1.8758533000946045, + "learning_rate": 2.6150803617214137e-05, + "loss": 5.5189, + "step": 5034 + }, + { + "epoch": 0.48553519768563164, + "grad_norm": 1.9787033796310425, + "learning_rate": 2.6143237840227707e-05, + "loss": 5.198, + "step": 5035 + }, + { + "epoch": 0.4856316297010608, + "grad_norm": 2.8705403804779053, + "learning_rate": 2.613567195831632e-05, + "loss": 4.566, + "step": 5036 + }, + { + "epoch": 0.48572806171648986, + "grad_norm": 3.4282612800598145, + "learning_rate": 2.612810597217436e-05, + "loss": 4.6798, + "step": 5037 + }, + { + "epoch": 0.485824493731919, + "grad_norm": 3.153803825378418, + "learning_rate": 2.6120539882496226e-05, + "loss": 5.222, + "step": 5038 + }, + { + "epoch": 0.48592092574734813, + "grad_norm": 2.211120367050171, + "learning_rate": 2.6112973689976317e-05, + "loss": 5.2899, + "step": 5039 + }, + { + "epoch": 0.48601735776277727, + "grad_norm": 2.658893585205078, + "learning_rate": 2.610540739530905e-05, + "loss": 5.0542, + "step": 5040 + }, + { + "epoch": 0.48611378977820635, + "grad_norm": 2.9575839042663574, + "learning_rate": 2.6097840999188856e-05, + "loss": 5.2757, + "step": 5041 + }, + { + "epoch": 0.4862102217936355, + "grad_norm": 2.971038818359375, + "learning_rate": 2.609027450231017e-05, + "loss": 5.6058, + "step": 5042 + }, + { + "epoch": 0.4863066538090646, + "grad_norm": 2.727435827255249, + "learning_rate": 2.608270790536742e-05, + "loss": 5.4645, + "step": 5043 + }, + { + "epoch": 0.4864030858244937, + "grad_norm": 2.9171292781829834, + "learning_rate": 2.6075141209055083e-05, + "loss": 5.2384, + "step": 5044 + }, + { + "epoch": 0.48649951783992285, + "grad_norm": 2.2083208560943604, + "learning_rate": 2.6067574414067607e-05, + "loss": 5.293, + "step": 5045 + }, + { + "epoch": 0.486595949855352, + "grad_norm": 1.8364763259887695, + "learning_rate": 2.606000752109946e-05, + "loss": 5.3785, + "step": 5046 + }, + { + "epoch": 0.4866923818707811, + "grad_norm": 2.1449029445648193, + "learning_rate": 2.605244053084513e-05, + "loss": 5.4261, + "step": 5047 + }, + { + "epoch": 0.4867888138862102, + "grad_norm": 3.020095109939575, + "learning_rate": 2.60448734439991e-05, + "loss": 5.5655, + "step": 5048 + }, + { + "epoch": 0.48688524590163934, + "grad_norm": 2.364534616470337, + "learning_rate": 2.6037306261255873e-05, + "loss": 5.3684, + "step": 5049 + }, + { + "epoch": 0.4869816779170685, + "grad_norm": 2.076064109802246, + "learning_rate": 2.6029738983309954e-05, + "loss": 5.4924, + "step": 5050 + }, + { + "epoch": 0.4870781099324976, + "grad_norm": 2.7022335529327393, + "learning_rate": 2.6022171610855862e-05, + "loss": 5.0905, + "step": 5051 + }, + { + "epoch": 0.4871745419479267, + "grad_norm": 3.235853910446167, + "learning_rate": 2.601460414458811e-05, + "loss": 5.4348, + "step": 5052 + }, + { + "epoch": 0.48727097396335584, + "grad_norm": 3.1480302810668945, + "learning_rate": 2.600703658520125e-05, + "loss": 5.5643, + "step": 5053 + }, + { + "epoch": 0.487367405978785, + "grad_norm": 2.2738471031188965, + "learning_rate": 2.599946893338981e-05, + "loss": 5.5652, + "step": 5054 + }, + { + "epoch": 0.48746383799421406, + "grad_norm": 2.1571877002716064, + "learning_rate": 2.5991901189848334e-05, + "loss": 5.6194, + "step": 5055 + }, + { + "epoch": 0.4875602700096432, + "grad_norm": 2.5025532245635986, + "learning_rate": 2.5984333355271397e-05, + "loss": 5.4153, + "step": 5056 + }, + { + "epoch": 0.48765670202507233, + "grad_norm": 2.9581613540649414, + "learning_rate": 2.5976765430353543e-05, + "loss": 5.3035, + "step": 5057 + }, + { + "epoch": 0.48775313404050147, + "grad_norm": 2.5150671005249023, + "learning_rate": 2.596919741578937e-05, + "loss": 5.4932, + "step": 5058 + }, + { + "epoch": 0.48784956605593055, + "grad_norm": 2.3793435096740723, + "learning_rate": 2.5961629312273443e-05, + "loss": 4.9858, + "step": 5059 + }, + { + "epoch": 0.4879459980713597, + "grad_norm": 2.696916341781616, + "learning_rate": 2.5954061120500374e-05, + "loss": 5.1274, + "step": 5060 + }, + { + "epoch": 0.4880424300867888, + "grad_norm": 3.1577181816101074, + "learning_rate": 2.5946492841164738e-05, + "loss": 5.1536, + "step": 5061 + }, + { + "epoch": 0.48813886210221796, + "grad_norm": 3.4356534481048584, + "learning_rate": 2.5938924474961158e-05, + "loss": 5.1476, + "step": 5062 + }, + { + "epoch": 0.48823529411764705, + "grad_norm": 3.2146496772766113, + "learning_rate": 2.5931356022584246e-05, + "loss": 5.0467, + "step": 5063 + }, + { + "epoch": 0.4883317261330762, + "grad_norm": 4.339074611663818, + "learning_rate": 2.592378748472863e-05, + "loss": 5.0334, + "step": 5064 + }, + { + "epoch": 0.4884281581485053, + "grad_norm": 3.849595785140991, + "learning_rate": 2.5916218862088924e-05, + "loss": 5.3364, + "step": 5065 + }, + { + "epoch": 0.4885245901639344, + "grad_norm": 2.6950674057006836, + "learning_rate": 2.5908650155359786e-05, + "loss": 5.1432, + "step": 5066 + }, + { + "epoch": 0.48862102217936354, + "grad_norm": 2.4545371532440186, + "learning_rate": 2.5901081365235853e-05, + "loss": 5.4664, + "step": 5067 + }, + { + "epoch": 0.4887174541947927, + "grad_norm": 2.7599220275878906, + "learning_rate": 2.589351249241178e-05, + "loss": 5.4108, + "step": 5068 + }, + { + "epoch": 0.4888138862102218, + "grad_norm": 3.5005037784576416, + "learning_rate": 2.5885943537582237e-05, + "loss": 5.5283, + "step": 5069 + }, + { + "epoch": 0.4889103182256509, + "grad_norm": 3.0642809867858887, + "learning_rate": 2.5878374501441877e-05, + "loss": 5.1776, + "step": 5070 + }, + { + "epoch": 0.48900675024108003, + "grad_norm": 3.0205330848693848, + "learning_rate": 2.5870805384685393e-05, + "loss": 5.1209, + "step": 5071 + }, + { + "epoch": 0.4891031822565092, + "grad_norm": 3.383204698562622, + "learning_rate": 2.5863236188007455e-05, + "loss": 4.9736, + "step": 5072 + }, + { + "epoch": 0.4891996142719383, + "grad_norm": 2.957146644592285, + "learning_rate": 2.5855666912102764e-05, + "loss": 4.9405, + "step": 5073 + }, + { + "epoch": 0.4892960462873674, + "grad_norm": 3.0487935543060303, + "learning_rate": 2.5848097557666024e-05, + "loss": 5.4473, + "step": 5074 + }, + { + "epoch": 0.48939247830279653, + "grad_norm": 2.853590726852417, + "learning_rate": 2.5840528125391933e-05, + "loss": 5.2854, + "step": 5075 + }, + { + "epoch": 0.48948891031822567, + "grad_norm": 2.955972671508789, + "learning_rate": 2.58329586159752e-05, + "loss": 5.4554, + "step": 5076 + }, + { + "epoch": 0.48958534233365475, + "grad_norm": 2.736133575439453, + "learning_rate": 2.5825389030110553e-05, + "loss": 5.1771, + "step": 5077 + }, + { + "epoch": 0.4896817743490839, + "grad_norm": 2.2924087047576904, + "learning_rate": 2.5817819368492723e-05, + "loss": 5.4435, + "step": 5078 + }, + { + "epoch": 0.489778206364513, + "grad_norm": 2.7101800441741943, + "learning_rate": 2.581024963181643e-05, + "loss": 5.3375, + "step": 5079 + }, + { + "epoch": 0.48987463837994216, + "grad_norm": 2.5006566047668457, + "learning_rate": 2.5802679820776422e-05, + "loss": 5.5589, + "step": 5080 + }, + { + "epoch": 0.48997107039537124, + "grad_norm": 2.250356912612915, + "learning_rate": 2.579510993606745e-05, + "loss": 5.6289, + "step": 5081 + }, + { + "epoch": 0.4900675024108004, + "grad_norm": 2.5161337852478027, + "learning_rate": 2.5787539978384274e-05, + "loss": 5.1169, + "step": 5082 + }, + { + "epoch": 0.4901639344262295, + "grad_norm": 2.4604499340057373, + "learning_rate": 2.5779969948421638e-05, + "loss": 5.4473, + "step": 5083 + }, + { + "epoch": 0.49026036644165866, + "grad_norm": 2.2055327892303467, + "learning_rate": 2.5772399846874323e-05, + "loss": 5.4279, + "step": 5084 + }, + { + "epoch": 0.49035679845708774, + "grad_norm": 2.3750274181365967, + "learning_rate": 2.5764829674437103e-05, + "loss": 5.2495, + "step": 5085 + }, + { + "epoch": 0.4904532304725169, + "grad_norm": 2.8210630416870117, + "learning_rate": 2.5757259431804758e-05, + "loss": 5.1171, + "step": 5086 + }, + { + "epoch": 0.490549662487946, + "grad_norm": 3.95393705368042, + "learning_rate": 2.5749689119672066e-05, + "loss": 4.8027, + "step": 5087 + }, + { + "epoch": 0.4906460945033751, + "grad_norm": 2.4276602268218994, + "learning_rate": 2.5742118738733833e-05, + "loss": 4.8698, + "step": 5088 + }, + { + "epoch": 0.49074252651880423, + "grad_norm": 3.1342716217041016, + "learning_rate": 2.573454828968486e-05, + "loss": 4.9303, + "step": 5089 + }, + { + "epoch": 0.49083895853423337, + "grad_norm": 2.267380714416504, + "learning_rate": 2.5726977773219945e-05, + "loss": 5.4808, + "step": 5090 + }, + { + "epoch": 0.4909353905496625, + "grad_norm": 2.2766175270080566, + "learning_rate": 2.5719407190033907e-05, + "loss": 5.6385, + "step": 5091 + }, + { + "epoch": 0.4910318225650916, + "grad_norm": 1.9139512777328491, + "learning_rate": 2.571183654082156e-05, + "loss": 5.5393, + "step": 5092 + }, + { + "epoch": 0.49112825458052073, + "grad_norm": 2.1799356937408447, + "learning_rate": 2.5704265826277735e-05, + "loss": 5.4107, + "step": 5093 + }, + { + "epoch": 0.49122468659594987, + "grad_norm": 3.619046211242676, + "learning_rate": 2.569669504709726e-05, + "loss": 4.9276, + "step": 5094 + }, + { + "epoch": 0.491321118611379, + "grad_norm": 3.3711020946502686, + "learning_rate": 2.5689124203974963e-05, + "loss": 4.8514, + "step": 5095 + }, + { + "epoch": 0.4914175506268081, + "grad_norm": 4.527166366577148, + "learning_rate": 2.56815532976057e-05, + "loss": 5.4751, + "step": 5096 + }, + { + "epoch": 0.4915139826422372, + "grad_norm": 2.343977212905884, + "learning_rate": 2.567398232868432e-05, + "loss": 5.3542, + "step": 5097 + }, + { + "epoch": 0.49161041465766636, + "grad_norm": 2.579587697982788, + "learning_rate": 2.5666411297905664e-05, + "loss": 5.096, + "step": 5098 + }, + { + "epoch": 0.49170684667309544, + "grad_norm": 1.709149956703186, + "learning_rate": 2.56588402059646e-05, + "loss": 5.5938, + "step": 5099 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 2.0241758823394775, + "learning_rate": 2.5651269053555997e-05, + "loss": 5.5602, + "step": 5100 + }, + { + "epoch": 0.4918997107039537, + "grad_norm": 3.9968860149383545, + "learning_rate": 2.564369784137472e-05, + "loss": 5.1164, + "step": 5101 + }, + { + "epoch": 0.49199614271938286, + "grad_norm": 2.5559489727020264, + "learning_rate": 2.5636126570115642e-05, + "loss": 5.47, + "step": 5102 + }, + { + "epoch": 0.49209257473481194, + "grad_norm": 1.9222314357757568, + "learning_rate": 2.5628555240473657e-05, + "loss": 5.3667, + "step": 5103 + }, + { + "epoch": 0.4921890067502411, + "grad_norm": 2.447967052459717, + "learning_rate": 2.5620983853143644e-05, + "loss": 5.6542, + "step": 5104 + }, + { + "epoch": 0.4922854387656702, + "grad_norm": 3.6690666675567627, + "learning_rate": 2.5613412408820497e-05, + "loss": 5.4429, + "step": 5105 + }, + { + "epoch": 0.49238187078109935, + "grad_norm": 3.2514519691467285, + "learning_rate": 2.5605840908199108e-05, + "loss": 5.58, + "step": 5106 + }, + { + "epoch": 0.49247830279652843, + "grad_norm": 3.2158515453338623, + "learning_rate": 2.559826935197439e-05, + "loss": 4.9404, + "step": 5107 + }, + { + "epoch": 0.49257473481195757, + "grad_norm": 3.523861885070801, + "learning_rate": 2.5590697740841257e-05, + "loss": 5.184, + "step": 5108 + }, + { + "epoch": 0.4926711668273867, + "grad_norm": 4.248315811157227, + "learning_rate": 2.55831260754946e-05, + "loss": 5.3124, + "step": 5109 + }, + { + "epoch": 0.4927675988428158, + "grad_norm": 3.1209654808044434, + "learning_rate": 2.5575554356629355e-05, + "loss": 5.2412, + "step": 5110 + }, + { + "epoch": 0.49286403085824493, + "grad_norm": 4.383740425109863, + "learning_rate": 2.5567982584940435e-05, + "loss": 4.9767, + "step": 5111 + }, + { + "epoch": 0.49296046287367407, + "grad_norm": 4.9867353439331055, + "learning_rate": 2.5560410761122777e-05, + "loss": 5.3561, + "step": 5112 + }, + { + "epoch": 0.4930568948891032, + "grad_norm": 3.6709601879119873, + "learning_rate": 2.5552838885871306e-05, + "loss": 5.3153, + "step": 5113 + }, + { + "epoch": 0.4931533269045323, + "grad_norm": 3.755375385284424, + "learning_rate": 2.5545266959880966e-05, + "loss": 5.4803, + "step": 5114 + }, + { + "epoch": 0.4932497589199614, + "grad_norm": 2.552942991256714, + "learning_rate": 2.5537694983846693e-05, + "loss": 5.3791, + "step": 5115 + }, + { + "epoch": 0.49334619093539056, + "grad_norm": 2.546229839324951, + "learning_rate": 2.5530122958463437e-05, + "loss": 5.3155, + "step": 5116 + }, + { + "epoch": 0.4934426229508197, + "grad_norm": 3.1618587970733643, + "learning_rate": 2.552255088442615e-05, + "loss": 5.3159, + "step": 5117 + }, + { + "epoch": 0.4935390549662488, + "grad_norm": 3.1070001125335693, + "learning_rate": 2.551497876242978e-05, + "loss": 5.2431, + "step": 5118 + }, + { + "epoch": 0.4936354869816779, + "grad_norm": 3.116384267807007, + "learning_rate": 2.5507406593169305e-05, + "loss": 5.3787, + "step": 5119 + }, + { + "epoch": 0.49373191899710706, + "grad_norm": 2.8571414947509766, + "learning_rate": 2.5499834377339665e-05, + "loss": 5.2361, + "step": 5120 + }, + { + "epoch": 0.49382835101253614, + "grad_norm": 3.643923759460449, + "learning_rate": 2.5492262115635846e-05, + "loss": 5.047, + "step": 5121 + }, + { + "epoch": 0.4939247830279653, + "grad_norm": 3.2464256286621094, + "learning_rate": 2.548468980875282e-05, + "loss": 5.5178, + "step": 5122 + }, + { + "epoch": 0.4940212150433944, + "grad_norm": 2.2291908264160156, + "learning_rate": 2.547711745738555e-05, + "loss": 5.4691, + "step": 5123 + }, + { + "epoch": 0.49411764705882355, + "grad_norm": 3.0675063133239746, + "learning_rate": 2.5469545062229034e-05, + "loss": 5.2282, + "step": 5124 + }, + { + "epoch": 0.49421407907425263, + "grad_norm": 2.075589179992676, + "learning_rate": 2.5461972623978247e-05, + "loss": 5.5137, + "step": 5125 + }, + { + "epoch": 0.49431051108968177, + "grad_norm": 2.283911943435669, + "learning_rate": 2.5454400143328188e-05, + "loss": 5.4796, + "step": 5126 + }, + { + "epoch": 0.4944069431051109, + "grad_norm": 1.970496416091919, + "learning_rate": 2.544682762097384e-05, + "loss": 5.5517, + "step": 5127 + }, + { + "epoch": 0.49450337512054005, + "grad_norm": 1.9832693338394165, + "learning_rate": 2.5439255057610196e-05, + "loss": 5.5527, + "step": 5128 + }, + { + "epoch": 0.4945998071359691, + "grad_norm": 2.0675201416015625, + "learning_rate": 2.543168245393227e-05, + "loss": 5.3875, + "step": 5129 + }, + { + "epoch": 0.49469623915139826, + "grad_norm": 2.7182562351226807, + "learning_rate": 2.542410981063506e-05, + "loss": 5.3858, + "step": 5130 + }, + { + "epoch": 0.4947926711668274, + "grad_norm": 2.512756586074829, + "learning_rate": 2.541653712841357e-05, + "loss": 5.6373, + "step": 5131 + }, + { + "epoch": 0.4948891031822565, + "grad_norm": 2.4340786933898926, + "learning_rate": 2.5408964407962816e-05, + "loss": 5.3292, + "step": 5132 + }, + { + "epoch": 0.4949855351976856, + "grad_norm": 2.9011292457580566, + "learning_rate": 2.5401391649977812e-05, + "loss": 5.249, + "step": 5133 + }, + { + "epoch": 0.49508196721311476, + "grad_norm": 1.8074829578399658, + "learning_rate": 2.5393818855153577e-05, + "loss": 5.1271, + "step": 5134 + }, + { + "epoch": 0.4951783992285439, + "grad_norm": 1.9002094268798828, + "learning_rate": 2.538624602418513e-05, + "loss": 5.1914, + "step": 5135 + }, + { + "epoch": 0.495274831243973, + "grad_norm": 2.137040615081787, + "learning_rate": 2.53786731577675e-05, + "loss": 5.2251, + "step": 5136 + }, + { + "epoch": 0.4953712632594021, + "grad_norm": 2.5028741359710693, + "learning_rate": 2.537110025659572e-05, + "loss": 4.8603, + "step": 5137 + }, + { + "epoch": 0.49546769527483125, + "grad_norm": 1.871504783630371, + "learning_rate": 2.5363527321364804e-05, + "loss": 4.7466, + "step": 5138 + }, + { + "epoch": 0.4955641272902604, + "grad_norm": 1.7806798219680786, + "learning_rate": 2.5355954352769805e-05, + "loss": 4.8688, + "step": 5139 + }, + { + "epoch": 0.4956605593056895, + "grad_norm": 1.9846769571304321, + "learning_rate": 2.534838135150575e-05, + "loss": 5.089, + "step": 5140 + }, + { + "epoch": 0.4957569913211186, + "grad_norm": 1.8862518072128296, + "learning_rate": 2.5340808318267688e-05, + "loss": 5.5893, + "step": 5141 + }, + { + "epoch": 0.49585342333654775, + "grad_norm": 1.855156660079956, + "learning_rate": 2.5333235253750653e-05, + "loss": 5.227, + "step": 5142 + }, + { + "epoch": 0.49594985535197683, + "grad_norm": 1.5422018766403198, + "learning_rate": 2.5325662158649698e-05, + "loss": 5.3645, + "step": 5143 + }, + { + "epoch": 0.49604628736740597, + "grad_norm": 1.8303214311599731, + "learning_rate": 2.5318089033659874e-05, + "loss": 5.5242, + "step": 5144 + }, + { + "epoch": 0.4961427193828351, + "grad_norm": 1.875130534172058, + "learning_rate": 2.5310515879476232e-05, + "loss": 5.2, + "step": 5145 + }, + { + "epoch": 0.49623915139826424, + "grad_norm": 2.157423973083496, + "learning_rate": 2.530294269679382e-05, + "loss": 5.3742, + "step": 5146 + }, + { + "epoch": 0.4963355834136933, + "grad_norm": 1.9106656312942505, + "learning_rate": 2.5295369486307697e-05, + "loss": 5.2863, + "step": 5147 + }, + { + "epoch": 0.49643201542912246, + "grad_norm": 2.211770534515381, + "learning_rate": 2.5287796248712937e-05, + "loss": 5.2974, + "step": 5148 + }, + { + "epoch": 0.4965284474445516, + "grad_norm": 2.4630348682403564, + "learning_rate": 2.5280222984704576e-05, + "loss": 5.2871, + "step": 5149 + }, + { + "epoch": 0.49662487945998074, + "grad_norm": 2.7765839099884033, + "learning_rate": 2.5272649694977706e-05, + "loss": 5.4948, + "step": 5150 + }, + { + "epoch": 0.4967213114754098, + "grad_norm": 3.29093074798584, + "learning_rate": 2.5265076380227375e-05, + "loss": 5.461, + "step": 5151 + }, + { + "epoch": 0.49681774349083896, + "grad_norm": 3.8910324573516846, + "learning_rate": 2.5257503041148672e-05, + "loss": 5.7218, + "step": 5152 + }, + { + "epoch": 0.4969141755062681, + "grad_norm": 3.69039249420166, + "learning_rate": 2.5249929678436646e-05, + "loss": 5.7426, + "step": 5153 + }, + { + "epoch": 0.4970106075216972, + "grad_norm": 2.425807476043701, + "learning_rate": 2.5242356292786385e-05, + "loss": 5.847, + "step": 5154 + }, + { + "epoch": 0.4971070395371263, + "grad_norm": 2.770656108856201, + "learning_rate": 2.523478288489296e-05, + "loss": 5.8005, + "step": 5155 + }, + { + "epoch": 0.49720347155255545, + "grad_norm": 3.631476402282715, + "learning_rate": 2.522720945545145e-05, + "loss": 5.6795, + "step": 5156 + }, + { + "epoch": 0.4972999035679846, + "grad_norm": 3.401512622833252, + "learning_rate": 2.5219636005156937e-05, + "loss": 5.7642, + "step": 5157 + }, + { + "epoch": 0.4973963355834137, + "grad_norm": 2.047783613204956, + "learning_rate": 2.52120625347045e-05, + "loss": 5.7971, + "step": 5158 + }, + { + "epoch": 0.4974927675988428, + "grad_norm": 3.6047630310058594, + "learning_rate": 2.520448904478923e-05, + "loss": 5.7632, + "step": 5159 + }, + { + "epoch": 0.49758919961427195, + "grad_norm": 4.08705472946167, + "learning_rate": 2.5196915536106207e-05, + "loss": 5.7443, + "step": 5160 + }, + { + "epoch": 0.4976856316297011, + "grad_norm": 3.7159616947174072, + "learning_rate": 2.5189342009350513e-05, + "loss": 5.7357, + "step": 5161 + }, + { + "epoch": 0.49778206364513017, + "grad_norm": 2.455105781555176, + "learning_rate": 2.5181768465217246e-05, + "loss": 5.7332, + "step": 5162 + }, + { + "epoch": 0.4978784956605593, + "grad_norm": 2.626923084259033, + "learning_rate": 2.5174194904401493e-05, + "loss": 5.7279, + "step": 5163 + }, + { + "epoch": 0.49797492767598844, + "grad_norm": 3.0944175720214844, + "learning_rate": 2.516662132759835e-05, + "loss": 5.83, + "step": 5164 + }, + { + "epoch": 0.4980713596914175, + "grad_norm": 3.673879623413086, + "learning_rate": 2.51590477355029e-05, + "loss": 5.7253, + "step": 5165 + }, + { + "epoch": 0.49816779170684666, + "grad_norm": 2.187037706375122, + "learning_rate": 2.515147412881025e-05, + "loss": 5.7906, + "step": 5166 + }, + { + "epoch": 0.4982642237222758, + "grad_norm": 2.646852970123291, + "learning_rate": 2.5143900508215484e-05, + "loss": 5.6718, + "step": 5167 + }, + { + "epoch": 0.49836065573770494, + "grad_norm": 2.866809606552124, + "learning_rate": 2.5136326874413714e-05, + "loss": 5.5868, + "step": 5168 + }, + { + "epoch": 0.498457087753134, + "grad_norm": 2.5374057292938232, + "learning_rate": 2.512875322810002e-05, + "loss": 5.4741, + "step": 5169 + }, + { + "epoch": 0.49855351976856316, + "grad_norm": 2.3675966262817383, + "learning_rate": 2.5121179569969526e-05, + "loss": 5.7075, + "step": 5170 + }, + { + "epoch": 0.4986499517839923, + "grad_norm": 1.8658565282821655, + "learning_rate": 2.5113605900717313e-05, + "loss": 5.6721, + "step": 5171 + }, + { + "epoch": 0.49874638379942143, + "grad_norm": 2.8442587852478027, + "learning_rate": 2.5106032221038493e-05, + "loss": 5.7495, + "step": 5172 + }, + { + "epoch": 0.4988428158148505, + "grad_norm": 2.8844056129455566, + "learning_rate": 2.5098458531628166e-05, + "loss": 5.8805, + "step": 5173 + }, + { + "epoch": 0.49893924783027965, + "grad_norm": 1.7330185174942017, + "learning_rate": 2.5090884833181438e-05, + "loss": 5.6993, + "step": 5174 + }, + { + "epoch": 0.4990356798457088, + "grad_norm": 1.8529980182647705, + "learning_rate": 2.508331112639341e-05, + "loss": 5.7184, + "step": 5175 + }, + { + "epoch": 0.4991321118611379, + "grad_norm": 1.737585425376892, + "learning_rate": 2.507573741195919e-05, + "loss": 5.6485, + "step": 5176 + }, + { + "epoch": 0.499228543876567, + "grad_norm": 1.659497857093811, + "learning_rate": 2.5068163690573877e-05, + "loss": 5.6022, + "step": 5177 + }, + { + "epoch": 0.49932497589199615, + "grad_norm": 1.8748817443847656, + "learning_rate": 2.506058996293259e-05, + "loss": 5.5794, + "step": 5178 + }, + { + "epoch": 0.4994214079074253, + "grad_norm": 1.6995104551315308, + "learning_rate": 2.5053016229730425e-05, + "loss": 5.6147, + "step": 5179 + }, + { + "epoch": 0.49951783992285437, + "grad_norm": 1.83549165725708, + "learning_rate": 2.50454424916625e-05, + "loss": 5.6449, + "step": 5180 + }, + { + "epoch": 0.4996142719382835, + "grad_norm": 1.883816123008728, + "learning_rate": 2.5037868749423925e-05, + "loss": 5.7664, + "step": 5181 + }, + { + "epoch": 0.49971070395371264, + "grad_norm": 1.8513368368148804, + "learning_rate": 2.5030295003709797e-05, + "loss": 5.7409, + "step": 5182 + }, + { + "epoch": 0.4998071359691418, + "grad_norm": 1.5849288702011108, + "learning_rate": 2.5022721255215233e-05, + "loss": 5.6862, + "step": 5183 + }, + { + "epoch": 0.49990356798457086, + "grad_norm": 1.8345141410827637, + "learning_rate": 2.5015147504635337e-05, + "loss": 5.7198, + "step": 5184 + }, + { + "epoch": 0.5, + "grad_norm": 1.666548252105713, + "learning_rate": 2.5007573752665224e-05, + "loss": 5.7471, + "step": 5185 + }, + { + "epoch": 0.5000964320154291, + "grad_norm": 1.620879054069519, + "learning_rate": 2.5e-05, + "loss": 5.749, + "step": 5186 + }, + { + "epoch": 0.5001928640308583, + "grad_norm": 1.736627459526062, + "learning_rate": 2.4992426247334782e-05, + "loss": 5.7478, + "step": 5187 + }, + { + "epoch": 0.5002892960462874, + "grad_norm": 1.964329481124878, + "learning_rate": 2.4984852495364665e-05, + "loss": 5.6232, + "step": 5188 + }, + { + "epoch": 0.5003857280617164, + "grad_norm": 1.5507832765579224, + "learning_rate": 2.4977278744784773e-05, + "loss": 5.7037, + "step": 5189 + }, + { + "epoch": 0.5004821600771456, + "grad_norm": 1.631888508796692, + "learning_rate": 2.4969704996290212e-05, + "loss": 5.613, + "step": 5190 + }, + { + "epoch": 0.5005785920925747, + "grad_norm": 1.6150871515274048, + "learning_rate": 2.496213125057608e-05, + "loss": 5.6816, + "step": 5191 + }, + { + "epoch": 0.5006750241080039, + "grad_norm": 1.4316186904907227, + "learning_rate": 2.4954557508337505e-05, + "loss": 5.7667, + "step": 5192 + }, + { + "epoch": 0.500771456123433, + "grad_norm": 2.2458698749542236, + "learning_rate": 2.494698377026957e-05, + "loss": 5.7127, + "step": 5193 + }, + { + "epoch": 0.5008678881388621, + "grad_norm": 2.6452152729034424, + "learning_rate": 2.4939410037067417e-05, + "loss": 5.915, + "step": 5194 + }, + { + "epoch": 0.5009643201542913, + "grad_norm": 1.6669979095458984, + "learning_rate": 2.493183630942613e-05, + "loss": 5.7455, + "step": 5195 + }, + { + "epoch": 0.5010607521697203, + "grad_norm": 2.351254463195801, + "learning_rate": 2.492426258804082e-05, + "loss": 5.6473, + "step": 5196 + }, + { + "epoch": 0.5011571841851494, + "grad_norm": 2.0815322399139404, + "learning_rate": 2.49166888736066e-05, + "loss": 5.7973, + "step": 5197 + }, + { + "epoch": 0.5012536162005786, + "grad_norm": 2.1242613792419434, + "learning_rate": 2.490911516681857e-05, + "loss": 5.6234, + "step": 5198 + }, + { + "epoch": 0.5013500482160077, + "grad_norm": 2.114800214767456, + "learning_rate": 2.4901541468371843e-05, + "loss": 5.6337, + "step": 5199 + }, + { + "epoch": 0.5014464802314368, + "grad_norm": 2.024430513381958, + "learning_rate": 2.4893967778961513e-05, + "loss": 5.7823, + "step": 5200 + }, + { + "epoch": 0.501542912246866, + "grad_norm": 1.9336163997650146, + "learning_rate": 2.4886394099282697e-05, + "loss": 5.5679, + "step": 5201 + }, + { + "epoch": 0.5016393442622951, + "grad_norm": 1.4883170127868652, + "learning_rate": 2.4878820430030477e-05, + "loss": 5.813, + "step": 5202 + }, + { + "epoch": 0.5017357762777243, + "grad_norm": 1.448095679283142, + "learning_rate": 2.4871246771899982e-05, + "loss": 5.8301, + "step": 5203 + }, + { + "epoch": 0.5018322082931533, + "grad_norm": 1.3803555965423584, + "learning_rate": 2.486367312558629e-05, + "loss": 5.7946, + "step": 5204 + }, + { + "epoch": 0.5019286403085824, + "grad_norm": 1.6259945631027222, + "learning_rate": 2.4856099491784522e-05, + "loss": 5.6794, + "step": 5205 + }, + { + "epoch": 0.5020250723240116, + "grad_norm": 1.6900088787078857, + "learning_rate": 2.4848525871189764e-05, + "loss": 5.7498, + "step": 5206 + }, + { + "epoch": 0.5021215043394407, + "grad_norm": 1.835081934928894, + "learning_rate": 2.4840952264497104e-05, + "loss": 5.6875, + "step": 5207 + }, + { + "epoch": 0.5022179363548698, + "grad_norm": 1.772379994392395, + "learning_rate": 2.4833378672401657e-05, + "loss": 5.7669, + "step": 5208 + }, + { + "epoch": 0.502314368370299, + "grad_norm": 1.9510709047317505, + "learning_rate": 2.4825805095598516e-05, + "loss": 5.6624, + "step": 5209 + }, + { + "epoch": 0.502410800385728, + "grad_norm": 1.6865506172180176, + "learning_rate": 2.4818231534782756e-05, + "loss": 5.7448, + "step": 5210 + }, + { + "epoch": 0.5025072324011571, + "grad_norm": 1.9983327388763428, + "learning_rate": 2.4810657990649493e-05, + "loss": 5.9126, + "step": 5211 + }, + { + "epoch": 0.5026036644165863, + "grad_norm": 2.6227781772613525, + "learning_rate": 2.4803084463893805e-05, + "loss": 5.5083, + "step": 5212 + }, + { + "epoch": 0.5027000964320154, + "grad_norm": 1.7294892072677612, + "learning_rate": 2.479551095521077e-05, + "loss": 5.7177, + "step": 5213 + }, + { + "epoch": 0.5027965284474446, + "grad_norm": 1.8641434907913208, + "learning_rate": 2.4787937465295505e-05, + "loss": 5.9185, + "step": 5214 + }, + { + "epoch": 0.5028929604628737, + "grad_norm": 1.746175765991211, + "learning_rate": 2.4780363994843066e-05, + "loss": 5.6714, + "step": 5215 + }, + { + "epoch": 0.5029893924783028, + "grad_norm": 1.9156564474105835, + "learning_rate": 2.4772790544548554e-05, + "loss": 5.6497, + "step": 5216 + }, + { + "epoch": 0.503085824493732, + "grad_norm": 2.2638978958129883, + "learning_rate": 2.476521711510705e-05, + "loss": 5.8061, + "step": 5217 + }, + { + "epoch": 0.503182256509161, + "grad_norm": 2.1320266723632812, + "learning_rate": 2.475764370721362e-05, + "loss": 5.6675, + "step": 5218 + }, + { + "epoch": 0.5032786885245901, + "grad_norm": 1.7672971487045288, + "learning_rate": 2.475007032156336e-05, + "loss": 5.6075, + "step": 5219 + }, + { + "epoch": 0.5033751205400193, + "grad_norm": 1.4613406658172607, + "learning_rate": 2.474249695885134e-05, + "loss": 5.5532, + "step": 5220 + }, + { + "epoch": 0.5034715525554484, + "grad_norm": 1.500718593597412, + "learning_rate": 2.4734923619772624e-05, + "loss": 5.8208, + "step": 5221 + }, + { + "epoch": 0.5035679845708775, + "grad_norm": 1.686531901359558, + "learning_rate": 2.47273503050223e-05, + "loss": 5.7105, + "step": 5222 + }, + { + "epoch": 0.5036644165863067, + "grad_norm": 1.7172389030456543, + "learning_rate": 2.471977701529543e-05, + "loss": 5.3947, + "step": 5223 + }, + { + "epoch": 0.5037608486017358, + "grad_norm": 1.8703171014785767, + "learning_rate": 2.4712203751287072e-05, + "loss": 5.803, + "step": 5224 + }, + { + "epoch": 0.503857280617165, + "grad_norm": 2.543147325515747, + "learning_rate": 2.470463051369231e-05, + "loss": 5.8417, + "step": 5225 + }, + { + "epoch": 0.503953712632594, + "grad_norm": 2.3778247833251953, + "learning_rate": 2.4697057303206184e-05, + "loss": 5.4269, + "step": 5226 + }, + { + "epoch": 0.5040501446480231, + "grad_norm": 1.4471454620361328, + "learning_rate": 2.4689484120523777e-05, + "loss": 5.6699, + "step": 5227 + }, + { + "epoch": 0.5041465766634523, + "grad_norm": 1.9415918588638306, + "learning_rate": 2.4681910966340132e-05, + "loss": 5.818, + "step": 5228 + }, + { + "epoch": 0.5042430086788814, + "grad_norm": 2.9741828441619873, + "learning_rate": 2.46743378413503e-05, + "loss": 5.6104, + "step": 5229 + }, + { + "epoch": 0.5043394406943105, + "grad_norm": 2.163205146789551, + "learning_rate": 2.4666764746249353e-05, + "loss": 5.741, + "step": 5230 + }, + { + "epoch": 0.5044358727097397, + "grad_norm": 1.758650541305542, + "learning_rate": 2.465919168173232e-05, + "loss": 5.7047, + "step": 5231 + }, + { + "epoch": 0.5045323047251687, + "grad_norm": 1.9823439121246338, + "learning_rate": 2.465161864849425e-05, + "loss": 5.6382, + "step": 5232 + }, + { + "epoch": 0.5046287367405978, + "grad_norm": 2.0010178089141846, + "learning_rate": 2.46440456472302e-05, + "loss": 5.5104, + "step": 5233 + }, + { + "epoch": 0.504725168756027, + "grad_norm": 2.109714984893799, + "learning_rate": 2.4636472678635205e-05, + "loss": 5.7138, + "step": 5234 + }, + { + "epoch": 0.5048216007714561, + "grad_norm": 2.5939688682556152, + "learning_rate": 2.4628899743404288e-05, + "loss": 5.7155, + "step": 5235 + }, + { + "epoch": 0.5049180327868853, + "grad_norm": 1.8712037801742554, + "learning_rate": 2.4621326842232505e-05, + "loss": 5.79, + "step": 5236 + }, + { + "epoch": 0.5050144648023144, + "grad_norm": 1.6143527030944824, + "learning_rate": 2.461375397581487e-05, + "loss": 5.6986, + "step": 5237 + }, + { + "epoch": 0.5051108968177435, + "grad_norm": 2.498037338256836, + "learning_rate": 2.460618114484643e-05, + "loss": 5.5183, + "step": 5238 + }, + { + "epoch": 0.5052073288331727, + "grad_norm": 1.9038087129592896, + "learning_rate": 2.4598608350022193e-05, + "loss": 5.6687, + "step": 5239 + }, + { + "epoch": 0.5053037608486017, + "grad_norm": 1.709870457649231, + "learning_rate": 2.4591035592037186e-05, + "loss": 5.6278, + "step": 5240 + }, + { + "epoch": 0.5054001928640308, + "grad_norm": 2.594538450241089, + "learning_rate": 2.4583462871586436e-05, + "loss": 5.6173, + "step": 5241 + }, + { + "epoch": 0.50549662487946, + "grad_norm": 2.0516130924224854, + "learning_rate": 2.457589018936495e-05, + "loss": 5.6689, + "step": 5242 + }, + { + "epoch": 0.5055930568948891, + "grad_norm": 1.5736666917800903, + "learning_rate": 2.456831754606773e-05, + "loss": 5.6559, + "step": 5243 + }, + { + "epoch": 0.5056894889103182, + "grad_norm": 2.0387632846832275, + "learning_rate": 2.4560744942389806e-05, + "loss": 5.6714, + "step": 5244 + }, + { + "epoch": 0.5057859209257474, + "grad_norm": 1.7969242334365845, + "learning_rate": 2.4553172379026174e-05, + "loss": 5.7861, + "step": 5245 + }, + { + "epoch": 0.5058823529411764, + "grad_norm": 1.8419815301895142, + "learning_rate": 2.4545599856671818e-05, + "loss": 5.4761, + "step": 5246 + }, + { + "epoch": 0.5059787849566056, + "grad_norm": 1.842109203338623, + "learning_rate": 2.453802737602176e-05, + "loss": 5.7844, + "step": 5247 + }, + { + "epoch": 0.5060752169720347, + "grad_norm": 1.8354718685150146, + "learning_rate": 2.4530454937770965e-05, + "loss": 5.7038, + "step": 5248 + }, + { + "epoch": 0.5061716489874638, + "grad_norm": 2.096113920211792, + "learning_rate": 2.4522882542614452e-05, + "loss": 5.7302, + "step": 5249 + }, + { + "epoch": 0.506268081002893, + "grad_norm": 1.6534168720245361, + "learning_rate": 2.451531019124719e-05, + "loss": 5.6988, + "step": 5250 + }, + { + "epoch": 0.5063645130183221, + "grad_norm": 1.867413878440857, + "learning_rate": 2.4507737884364153e-05, + "loss": 5.5552, + "step": 5251 + }, + { + "epoch": 0.5064609450337512, + "grad_norm": 2.2041261196136475, + "learning_rate": 2.450016562266034e-05, + "loss": 5.575, + "step": 5252 + }, + { + "epoch": 0.5065573770491804, + "grad_norm": 1.5057449340820312, + "learning_rate": 2.4492593406830708e-05, + "loss": 5.6281, + "step": 5253 + }, + { + "epoch": 0.5066538090646094, + "grad_norm": 1.6689391136169434, + "learning_rate": 2.448502123757022e-05, + "loss": 5.6996, + "step": 5254 + }, + { + "epoch": 0.5067502410800385, + "grad_norm": 1.9807616472244263, + "learning_rate": 2.447744911557386e-05, + "loss": 5.7806, + "step": 5255 + }, + { + "epoch": 0.5068466730954677, + "grad_norm": 1.377579927444458, + "learning_rate": 2.4469877041536573e-05, + "loss": 5.69, + "step": 5256 + }, + { + "epoch": 0.5069431051108968, + "grad_norm": 1.5254768133163452, + "learning_rate": 2.4462305016153313e-05, + "loss": 5.6404, + "step": 5257 + }, + { + "epoch": 0.507039537126326, + "grad_norm": 1.591636061668396, + "learning_rate": 2.4454733040119043e-05, + "loss": 5.655, + "step": 5258 + }, + { + "epoch": 0.5071359691417551, + "grad_norm": 1.3659257888793945, + "learning_rate": 2.4447161114128693e-05, + "loss": 5.6995, + "step": 5259 + }, + { + "epoch": 0.5072324011571842, + "grad_norm": 1.7247264385223389, + "learning_rate": 2.443958923887723e-05, + "loss": 5.7655, + "step": 5260 + }, + { + "epoch": 0.5073288331726133, + "grad_norm": 1.887107491493225, + "learning_rate": 2.443201741505957e-05, + "loss": 5.594, + "step": 5261 + }, + { + "epoch": 0.5074252651880424, + "grad_norm": 1.1960406303405762, + "learning_rate": 2.442444564337065e-05, + "loss": 5.6933, + "step": 5262 + }, + { + "epoch": 0.5075216972034715, + "grad_norm": 1.7481998205184937, + "learning_rate": 2.4416873924505402e-05, + "loss": 5.6638, + "step": 5263 + }, + { + "epoch": 0.5076181292189007, + "grad_norm": 2.491344928741455, + "learning_rate": 2.4409302259158752e-05, + "loss": 5.7019, + "step": 5264 + }, + { + "epoch": 0.5077145612343298, + "grad_norm": 1.7121832370758057, + "learning_rate": 2.4401730648025605e-05, + "loss": 5.6723, + "step": 5265 + }, + { + "epoch": 0.5078109932497589, + "grad_norm": 1.2917476892471313, + "learning_rate": 2.4394159091800894e-05, + "loss": 5.7029, + "step": 5266 + }, + { + "epoch": 0.5079074252651881, + "grad_norm": 1.7774938344955444, + "learning_rate": 2.4386587591179516e-05, + "loss": 5.6274, + "step": 5267 + }, + { + "epoch": 0.5080038572806171, + "grad_norm": 1.941102147102356, + "learning_rate": 2.4379016146856362e-05, + "loss": 5.6897, + "step": 5268 + }, + { + "epoch": 0.5081002892960463, + "grad_norm": 1.8533909320831299, + "learning_rate": 2.4371444759526352e-05, + "loss": 5.5813, + "step": 5269 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 1.3812119960784912, + "learning_rate": 2.4363873429884357e-05, + "loss": 5.5295, + "step": 5270 + }, + { + "epoch": 0.5082931533269045, + "grad_norm": 1.398140788078308, + "learning_rate": 2.4356302158625288e-05, + "loss": 5.5808, + "step": 5271 + }, + { + "epoch": 0.5083895853423337, + "grad_norm": 1.497143268585205, + "learning_rate": 2.4348730946444015e-05, + "loss": 5.5484, + "step": 5272 + }, + { + "epoch": 0.5084860173577628, + "grad_norm": 1.3552637100219727, + "learning_rate": 2.43411597940354e-05, + "loss": 5.5792, + "step": 5273 + }, + { + "epoch": 0.5085824493731919, + "grad_norm": 1.2054166793823242, + "learning_rate": 2.4333588702094342e-05, + "loss": 5.5741, + "step": 5274 + }, + { + "epoch": 0.508678881388621, + "grad_norm": 1.704368233680725, + "learning_rate": 2.432601767131569e-05, + "loss": 5.4976, + "step": 5275 + }, + { + "epoch": 0.5087753134040501, + "grad_norm": 2.4233322143554688, + "learning_rate": 2.4318446702394302e-05, + "loss": 5.654, + "step": 5276 + }, + { + "epoch": 0.5088717454194792, + "grad_norm": 2.22151780128479, + "learning_rate": 2.4310875796025043e-05, + "loss": 5.6997, + "step": 5277 + }, + { + "epoch": 0.5089681774349084, + "grad_norm": 2.244811534881592, + "learning_rate": 2.430330495290275e-05, + "loss": 5.616, + "step": 5278 + }, + { + "epoch": 0.5090646094503375, + "grad_norm": 2.494230270385742, + "learning_rate": 2.429573417372227e-05, + "loss": 5.4798, + "step": 5279 + }, + { + "epoch": 0.5091610414657667, + "grad_norm": 2.629716634750366, + "learning_rate": 2.428816345917845e-05, + "loss": 5.4708, + "step": 5280 + }, + { + "epoch": 0.5092574734811958, + "grad_norm": 2.5840697288513184, + "learning_rate": 2.4280592809966095e-05, + "loss": 5.67, + "step": 5281 + }, + { + "epoch": 0.5093539054966248, + "grad_norm": 2.3941776752471924, + "learning_rate": 2.427302222678006e-05, + "loss": 5.6556, + "step": 5282 + }, + { + "epoch": 0.509450337512054, + "grad_norm": 2.359431505203247, + "learning_rate": 2.4265451710315144e-05, + "loss": 5.7214, + "step": 5283 + }, + { + "epoch": 0.5095467695274831, + "grad_norm": 2.709620237350464, + "learning_rate": 2.425788126126617e-05, + "loss": 5.743, + "step": 5284 + }, + { + "epoch": 0.5096432015429122, + "grad_norm": 2.887110471725464, + "learning_rate": 2.4250310880327937e-05, + "loss": 5.679, + "step": 5285 + }, + { + "epoch": 0.5097396335583414, + "grad_norm": 1.7220900058746338, + "learning_rate": 2.4242740568195254e-05, + "loss": 5.6821, + "step": 5286 + }, + { + "epoch": 0.5098360655737705, + "grad_norm": 1.6844737529754639, + "learning_rate": 2.42351703255629e-05, + "loss": 5.6272, + "step": 5287 + }, + { + "epoch": 0.5099324975891996, + "grad_norm": 2.293545961380005, + "learning_rate": 2.4227600153125683e-05, + "loss": 5.6725, + "step": 5288 + }, + { + "epoch": 0.5100289296046288, + "grad_norm": 2.5830740928649902, + "learning_rate": 2.4220030051578374e-05, + "loss": 5.7122, + "step": 5289 + }, + { + "epoch": 0.5101253616200578, + "grad_norm": 1.8912538290023804, + "learning_rate": 2.4212460021615735e-05, + "loss": 5.8063, + "step": 5290 + }, + { + "epoch": 0.510221793635487, + "grad_norm": 3.4643304347991943, + "learning_rate": 2.4204890063932555e-05, + "loss": 5.7804, + "step": 5291 + }, + { + "epoch": 0.5103182256509161, + "grad_norm": 2.812925100326538, + "learning_rate": 2.4197320179223577e-05, + "loss": 5.559, + "step": 5292 + }, + { + "epoch": 0.5104146576663452, + "grad_norm": 2.169546365737915, + "learning_rate": 2.4189750368183575e-05, + "loss": 5.6885, + "step": 5293 + }, + { + "epoch": 0.5105110896817744, + "grad_norm": 2.9512555599212646, + "learning_rate": 2.418218063150729e-05, + "loss": 5.6227, + "step": 5294 + }, + { + "epoch": 0.5106075216972035, + "grad_norm": 3.2074432373046875, + "learning_rate": 2.4174610969889446e-05, + "loss": 5.6617, + "step": 5295 + }, + { + "epoch": 0.5107039537126326, + "grad_norm": 3.18810772895813, + "learning_rate": 2.4167041384024802e-05, + "loss": 5.6974, + "step": 5296 + }, + { + "epoch": 0.5108003857280617, + "grad_norm": 1.6338391304016113, + "learning_rate": 2.4159471874608076e-05, + "loss": 5.372, + "step": 5297 + }, + { + "epoch": 0.5108968177434908, + "grad_norm": 3.1057932376861572, + "learning_rate": 2.4151902442333975e-05, + "loss": 5.7943, + "step": 5298 + }, + { + "epoch": 0.5109932497589199, + "grad_norm": 3.5405404567718506, + "learning_rate": 2.4144333087897238e-05, + "loss": 5.5459, + "step": 5299 + }, + { + "epoch": 0.5110896817743491, + "grad_norm": 1.9474506378173828, + "learning_rate": 2.413676381199255e-05, + "loss": 5.65, + "step": 5300 + }, + { + "epoch": 0.5111861137897782, + "grad_norm": 1.8647223711013794, + "learning_rate": 2.4129194615314616e-05, + "loss": 5.6578, + "step": 5301 + }, + { + "epoch": 0.5112825458052074, + "grad_norm": 2.8322525024414062, + "learning_rate": 2.412162549855813e-05, + "loss": 5.5751, + "step": 5302 + }, + { + "epoch": 0.5113789778206365, + "grad_norm": 2.899765968322754, + "learning_rate": 2.411405646241777e-05, + "loss": 5.6403, + "step": 5303 + }, + { + "epoch": 0.5114754098360655, + "grad_norm": 1.565062403678894, + "learning_rate": 2.4106487507588225e-05, + "loss": 5.6638, + "step": 5304 + }, + { + "epoch": 0.5115718418514947, + "grad_norm": 1.9728660583496094, + "learning_rate": 2.4098918634764153e-05, + "loss": 5.5637, + "step": 5305 + }, + { + "epoch": 0.5116682738669238, + "grad_norm": 1.7344293594360352, + "learning_rate": 2.409134984464022e-05, + "loss": 5.4387, + "step": 5306 + }, + { + "epoch": 0.5117647058823529, + "grad_norm": 1.713099479675293, + "learning_rate": 2.4083781137911078e-05, + "loss": 5.6868, + "step": 5307 + }, + { + "epoch": 0.5118611378977821, + "grad_norm": 1.9526218175888062, + "learning_rate": 2.4076212515271383e-05, + "loss": 5.6768, + "step": 5308 + }, + { + "epoch": 0.5119575699132112, + "grad_norm": 1.5878288745880127, + "learning_rate": 2.4068643977415756e-05, + "loss": 5.7108, + "step": 5309 + }, + { + "epoch": 0.5120540019286403, + "grad_norm": 1.5576121807098389, + "learning_rate": 2.4061075525038845e-05, + "loss": 5.6751, + "step": 5310 + }, + { + "epoch": 0.5121504339440694, + "grad_norm": 1.4956986904144287, + "learning_rate": 2.405350715883527e-05, + "loss": 5.3678, + "step": 5311 + }, + { + "epoch": 0.5122468659594985, + "grad_norm": 1.8983439207077026, + "learning_rate": 2.4045938879499632e-05, + "loss": 5.801, + "step": 5312 + }, + { + "epoch": 0.5123432979749277, + "grad_norm": 1.5474251508712769, + "learning_rate": 2.403837068772656e-05, + "loss": 5.6595, + "step": 5313 + }, + { + "epoch": 0.5124397299903568, + "grad_norm": 1.524996042251587, + "learning_rate": 2.4030802584210634e-05, + "loss": 5.5906, + "step": 5314 + }, + { + "epoch": 0.5125361620057859, + "grad_norm": 1.7559535503387451, + "learning_rate": 2.402323456964646e-05, + "loss": 5.5179, + "step": 5315 + }, + { + "epoch": 0.5126325940212151, + "grad_norm": 1.743377923965454, + "learning_rate": 2.401566664472862e-05, + "loss": 5.677, + "step": 5316 + }, + { + "epoch": 0.5127290260366442, + "grad_norm": 1.6195489168167114, + "learning_rate": 2.4008098810151665e-05, + "loss": 5.61, + "step": 5317 + }, + { + "epoch": 0.5128254580520732, + "grad_norm": 1.642260193824768, + "learning_rate": 2.40005310666102e-05, + "loss": 5.5877, + "step": 5318 + }, + { + "epoch": 0.5129218900675024, + "grad_norm": 1.6922597885131836, + "learning_rate": 2.3992963414798757e-05, + "loss": 5.5654, + "step": 5319 + }, + { + "epoch": 0.5130183220829315, + "grad_norm": 1.8033465147018433, + "learning_rate": 2.3985395855411887e-05, + "loss": 5.6685, + "step": 5320 + }, + { + "epoch": 0.5131147540983606, + "grad_norm": 1.6622116565704346, + "learning_rate": 2.3977828389144144e-05, + "loss": 5.5582, + "step": 5321 + }, + { + "epoch": 0.5132111861137898, + "grad_norm": 1.6833804845809937, + "learning_rate": 2.397026101669005e-05, + "loss": 5.5249, + "step": 5322 + }, + { + "epoch": 0.5133076181292189, + "grad_norm": 2.3577287197113037, + "learning_rate": 2.3962693738744133e-05, + "loss": 5.6416, + "step": 5323 + }, + { + "epoch": 0.5134040501446481, + "grad_norm": 2.2973968982696533, + "learning_rate": 2.3955126556000905e-05, + "loss": 5.5357, + "step": 5324 + }, + { + "epoch": 0.5135004821600772, + "grad_norm": 1.4794336557388306, + "learning_rate": 2.3947559469154875e-05, + "loss": 5.6754, + "step": 5325 + }, + { + "epoch": 0.5135969141755062, + "grad_norm": 1.8573638200759888, + "learning_rate": 2.3939992478900545e-05, + "loss": 5.6224, + "step": 5326 + }, + { + "epoch": 0.5136933461909354, + "grad_norm": 1.922156572341919, + "learning_rate": 2.3932425585932405e-05, + "loss": 5.5077, + "step": 5327 + }, + { + "epoch": 0.5137897782063645, + "grad_norm": 1.5740593671798706, + "learning_rate": 2.392485879094492e-05, + "loss": 5.7489, + "step": 5328 + }, + { + "epoch": 0.5138862102217936, + "grad_norm": 2.2837178707122803, + "learning_rate": 2.391729209463258e-05, + "loss": 5.5946, + "step": 5329 + }, + { + "epoch": 0.5139826422372228, + "grad_norm": 1.413499116897583, + "learning_rate": 2.3909725497689845e-05, + "loss": 5.5566, + "step": 5330 + }, + { + "epoch": 0.5140790742526519, + "grad_norm": 1.8905106782913208, + "learning_rate": 2.3902159000811147e-05, + "loss": 5.5847, + "step": 5331 + }, + { + "epoch": 0.514175506268081, + "grad_norm": 1.7822209596633911, + "learning_rate": 2.3894592604690954e-05, + "loss": 5.6279, + "step": 5332 + }, + { + "epoch": 0.5142719382835101, + "grad_norm": 2.0505220890045166, + "learning_rate": 2.3887026310023692e-05, + "loss": 5.6569, + "step": 5333 + }, + { + "epoch": 0.5143683702989392, + "grad_norm": 1.960182785987854, + "learning_rate": 2.3879460117503776e-05, + "loss": 5.681, + "step": 5334 + }, + { + "epoch": 0.5144648023143684, + "grad_norm": 2.0885207653045654, + "learning_rate": 2.3871894027825643e-05, + "loss": 5.5652, + "step": 5335 + }, + { + "epoch": 0.5145612343297975, + "grad_norm": 1.6140358448028564, + "learning_rate": 2.3864328041683675e-05, + "loss": 5.6412, + "step": 5336 + }, + { + "epoch": 0.5146576663452266, + "grad_norm": 2.439171552658081, + "learning_rate": 2.3856762159772295e-05, + "loss": 5.6905, + "step": 5337 + }, + { + "epoch": 0.5147540983606558, + "grad_norm": 1.5771392583847046, + "learning_rate": 2.3849196382785876e-05, + "loss": 5.5931, + "step": 5338 + }, + { + "epoch": 0.5148505303760849, + "grad_norm": 1.84849214553833, + "learning_rate": 2.3841630711418782e-05, + "loss": 5.5962, + "step": 5339 + }, + { + "epoch": 0.5149469623915139, + "grad_norm": 1.9471129179000854, + "learning_rate": 2.383406514636541e-05, + "loss": 5.6634, + "step": 5340 + }, + { + "epoch": 0.5150433944069431, + "grad_norm": 1.7407389879226685, + "learning_rate": 2.38264996883201e-05, + "loss": 5.6153, + "step": 5341 + }, + { + "epoch": 0.5151398264223722, + "grad_norm": 1.8722646236419678, + "learning_rate": 2.38189343379772e-05, + "loss": 5.6668, + "step": 5342 + }, + { + "epoch": 0.5152362584378013, + "grad_norm": 1.7386053800582886, + "learning_rate": 2.381136909603106e-05, + "loss": 5.5812, + "step": 5343 + }, + { + "epoch": 0.5153326904532305, + "grad_norm": 2.2566187381744385, + "learning_rate": 2.3803803963175996e-05, + "loss": 5.4777, + "step": 5344 + }, + { + "epoch": 0.5154291224686596, + "grad_norm": 2.9117252826690674, + "learning_rate": 2.3796238940106332e-05, + "loss": 5.6427, + "step": 5345 + }, + { + "epoch": 0.5155255544840888, + "grad_norm": 2.6547677516937256, + "learning_rate": 2.3788674027516374e-05, + "loss": 5.6685, + "step": 5346 + }, + { + "epoch": 0.5156219864995178, + "grad_norm": 1.5370997190475464, + "learning_rate": 2.378110922610043e-05, + "loss": 5.6294, + "step": 5347 + }, + { + "epoch": 0.5157184185149469, + "grad_norm": 2.2848665714263916, + "learning_rate": 2.3773544536552772e-05, + "loss": 5.4781, + "step": 5348 + }, + { + "epoch": 0.5158148505303761, + "grad_norm": 2.891604423522949, + "learning_rate": 2.3765979959567697e-05, + "loss": 5.7972, + "step": 5349 + }, + { + "epoch": 0.5159112825458052, + "grad_norm": 2.5229010581970215, + "learning_rate": 2.3758415495839454e-05, + "loss": 5.5893, + "step": 5350 + }, + { + "epoch": 0.5160077145612343, + "grad_norm": 1.3398998975753784, + "learning_rate": 2.3750851146062313e-05, + "loss": 5.5887, + "step": 5351 + }, + { + "epoch": 0.5161041465766635, + "grad_norm": 1.6679693460464478, + "learning_rate": 2.3743286910930523e-05, + "loss": 5.5721, + "step": 5352 + }, + { + "epoch": 0.5162005785920926, + "grad_norm": 2.3585808277130127, + "learning_rate": 2.3735722791138297e-05, + "loss": 5.5689, + "step": 5353 + }, + { + "epoch": 0.5162970106075216, + "grad_norm": 2.1103904247283936, + "learning_rate": 2.3728158787379892e-05, + "loss": 5.5381, + "step": 5354 + }, + { + "epoch": 0.5163934426229508, + "grad_norm": 1.5132516622543335, + "learning_rate": 2.372059490034951e-05, + "loss": 5.5457, + "step": 5355 + }, + { + "epoch": 0.5164898746383799, + "grad_norm": 1.9623602628707886, + "learning_rate": 2.3713031130741337e-05, + "loss": 5.5953, + "step": 5356 + }, + { + "epoch": 0.5165863066538091, + "grad_norm": 1.7890138626098633, + "learning_rate": 2.3705467479249597e-05, + "loss": 5.4874, + "step": 5357 + }, + { + "epoch": 0.5166827386692382, + "grad_norm": 1.4512301683425903, + "learning_rate": 2.3697903946568443e-05, + "loss": 5.3449, + "step": 5358 + }, + { + "epoch": 0.5167791706846673, + "grad_norm": 1.9130371809005737, + "learning_rate": 2.3690340533392082e-05, + "loss": 5.3547, + "step": 5359 + }, + { + "epoch": 0.5168756027000965, + "grad_norm": 2.0094902515411377, + "learning_rate": 2.3682777240414646e-05, + "loss": 5.4623, + "step": 5360 + }, + { + "epoch": 0.5169720347155256, + "grad_norm": 2.420607805252075, + "learning_rate": 2.367521406833029e-05, + "loss": 5.5576, + "step": 5361 + }, + { + "epoch": 0.5170684667309546, + "grad_norm": 1.743739366531372, + "learning_rate": 2.366765101783316e-05, + "loss": 5.6479, + "step": 5362 + }, + { + "epoch": 0.5171648987463838, + "grad_norm": 1.4640634059906006, + "learning_rate": 2.3660088089617385e-05, + "loss": 5.5943, + "step": 5363 + }, + { + "epoch": 0.5172613307618129, + "grad_norm": 2.274423599243164, + "learning_rate": 2.3652525284377064e-05, + "loss": 5.5592, + "step": 5364 + }, + { + "epoch": 0.517357762777242, + "grad_norm": 1.92252516746521, + "learning_rate": 2.3644962602806322e-05, + "loss": 5.4755, + "step": 5365 + }, + { + "epoch": 0.5174541947926712, + "grad_norm": 2.0004513263702393, + "learning_rate": 2.363740004559924e-05, + "loss": 5.5215, + "step": 5366 + }, + { + "epoch": 0.5175506268081003, + "grad_norm": 1.992153525352478, + "learning_rate": 2.362983761344991e-05, + "loss": 5.6455, + "step": 5367 + }, + { + "epoch": 0.5176470588235295, + "grad_norm": 2.038665533065796, + "learning_rate": 2.3622275307052392e-05, + "loss": 5.358, + "step": 5368 + }, + { + "epoch": 0.5177434908389585, + "grad_norm": 2.565586805343628, + "learning_rate": 2.361471312710075e-05, + "loss": 5.7112, + "step": 5369 + }, + { + "epoch": 0.5178399228543876, + "grad_norm": 2.30375075340271, + "learning_rate": 2.3607151074289034e-05, + "loss": 5.5877, + "step": 5370 + }, + { + "epoch": 0.5179363548698168, + "grad_norm": 2.2511799335479736, + "learning_rate": 2.3599589149311285e-05, + "loss": 5.8353, + "step": 5371 + }, + { + "epoch": 0.5180327868852459, + "grad_norm": 1.9540959596633911, + "learning_rate": 2.359202735286151e-05, + "loss": 5.8078, + "step": 5372 + }, + { + "epoch": 0.518129218900675, + "grad_norm": 2.308803081512451, + "learning_rate": 2.3584465685633738e-05, + "loss": 5.5448, + "step": 5373 + }, + { + "epoch": 0.5182256509161042, + "grad_norm": 1.9972392320632935, + "learning_rate": 2.357690414832197e-05, + "loss": 5.4721, + "step": 5374 + }, + { + "epoch": 0.5183220829315333, + "grad_norm": 2.2442786693573, + "learning_rate": 2.356934274162017e-05, + "loss": 5.5888, + "step": 5375 + }, + { + "epoch": 0.5184185149469623, + "grad_norm": 2.422178268432617, + "learning_rate": 2.3561781466222347e-05, + "loss": 5.5355, + "step": 5376 + }, + { + "epoch": 0.5185149469623915, + "grad_norm": 1.7845447063446045, + "learning_rate": 2.3554220322822447e-05, + "loss": 5.7237, + "step": 5377 + }, + { + "epoch": 0.5186113789778206, + "grad_norm": 2.304706573486328, + "learning_rate": 2.354665931211442e-05, + "loss": 5.7362, + "step": 5378 + }, + { + "epoch": 0.5187078109932498, + "grad_norm": 2.97204852104187, + "learning_rate": 2.353909843479222e-05, + "loss": 5.6847, + "step": 5379 + }, + { + "epoch": 0.5188042430086789, + "grad_norm": 1.7881340980529785, + "learning_rate": 2.353153769154976e-05, + "loss": 5.6412, + "step": 5380 + }, + { + "epoch": 0.518900675024108, + "grad_norm": 2.626230478286743, + "learning_rate": 2.3523977083080973e-05, + "loss": 5.6247, + "step": 5381 + }, + { + "epoch": 0.5189971070395372, + "grad_norm": 3.3233439922332764, + "learning_rate": 2.351641661007975e-05, + "loss": 5.6848, + "step": 5382 + }, + { + "epoch": 0.5190935390549662, + "grad_norm": 3.4926071166992188, + "learning_rate": 2.3508856273239975e-05, + "loss": 5.7844, + "step": 5383 + }, + { + "epoch": 0.5191899710703953, + "grad_norm": 1.9539318084716797, + "learning_rate": 2.350129607325555e-05, + "loss": 5.5871, + "step": 5384 + }, + { + "epoch": 0.5192864030858245, + "grad_norm": 2.8852732181549072, + "learning_rate": 2.3493736010820324e-05, + "loss": 5.654, + "step": 5385 + }, + { + "epoch": 0.5193828351012536, + "grad_norm": 2.498878002166748, + "learning_rate": 2.3486176086628143e-05, + "loss": 5.6481, + "step": 5386 + }, + { + "epoch": 0.5194792671166827, + "grad_norm": 2.1520698070526123, + "learning_rate": 2.347861630137287e-05, + "loss": 5.6186, + "step": 5387 + }, + { + "epoch": 0.5195756991321119, + "grad_norm": 2.3650057315826416, + "learning_rate": 2.347105665574831e-05, + "loss": 5.8217, + "step": 5388 + }, + { + "epoch": 0.519672131147541, + "grad_norm": 2.55517840385437, + "learning_rate": 2.346349715044829e-05, + "loss": 5.7276, + "step": 5389 + }, + { + "epoch": 0.5197685631629702, + "grad_norm": 1.9633582830429077, + "learning_rate": 2.345593778616661e-05, + "loss": 5.6854, + "step": 5390 + }, + { + "epoch": 0.5198649951783992, + "grad_norm": 1.5689473152160645, + "learning_rate": 2.3448378563597065e-05, + "loss": 5.6593, + "step": 5391 + }, + { + "epoch": 0.5199614271938283, + "grad_norm": 2.3045239448547363, + "learning_rate": 2.3440819483433422e-05, + "loss": 5.5766, + "step": 5392 + }, + { + "epoch": 0.5200578592092575, + "grad_norm": 2.075650930404663, + "learning_rate": 2.343326054636945e-05, + "loss": 5.4733, + "step": 5393 + }, + { + "epoch": 0.5201542912246866, + "grad_norm": 1.7399924993515015, + "learning_rate": 2.342570175309889e-05, + "loss": 5.6482, + "step": 5394 + }, + { + "epoch": 0.5202507232401157, + "grad_norm": 2.608567953109741, + "learning_rate": 2.3418143104315495e-05, + "loss": 5.6623, + "step": 5395 + }, + { + "epoch": 0.5203471552555449, + "grad_norm": 3.28631854057312, + "learning_rate": 2.341058460071298e-05, + "loss": 5.4057, + "step": 5396 + }, + { + "epoch": 0.520443587270974, + "grad_norm": 2.274670362472534, + "learning_rate": 2.3403026242985042e-05, + "loss": 5.7357, + "step": 5397 + }, + { + "epoch": 0.520540019286403, + "grad_norm": 1.9076482057571411, + "learning_rate": 2.3395468031825402e-05, + "loss": 5.6079, + "step": 5398 + }, + { + "epoch": 0.5206364513018322, + "grad_norm": 2.122575044631958, + "learning_rate": 2.338790996792773e-05, + "loss": 5.5309, + "step": 5399 + }, + { + "epoch": 0.5207328833172613, + "grad_norm": 2.4210729598999023, + "learning_rate": 2.3380352051985684e-05, + "loss": 5.6578, + "step": 5400 + }, + { + "epoch": 0.5208293153326905, + "grad_norm": 1.4155993461608887, + "learning_rate": 2.337279428469295e-05, + "loss": 5.5867, + "step": 5401 + }, + { + "epoch": 0.5209257473481196, + "grad_norm": 1.8232340812683105, + "learning_rate": 2.3365236666743136e-05, + "loss": 5.6726, + "step": 5402 + }, + { + "epoch": 0.5210221793635487, + "grad_norm": 1.5814887285232544, + "learning_rate": 2.3357679198829903e-05, + "loss": 5.5446, + "step": 5403 + }, + { + "epoch": 0.5211186113789779, + "grad_norm": 1.3898606300354004, + "learning_rate": 2.3350121881646853e-05, + "loss": 5.6413, + "step": 5404 + }, + { + "epoch": 0.5212150433944069, + "grad_norm": 1.6515506505966187, + "learning_rate": 2.3342564715887575e-05, + "loss": 5.5781, + "step": 5405 + }, + { + "epoch": 0.521311475409836, + "grad_norm": 1.530526876449585, + "learning_rate": 2.3335007702245677e-05, + "loss": 5.613, + "step": 5406 + }, + { + "epoch": 0.5214079074252652, + "grad_norm": 1.49728524684906, + "learning_rate": 2.3327450841414716e-05, + "loss": 5.6563, + "step": 5407 + }, + { + "epoch": 0.5215043394406943, + "grad_norm": 2.917083978652954, + "learning_rate": 2.331989413408826e-05, + "loss": 5.4577, + "step": 5408 + }, + { + "epoch": 0.5216007714561234, + "grad_norm": 1.95597243309021, + "learning_rate": 2.331233758095985e-05, + "loss": 5.5796, + "step": 5409 + }, + { + "epoch": 0.5216972034715526, + "grad_norm": 1.5753052234649658, + "learning_rate": 2.330478118272303e-05, + "loss": 5.5317, + "step": 5410 + }, + { + "epoch": 0.5217936354869817, + "grad_norm": 1.8215899467468262, + "learning_rate": 2.3297224940071297e-05, + "loss": 5.6625, + "step": 5411 + }, + { + "epoch": 0.5218900675024108, + "grad_norm": 1.7838473320007324, + "learning_rate": 2.3289668853698166e-05, + "loss": 5.503, + "step": 5412 + }, + { + "epoch": 0.5219864995178399, + "grad_norm": 1.3518576622009277, + "learning_rate": 2.328211292429712e-05, + "loss": 5.6858, + "step": 5413 + }, + { + "epoch": 0.522082931533269, + "grad_norm": 1.72842276096344, + "learning_rate": 2.3274557152561637e-05, + "loss": 5.4765, + "step": 5414 + }, + { + "epoch": 0.5221793635486982, + "grad_norm": 2.341963768005371, + "learning_rate": 2.326700153918518e-05, + "loss": 5.7418, + "step": 5415 + }, + { + "epoch": 0.5222757955641273, + "grad_norm": 1.4943993091583252, + "learning_rate": 2.3259446084861177e-05, + "loss": 5.5405, + "step": 5416 + }, + { + "epoch": 0.5223722275795564, + "grad_norm": 2.5774123668670654, + "learning_rate": 2.325189079028308e-05, + "loss": 5.639, + "step": 5417 + }, + { + "epoch": 0.5224686595949856, + "grad_norm": 2.3404054641723633, + "learning_rate": 2.3244335656144297e-05, + "loss": 5.7911, + "step": 5418 + }, + { + "epoch": 0.5225650916104146, + "grad_norm": 1.979109525680542, + "learning_rate": 2.3236780683138214e-05, + "loss": 5.5591, + "step": 5419 + }, + { + "epoch": 0.5226615236258437, + "grad_norm": 2.2337145805358887, + "learning_rate": 2.3229225871958243e-05, + "loss": 5.5835, + "step": 5420 + }, + { + "epoch": 0.5227579556412729, + "grad_norm": 2.912255048751831, + "learning_rate": 2.322167122329774e-05, + "loss": 5.6159, + "step": 5421 + }, + { + "epoch": 0.522854387656702, + "grad_norm": 2.460742712020874, + "learning_rate": 2.3214116737850057e-05, + "loss": 5.7774, + "step": 5422 + }, + { + "epoch": 0.5229508196721312, + "grad_norm": 2.209347724914551, + "learning_rate": 2.3206562416308547e-05, + "loss": 5.5658, + "step": 5423 + }, + { + "epoch": 0.5230472516875603, + "grad_norm": 1.9425562620162964, + "learning_rate": 2.319900825936652e-05, + "loss": 5.6873, + "step": 5424 + }, + { + "epoch": 0.5231436837029894, + "grad_norm": 2.0074265003204346, + "learning_rate": 2.3191454267717313e-05, + "loss": 5.7938, + "step": 5425 + }, + { + "epoch": 0.5232401157184186, + "grad_norm": 1.5191457271575928, + "learning_rate": 2.3183900442054203e-05, + "loss": 5.6172, + "step": 5426 + }, + { + "epoch": 0.5233365477338476, + "grad_norm": 1.8347903490066528, + "learning_rate": 2.3176346783070467e-05, + "loss": 5.6424, + "step": 5427 + }, + { + "epoch": 0.5234329797492767, + "grad_norm": 1.8751752376556396, + "learning_rate": 2.316879329145939e-05, + "loss": 5.5551, + "step": 5428 + }, + { + "epoch": 0.5235294117647059, + "grad_norm": 1.3210020065307617, + "learning_rate": 2.3161239967914206e-05, + "loss": 5.5372, + "step": 5429 + }, + { + "epoch": 0.523625843780135, + "grad_norm": 1.798600673675537, + "learning_rate": 2.3153686813128153e-05, + "loss": 5.5662, + "step": 5430 + }, + { + "epoch": 0.5237222757955641, + "grad_norm": 1.8270301818847656, + "learning_rate": 2.3146133827794453e-05, + "loss": 5.635, + "step": 5431 + }, + { + "epoch": 0.5238187078109933, + "grad_norm": 1.366235375404358, + "learning_rate": 2.3138581012606314e-05, + "loss": 5.5914, + "step": 5432 + }, + { + "epoch": 0.5239151398264223, + "grad_norm": 1.8413041830062866, + "learning_rate": 2.3131028368256904e-05, + "loss": 5.553, + "step": 5433 + }, + { + "epoch": 0.5240115718418515, + "grad_norm": 1.5663161277770996, + "learning_rate": 2.3123475895439425e-05, + "loss": 5.6271, + "step": 5434 + }, + { + "epoch": 0.5241080038572806, + "grad_norm": 1.4441174268722534, + "learning_rate": 2.3115923594847013e-05, + "loss": 5.5025, + "step": 5435 + }, + { + "epoch": 0.5242044358727097, + "grad_norm": 1.905132532119751, + "learning_rate": 2.3108371467172814e-05, + "loss": 5.5433, + "step": 5436 + }, + { + "epoch": 0.5243008678881389, + "grad_norm": 1.4443926811218262, + "learning_rate": 2.310081951310996e-05, + "loss": 5.4631, + "step": 5437 + }, + { + "epoch": 0.524397299903568, + "grad_norm": 1.545074462890625, + "learning_rate": 2.309326773335154e-05, + "loss": 5.4904, + "step": 5438 + }, + { + "epoch": 0.5244937319189971, + "grad_norm": 1.5397510528564453, + "learning_rate": 2.3085716128590672e-05, + "loss": 5.6102, + "step": 5439 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 1.358866810798645, + "learning_rate": 2.307816469952042e-05, + "loss": 5.5925, + "step": 5440 + }, + { + "epoch": 0.5246865959498553, + "grad_norm": 1.3593264818191528, + "learning_rate": 2.3070613446833842e-05, + "loss": 5.5387, + "step": 5441 + }, + { + "epoch": 0.5247830279652844, + "grad_norm": 1.623090386390686, + "learning_rate": 2.3063062371223992e-05, + "loss": 5.3891, + "step": 5442 + }, + { + "epoch": 0.5248794599807136, + "grad_norm": 1.2819515466690063, + "learning_rate": 2.30555114733839e-05, + "loss": 5.5536, + "step": 5443 + }, + { + "epoch": 0.5249758919961427, + "grad_norm": 1.2338680028915405, + "learning_rate": 2.3047960754006552e-05, + "loss": 5.6086, + "step": 5444 + }, + { + "epoch": 0.5250723240115719, + "grad_norm": 1.1719918251037598, + "learning_rate": 2.304041021378498e-05, + "loss": 5.5776, + "step": 5445 + }, + { + "epoch": 0.525168756027001, + "grad_norm": 1.2015317678451538, + "learning_rate": 2.3032859853412134e-05, + "loss": 5.5549, + "step": 5446 + }, + { + "epoch": 0.52526518804243, + "grad_norm": 1.5223313570022583, + "learning_rate": 2.3025309673581002e-05, + "loss": 5.3846, + "step": 5447 + }, + { + "epoch": 0.5253616200578592, + "grad_norm": 1.4143813848495483, + "learning_rate": 2.301775967498452e-05, + "loss": 5.3219, + "step": 5448 + }, + { + "epoch": 0.5254580520732883, + "grad_norm": 1.2607029676437378, + "learning_rate": 2.30102098583156e-05, + "loss": 5.4245, + "step": 5449 + }, + { + "epoch": 0.5255544840887174, + "grad_norm": 1.4783684015274048, + "learning_rate": 2.3002660224267182e-05, + "loss": 5.551, + "step": 5450 + }, + { + "epoch": 0.5256509161041466, + "grad_norm": 1.6164300441741943, + "learning_rate": 2.2995110773532145e-05, + "loss": 5.5486, + "step": 5451 + }, + { + "epoch": 0.5257473481195757, + "grad_norm": 1.2885981798171997, + "learning_rate": 2.298756150680337e-05, + "loss": 5.6241, + "step": 5452 + }, + { + "epoch": 0.5258437801350048, + "grad_norm": 1.5801029205322266, + "learning_rate": 2.2980012424773728e-05, + "loss": 5.5294, + "step": 5453 + }, + { + "epoch": 0.525940212150434, + "grad_norm": 1.937031626701355, + "learning_rate": 2.2972463528136064e-05, + "loss": 5.6434, + "step": 5454 + }, + { + "epoch": 0.526036644165863, + "grad_norm": 2.0629332065582275, + "learning_rate": 2.296491481758319e-05, + "loss": 5.2569, + "step": 5455 + }, + { + "epoch": 0.5261330761812922, + "grad_norm": 1.7239995002746582, + "learning_rate": 2.2957366293807942e-05, + "loss": 5.4661, + "step": 5456 + }, + { + "epoch": 0.5262295081967213, + "grad_norm": 2.3362653255462646, + "learning_rate": 2.294981795750309e-05, + "loss": 5.6093, + "step": 5457 + }, + { + "epoch": 0.5263259402121504, + "grad_norm": 2.172393321990967, + "learning_rate": 2.294226980936143e-05, + "loss": 5.5469, + "step": 5458 + }, + { + "epoch": 0.5264223722275796, + "grad_norm": 1.6680997610092163, + "learning_rate": 2.293472185007572e-05, + "loss": 5.6547, + "step": 5459 + }, + { + "epoch": 0.5265188042430087, + "grad_norm": 1.6083730459213257, + "learning_rate": 2.292717408033868e-05, + "loss": 5.552, + "step": 5460 + }, + { + "epoch": 0.5266152362584378, + "grad_norm": 1.824289083480835, + "learning_rate": 2.2919626500843068e-05, + "loss": 5.591, + "step": 5461 + }, + { + "epoch": 0.526711668273867, + "grad_norm": 1.6113489866256714, + "learning_rate": 2.291207911228157e-05, + "loss": 5.3297, + "step": 5462 + }, + { + "epoch": 0.526808100289296, + "grad_norm": 1.5819429159164429, + "learning_rate": 2.290453191534687e-05, + "loss": 5.5832, + "step": 5463 + }, + { + "epoch": 0.5269045323047251, + "grad_norm": 1.7593692541122437, + "learning_rate": 2.289698491073167e-05, + "loss": 5.6157, + "step": 5464 + }, + { + "epoch": 0.5270009643201543, + "grad_norm": 1.8430038690567017, + "learning_rate": 2.2889438099128603e-05, + "loss": 5.403, + "step": 5465 + }, + { + "epoch": 0.5270973963355834, + "grad_norm": 1.4383752346038818, + "learning_rate": 2.2881891481230294e-05, + "loss": 5.4393, + "step": 5466 + }, + { + "epoch": 0.5271938283510126, + "grad_norm": 1.6529649496078491, + "learning_rate": 2.2874345057729394e-05, + "loss": 5.5323, + "step": 5467 + }, + { + "epoch": 0.5272902603664417, + "grad_norm": 1.481125831604004, + "learning_rate": 2.2866798829318477e-05, + "loss": 5.6005, + "step": 5468 + }, + { + "epoch": 0.5273866923818707, + "grad_norm": 2.189753770828247, + "learning_rate": 2.285925279669015e-05, + "loss": 5.6328, + "step": 5469 + }, + { + "epoch": 0.5274831243972999, + "grad_norm": 1.7615185976028442, + "learning_rate": 2.285170696053696e-05, + "loss": 5.5687, + "step": 5470 + }, + { + "epoch": 0.527579556412729, + "grad_norm": 1.9822801351547241, + "learning_rate": 2.2844161321551454e-05, + "loss": 5.609, + "step": 5471 + }, + { + "epoch": 0.5276759884281581, + "grad_norm": 1.9644742012023926, + "learning_rate": 2.2836615880426183e-05, + "loss": 5.4104, + "step": 5472 + }, + { + "epoch": 0.5277724204435873, + "grad_norm": 1.9845967292785645, + "learning_rate": 2.2829070637853634e-05, + "loss": 5.5111, + "step": 5473 + }, + { + "epoch": 0.5278688524590164, + "grad_norm": 2.3713722229003906, + "learning_rate": 2.282152559452631e-05, + "loss": 5.5706, + "step": 5474 + }, + { + "epoch": 0.5279652844744455, + "grad_norm": 2.602914810180664, + "learning_rate": 2.281398075113669e-05, + "loss": 5.5245, + "step": 5475 + }, + { + "epoch": 0.5280617164898747, + "grad_norm": 1.444299340248108, + "learning_rate": 2.2806436108377223e-05, + "loss": 5.6061, + "step": 5476 + }, + { + "epoch": 0.5281581485053037, + "grad_norm": 2.178642511367798, + "learning_rate": 2.2798891666940344e-05, + "loss": 5.6296, + "step": 5477 + }, + { + "epoch": 0.5282545805207329, + "grad_norm": 1.905186653137207, + "learning_rate": 2.2791347427518493e-05, + "loss": 5.6985, + "step": 5478 + }, + { + "epoch": 0.528351012536162, + "grad_norm": 2.099764108657837, + "learning_rate": 2.2783803390804043e-05, + "loss": 5.682, + "step": 5479 + }, + { + "epoch": 0.5284474445515911, + "grad_norm": 1.8779932260513306, + "learning_rate": 2.27762595574894e-05, + "loss": 5.8357, + "step": 5480 + }, + { + "epoch": 0.5285438765670203, + "grad_norm": 1.856121301651001, + "learning_rate": 2.276871592826692e-05, + "loss": 5.6766, + "step": 5481 + }, + { + "epoch": 0.5286403085824494, + "grad_norm": 2.5026397705078125, + "learning_rate": 2.276117250382893e-05, + "loss": 5.6219, + "step": 5482 + }, + { + "epoch": 0.5287367405978785, + "grad_norm": 2.064504623413086, + "learning_rate": 2.275362928486779e-05, + "loss": 5.5564, + "step": 5483 + }, + { + "epoch": 0.5288331726133076, + "grad_norm": 2.034388780593872, + "learning_rate": 2.2746086272075782e-05, + "loss": 5.5612, + "step": 5484 + }, + { + "epoch": 0.5289296046287367, + "grad_norm": 2.8534159660339355, + "learning_rate": 2.2738543466145196e-05, + "loss": 5.577, + "step": 5485 + }, + { + "epoch": 0.5290260366441658, + "grad_norm": 1.9989347457885742, + "learning_rate": 2.2731000867768317e-05, + "loss": 5.5628, + "step": 5486 + }, + { + "epoch": 0.529122468659595, + "grad_norm": 1.886878490447998, + "learning_rate": 2.2723458477637385e-05, + "loss": 5.58, + "step": 5487 + }, + { + "epoch": 0.5292189006750241, + "grad_norm": 1.638913869857788, + "learning_rate": 2.2715916296444622e-05, + "loss": 5.4905, + "step": 5488 + }, + { + "epoch": 0.5293153326904533, + "grad_norm": 1.7191165685653687, + "learning_rate": 2.270837432488226e-05, + "loss": 5.6796, + "step": 5489 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 2.892695426940918, + "learning_rate": 2.2700832563642475e-05, + "loss": 5.6028, + "step": 5490 + }, + { + "epoch": 0.5295081967213114, + "grad_norm": 2.561357021331787, + "learning_rate": 2.2693291013417453e-05, + "loss": 5.5463, + "step": 5491 + }, + { + "epoch": 0.5296046287367406, + "grad_norm": 3.2471137046813965, + "learning_rate": 2.2685749674899345e-05, + "loss": 5.6762, + "step": 5492 + }, + { + "epoch": 0.5297010607521697, + "grad_norm": 2.0848560333251953, + "learning_rate": 2.267820854878027e-05, + "loss": 5.5512, + "step": 5493 + }, + { + "epoch": 0.5297974927675988, + "grad_norm": 3.5201892852783203, + "learning_rate": 2.2670667635752375e-05, + "loss": 5.4432, + "step": 5494 + }, + { + "epoch": 0.529893924783028, + "grad_norm": 2.50726318359375, + "learning_rate": 2.2663126936507725e-05, + "loss": 5.6517, + "step": 5495 + }, + { + "epoch": 0.5299903567984571, + "grad_norm": 1.8895289897918701, + "learning_rate": 2.265558645173841e-05, + "loss": 5.5564, + "step": 5496 + }, + { + "epoch": 0.5300867888138862, + "grad_norm": 2.186419725418091, + "learning_rate": 2.2648046182136488e-05, + "loss": 5.605, + "step": 5497 + }, + { + "epoch": 0.5301832208293153, + "grad_norm": 2.363389015197754, + "learning_rate": 2.2640506128393994e-05, + "loss": 5.6403, + "step": 5498 + }, + { + "epoch": 0.5302796528447444, + "grad_norm": 1.665989875793457, + "learning_rate": 2.2632966291202933e-05, + "loss": 5.7183, + "step": 5499 + }, + { + "epoch": 0.5303760848601736, + "grad_norm": 1.927624225616455, + "learning_rate": 2.2625426671255325e-05, + "loss": 5.6009, + "step": 5500 + }, + { + "epoch": 0.5304725168756027, + "grad_norm": 2.7257020473480225, + "learning_rate": 2.2617887269243124e-05, + "loss": 5.4918, + "step": 5501 + }, + { + "epoch": 0.5305689488910318, + "grad_norm": 2.795987129211426, + "learning_rate": 2.2610348085858306e-05, + "loss": 5.6466, + "step": 5502 + }, + { + "epoch": 0.530665380906461, + "grad_norm": 1.9312498569488525, + "learning_rate": 2.2602809121792802e-05, + "loss": 5.5593, + "step": 5503 + }, + { + "epoch": 0.5307618129218901, + "grad_norm": 2.058480739593506, + "learning_rate": 2.2595270377738515e-05, + "loss": 5.655, + "step": 5504 + }, + { + "epoch": 0.5308582449373191, + "grad_norm": 2.7760627269744873, + "learning_rate": 2.258773185438736e-05, + "loss": 5.4521, + "step": 5505 + }, + { + "epoch": 0.5309546769527483, + "grad_norm": 2.1991851329803467, + "learning_rate": 2.258019355243121e-05, + "loss": 5.5866, + "step": 5506 + }, + { + "epoch": 0.5310511089681774, + "grad_norm": 1.3610478639602661, + "learning_rate": 2.257265547256191e-05, + "loss": 5.4792, + "step": 5507 + }, + { + "epoch": 0.5311475409836065, + "grad_norm": 1.8612116575241089, + "learning_rate": 2.256511761547131e-05, + "loss": 5.4217, + "step": 5508 + }, + { + "epoch": 0.5312439729990357, + "grad_norm": 1.4121733903884888, + "learning_rate": 2.255757998185122e-05, + "loss": 5.6035, + "step": 5509 + }, + { + "epoch": 0.5313404050144648, + "grad_norm": 1.1271133422851562, + "learning_rate": 2.2550042572393418e-05, + "loss": 5.6679, + "step": 5510 + }, + { + "epoch": 0.531436837029894, + "grad_norm": 1.736461877822876, + "learning_rate": 2.2542505387789708e-05, + "loss": 5.5252, + "step": 5511 + }, + { + "epoch": 0.531533269045323, + "grad_norm": 1.5580683946609497, + "learning_rate": 2.2534968428731817e-05, + "loss": 5.5424, + "step": 5512 + }, + { + "epoch": 0.5316297010607521, + "grad_norm": 1.8068863153457642, + "learning_rate": 2.25274316959115e-05, + "loss": 5.6069, + "step": 5513 + }, + { + "epoch": 0.5317261330761813, + "grad_norm": 1.7225921154022217, + "learning_rate": 2.2519895190020455e-05, + "loss": 5.7567, + "step": 5514 + }, + { + "epoch": 0.5318225650916104, + "grad_norm": 1.693621277809143, + "learning_rate": 2.2512358911750374e-05, + "loss": 5.5628, + "step": 5515 + }, + { + "epoch": 0.5319189971070395, + "grad_norm": 2.4116508960723877, + "learning_rate": 2.250482286179294e-05, + "loss": 5.4533, + "step": 5516 + }, + { + "epoch": 0.5320154291224687, + "grad_norm": 2.024221181869507, + "learning_rate": 2.2497287040839785e-05, + "loss": 5.594, + "step": 5517 + }, + { + "epoch": 0.5321118611378978, + "grad_norm": 1.9557231664657593, + "learning_rate": 2.2489751449582542e-05, + "loss": 5.301, + "step": 5518 + }, + { + "epoch": 0.5322082931533269, + "grad_norm": 1.5728482007980347, + "learning_rate": 2.2482216088712827e-05, + "loss": 5.554, + "step": 5519 + }, + { + "epoch": 0.532304725168756, + "grad_norm": 1.3479323387145996, + "learning_rate": 2.2474680958922224e-05, + "loss": 5.5937, + "step": 5520 + }, + { + "epoch": 0.5324011571841851, + "grad_norm": 1.5733009576797485, + "learning_rate": 2.2467146060902282e-05, + "loss": 5.5581, + "step": 5521 + }, + { + "epoch": 0.5324975891996143, + "grad_norm": 1.5606364011764526, + "learning_rate": 2.245961139534457e-05, + "loss": 5.6054, + "step": 5522 + }, + { + "epoch": 0.5325940212150434, + "grad_norm": 1.3753010034561157, + "learning_rate": 2.2452076962940582e-05, + "loss": 5.5957, + "step": 5523 + }, + { + "epoch": 0.5326904532304725, + "grad_norm": 1.977766752243042, + "learning_rate": 2.2444542764381847e-05, + "loss": 5.4428, + "step": 5524 + }, + { + "epoch": 0.5327868852459017, + "grad_norm": 1.7181471586227417, + "learning_rate": 2.2437008800359833e-05, + "loss": 5.436, + "step": 5525 + }, + { + "epoch": 0.5328833172613308, + "grad_norm": 1.5202038288116455, + "learning_rate": 2.2429475071565987e-05, + "loss": 5.6095, + "step": 5526 + }, + { + "epoch": 0.5329797492767598, + "grad_norm": 1.6571041345596313, + "learning_rate": 2.2421941578691766e-05, + "loss": 5.6144, + "step": 5527 + }, + { + "epoch": 0.533076181292189, + "grad_norm": 1.674044132232666, + "learning_rate": 2.2414408322428572e-05, + "loss": 5.5746, + "step": 5528 + }, + { + "epoch": 0.5331726133076181, + "grad_norm": 1.950019121170044, + "learning_rate": 2.240687530346779e-05, + "loss": 5.6187, + "step": 5529 + }, + { + "epoch": 0.5332690453230472, + "grad_norm": 1.923075795173645, + "learning_rate": 2.2399342522500814e-05, + "loss": 5.5913, + "step": 5530 + }, + { + "epoch": 0.5333654773384764, + "grad_norm": 2.446244478225708, + "learning_rate": 2.2391809980218977e-05, + "loss": 5.4956, + "step": 5531 + }, + { + "epoch": 0.5334619093539055, + "grad_norm": 1.9079065322875977, + "learning_rate": 2.23842776773136e-05, + "loss": 5.5649, + "step": 5532 + }, + { + "epoch": 0.5335583413693347, + "grad_norm": 1.635182499885559, + "learning_rate": 2.237674561447601e-05, + "loss": 5.5531, + "step": 5533 + }, + { + "epoch": 0.5336547733847637, + "grad_norm": 1.6240007877349854, + "learning_rate": 2.2369213792397467e-05, + "loss": 5.5883, + "step": 5534 + }, + { + "epoch": 0.5337512054001928, + "grad_norm": 1.4993157386779785, + "learning_rate": 2.236168221176926e-05, + "loss": 5.5431, + "step": 5535 + }, + { + "epoch": 0.533847637415622, + "grad_norm": 1.2149608135223389, + "learning_rate": 2.2354150873282597e-05, + "loss": 5.4954, + "step": 5536 + }, + { + "epoch": 0.5339440694310511, + "grad_norm": 1.8181010484695435, + "learning_rate": 2.234661977762872e-05, + "loss": 5.541, + "step": 5537 + }, + { + "epoch": 0.5340405014464802, + "grad_norm": 2.0726897716522217, + "learning_rate": 2.2339088925498808e-05, + "loss": 5.5889, + "step": 5538 + }, + { + "epoch": 0.5341369334619094, + "grad_norm": 1.7775225639343262, + "learning_rate": 2.2331558317584046e-05, + "loss": 5.4367, + "step": 5539 + }, + { + "epoch": 0.5342333654773385, + "grad_norm": 1.1379225254058838, + "learning_rate": 2.2324027954575573e-05, + "loss": 5.5485, + "step": 5540 + }, + { + "epoch": 0.5343297974927675, + "grad_norm": 2.1045398712158203, + "learning_rate": 2.231649783716452e-05, + "loss": 5.4704, + "step": 5541 + }, + { + "epoch": 0.5344262295081967, + "grad_norm": 1.8368397951126099, + "learning_rate": 2.2308967966042e-05, + "loss": 5.4781, + "step": 5542 + }, + { + "epoch": 0.5345226615236258, + "grad_norm": 1.8512383699417114, + "learning_rate": 2.2301438341899073e-05, + "loss": 5.5696, + "step": 5543 + }, + { + "epoch": 0.534619093539055, + "grad_norm": 1.6185414791107178, + "learning_rate": 2.2293908965426827e-05, + "loss": 5.5519, + "step": 5544 + }, + { + "epoch": 0.5347155255544841, + "grad_norm": 1.408625602722168, + "learning_rate": 2.2286379837316273e-05, + "loss": 5.4892, + "step": 5545 + }, + { + "epoch": 0.5348119575699132, + "grad_norm": 1.7448927164077759, + "learning_rate": 2.227885095825845e-05, + "loss": 5.5548, + "step": 5546 + }, + { + "epoch": 0.5349083895853424, + "grad_norm": 1.4064810276031494, + "learning_rate": 2.227132232894434e-05, + "loss": 5.5773, + "step": 5547 + }, + { + "epoch": 0.5350048216007715, + "grad_norm": 1.3261638879776, + "learning_rate": 2.2263793950064893e-05, + "loss": 5.5285, + "step": 5548 + }, + { + "epoch": 0.5351012536162005, + "grad_norm": 1.9020007848739624, + "learning_rate": 2.225626582231108e-05, + "loss": 5.5607, + "step": 5549 + }, + { + "epoch": 0.5351976856316297, + "grad_norm": 1.7966670989990234, + "learning_rate": 2.2248737946373815e-05, + "loss": 5.5962, + "step": 5550 + }, + { + "epoch": 0.5352941176470588, + "grad_norm": 2.309965133666992, + "learning_rate": 2.2241210322943983e-05, + "loss": 5.5496, + "step": 5551 + }, + { + "epoch": 0.5353905496624879, + "grad_norm": 2.2613370418548584, + "learning_rate": 2.2233682952712485e-05, + "loss": 5.5289, + "step": 5552 + }, + { + "epoch": 0.5354869816779171, + "grad_norm": 2.6301465034484863, + "learning_rate": 2.222615583637016e-05, + "loss": 5.5332, + "step": 5553 + }, + { + "epoch": 0.5355834136933462, + "grad_norm": 1.9432182312011719, + "learning_rate": 2.2218628974607826e-05, + "loss": 5.7295, + "step": 5554 + }, + { + "epoch": 0.5356798457087754, + "grad_norm": 1.466915488243103, + "learning_rate": 2.221110236811631e-05, + "loss": 5.694, + "step": 5555 + }, + { + "epoch": 0.5357762777242044, + "grad_norm": 1.9011698961257935, + "learning_rate": 2.220357601758638e-05, + "loss": 5.594, + "step": 5556 + }, + { + "epoch": 0.5358727097396335, + "grad_norm": 2.183697462081909, + "learning_rate": 2.219604992370881e-05, + "loss": 5.5872, + "step": 5557 + }, + { + "epoch": 0.5359691417550627, + "grad_norm": 1.7554761171340942, + "learning_rate": 2.2188524087174322e-05, + "loss": 5.6075, + "step": 5558 + }, + { + "epoch": 0.5360655737704918, + "grad_norm": 1.4426090717315674, + "learning_rate": 2.218099850867363e-05, + "loss": 5.5365, + "step": 5559 + }, + { + "epoch": 0.5361620057859209, + "grad_norm": 1.6199034452438354, + "learning_rate": 2.217347318889743e-05, + "loss": 5.499, + "step": 5560 + }, + { + "epoch": 0.5362584378013501, + "grad_norm": 1.6710808277130127, + "learning_rate": 2.2165948128536384e-05, + "loss": 5.5028, + "step": 5561 + }, + { + "epoch": 0.5363548698167792, + "grad_norm": 1.4234611988067627, + "learning_rate": 2.215842332828112e-05, + "loss": 5.6978, + "step": 5562 + }, + { + "epoch": 0.5364513018322082, + "grad_norm": 1.6468712091445923, + "learning_rate": 2.215089878882228e-05, + "loss": 5.718, + "step": 5563 + }, + { + "epoch": 0.5365477338476374, + "grad_norm": 2.4174106121063232, + "learning_rate": 2.214337451085044e-05, + "loss": 5.5988, + "step": 5564 + }, + { + "epoch": 0.5366441658630665, + "grad_norm": 1.510755181312561, + "learning_rate": 2.213585049505616e-05, + "loss": 5.5699, + "step": 5565 + }, + { + "epoch": 0.5367405978784957, + "grad_norm": 1.8307744264602661, + "learning_rate": 2.2128326742130008e-05, + "loss": 5.5225, + "step": 5566 + }, + { + "epoch": 0.5368370298939248, + "grad_norm": 1.6093932390213013, + "learning_rate": 2.212080325276248e-05, + "loss": 5.5257, + "step": 5567 + }, + { + "epoch": 0.5369334619093539, + "grad_norm": 1.5156362056732178, + "learning_rate": 2.21132800276441e-05, + "loss": 5.5777, + "step": 5568 + }, + { + "epoch": 0.5370298939247831, + "grad_norm": 1.4473375082015991, + "learning_rate": 2.2105757067465324e-05, + "loss": 5.5705, + "step": 5569 + }, + { + "epoch": 0.5371263259402121, + "grad_norm": 1.529850721359253, + "learning_rate": 2.2098234372916593e-05, + "loss": 5.6339, + "step": 5570 + }, + { + "epoch": 0.5372227579556412, + "grad_norm": 2.031456708908081, + "learning_rate": 2.2090711944688347e-05, + "loss": 5.5135, + "step": 5571 + }, + { + "epoch": 0.5373191899710704, + "grad_norm": 1.5275371074676514, + "learning_rate": 2.208318978347098e-05, + "loss": 5.4682, + "step": 5572 + }, + { + "epoch": 0.5374156219864995, + "grad_norm": 1.5623942613601685, + "learning_rate": 2.2075667889954857e-05, + "loss": 5.4268, + "step": 5573 + }, + { + "epoch": 0.5375120540019286, + "grad_norm": 1.747495412826538, + "learning_rate": 2.2068146264830342e-05, + "loss": 5.6714, + "step": 5574 + }, + { + "epoch": 0.5376084860173578, + "grad_norm": 1.5752308368682861, + "learning_rate": 2.2060624908787757e-05, + "loss": 5.6246, + "step": 5575 + }, + { + "epoch": 0.5377049180327869, + "grad_norm": 1.4423587322235107, + "learning_rate": 2.2053103822517387e-05, + "loss": 5.6256, + "step": 5576 + }, + { + "epoch": 0.537801350048216, + "grad_norm": 1.5532292127609253, + "learning_rate": 2.2045583006709536e-05, + "loss": 5.5762, + "step": 5577 + }, + { + "epoch": 0.5378977820636451, + "grad_norm": 2.1382884979248047, + "learning_rate": 2.2038062462054427e-05, + "loss": 5.571, + "step": 5578 + }, + { + "epoch": 0.5379942140790742, + "grad_norm": 1.5676462650299072, + "learning_rate": 2.203054218924231e-05, + "loss": 5.5538, + "step": 5579 + }, + { + "epoch": 0.5380906460945034, + "grad_norm": 1.6456271409988403, + "learning_rate": 2.202302218896337e-05, + "loss": 5.5099, + "step": 5580 + }, + { + "epoch": 0.5381870781099325, + "grad_norm": 2.0953900814056396, + "learning_rate": 2.2015502461907793e-05, + "loss": 5.6361, + "step": 5581 + }, + { + "epoch": 0.5382835101253616, + "grad_norm": 1.6619398593902588, + "learning_rate": 2.200798300876572e-05, + "loss": 5.686, + "step": 5582 + }, + { + "epoch": 0.5383799421407908, + "grad_norm": 1.555191159248352, + "learning_rate": 2.2000463830227294e-05, + "loss": 5.7447, + "step": 5583 + }, + { + "epoch": 0.5384763741562199, + "grad_norm": 1.5804558992385864, + "learning_rate": 2.1992944926982596e-05, + "loss": 5.6297, + "step": 5584 + }, + { + "epoch": 0.5385728061716489, + "grad_norm": 1.6089519262313843, + "learning_rate": 2.198542629972172e-05, + "loss": 5.6333, + "step": 5585 + }, + { + "epoch": 0.5386692381870781, + "grad_norm": 1.6957677602767944, + "learning_rate": 2.197790794913471e-05, + "loss": 5.5471, + "step": 5586 + }, + { + "epoch": 0.5387656702025072, + "grad_norm": 1.5009660720825195, + "learning_rate": 2.1970389875911576e-05, + "loss": 5.446, + "step": 5587 + }, + { + "epoch": 0.5388621022179364, + "grad_norm": 1.9925442934036255, + "learning_rate": 2.196287208074234e-05, + "loss": 5.663, + "step": 5588 + }, + { + "epoch": 0.5389585342333655, + "grad_norm": 1.3115289211273193, + "learning_rate": 2.195535456431696e-05, + "loss": 5.5744, + "step": 5589 + }, + { + "epoch": 0.5390549662487946, + "grad_norm": 1.3443198204040527, + "learning_rate": 2.1947837327325398e-05, + "loss": 5.6361, + "step": 5590 + }, + { + "epoch": 0.5391513982642238, + "grad_norm": 1.5319020748138428, + "learning_rate": 2.194032037045757e-05, + "loss": 5.517, + "step": 5591 + }, + { + "epoch": 0.5392478302796528, + "grad_norm": 1.708029866218567, + "learning_rate": 2.1932803694403366e-05, + "loss": 5.5079, + "step": 5592 + }, + { + "epoch": 0.5393442622950819, + "grad_norm": 2.1371214389801025, + "learning_rate": 2.1925287299852668e-05, + "loss": 5.661, + "step": 5593 + }, + { + "epoch": 0.5394406943105111, + "grad_norm": 1.6327272653579712, + "learning_rate": 2.191777118749532e-05, + "loss": 5.5221, + "step": 5594 + }, + { + "epoch": 0.5395371263259402, + "grad_norm": 1.7298249006271362, + "learning_rate": 2.1910255358021128e-05, + "loss": 5.4332, + "step": 5595 + }, + { + "epoch": 0.5396335583413693, + "grad_norm": 1.6745151281356812, + "learning_rate": 2.190273981211991e-05, + "loss": 5.4314, + "step": 5596 + }, + { + "epoch": 0.5397299903567985, + "grad_norm": 1.7653107643127441, + "learning_rate": 2.1895224550481412e-05, + "loss": 5.7335, + "step": 5597 + }, + { + "epoch": 0.5398264223722276, + "grad_norm": 1.9542378187179565, + "learning_rate": 2.188770957379538e-05, + "loss": 5.4988, + "step": 5598 + }, + { + "epoch": 0.5399228543876567, + "grad_norm": 2.7657968997955322, + "learning_rate": 2.1880194882751542e-05, + "loss": 5.6858, + "step": 5599 + }, + { + "epoch": 0.5400192864030858, + "grad_norm": 2.5985147953033447, + "learning_rate": 2.1872680478039564e-05, + "loss": 5.4283, + "step": 5600 + }, + { + "epoch": 0.5401157184185149, + "grad_norm": 1.5065793991088867, + "learning_rate": 2.1865166360349136e-05, + "loss": 5.7124, + "step": 5601 + }, + { + "epoch": 0.5402121504339441, + "grad_norm": 1.7390047311782837, + "learning_rate": 2.1857652530369878e-05, + "loss": 5.6309, + "step": 5602 + }, + { + "epoch": 0.5403085824493732, + "grad_norm": 1.4927266836166382, + "learning_rate": 2.18501389887914e-05, + "loss": 5.4891, + "step": 5603 + }, + { + "epoch": 0.5404050144648023, + "grad_norm": 1.7365463972091675, + "learning_rate": 2.184262573630329e-05, + "loss": 5.5572, + "step": 5604 + }, + { + "epoch": 0.5405014464802315, + "grad_norm": 1.753395676612854, + "learning_rate": 2.1835112773595106e-05, + "loss": 5.596, + "step": 5605 + }, + { + "epoch": 0.5405978784956605, + "grad_norm": 1.4767613410949707, + "learning_rate": 2.182760010135637e-05, + "loss": 5.4777, + "step": 5606 + }, + { + "epoch": 0.5406943105110896, + "grad_norm": 1.7538809776306152, + "learning_rate": 2.18200877202766e-05, + "loss": 5.5217, + "step": 5607 + }, + { + "epoch": 0.5407907425265188, + "grad_norm": 1.676123023033142, + "learning_rate": 2.181257563104527e-05, + "loss": 5.2995, + "step": 5608 + }, + { + "epoch": 0.5408871745419479, + "grad_norm": 2.1342220306396484, + "learning_rate": 2.1805063834351808e-05, + "loss": 5.4789, + "step": 5609 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 2.595621347427368, + "learning_rate": 2.1797552330885667e-05, + "loss": 5.7829, + "step": 5610 + }, + { + "epoch": 0.5410800385728062, + "grad_norm": 2.0857834815979004, + "learning_rate": 2.1790041121336225e-05, + "loss": 5.5499, + "step": 5611 + }, + { + "epoch": 0.5411764705882353, + "grad_norm": 2.7853357791900635, + "learning_rate": 2.1782530206392865e-05, + "loss": 5.416, + "step": 5612 + }, + { + "epoch": 0.5412729026036645, + "grad_norm": 1.797324776649475, + "learning_rate": 2.1775019586744923e-05, + "loss": 5.4856, + "step": 5613 + }, + { + "epoch": 0.5413693346190935, + "grad_norm": 1.8038886785507202, + "learning_rate": 2.1767509263081707e-05, + "loss": 5.5886, + "step": 5614 + }, + { + "epoch": 0.5414657666345226, + "grad_norm": 1.4879153966903687, + "learning_rate": 2.1759999236092517e-05, + "loss": 5.4055, + "step": 5615 + }, + { + "epoch": 0.5415621986499518, + "grad_norm": 1.6524041891098022, + "learning_rate": 2.1752489506466616e-05, + "loss": 5.4812, + "step": 5616 + }, + { + "epoch": 0.5416586306653809, + "grad_norm": 1.837174654006958, + "learning_rate": 2.1744980074893215e-05, + "loss": 5.5622, + "step": 5617 + }, + { + "epoch": 0.5417550626808101, + "grad_norm": 1.6691921949386597, + "learning_rate": 2.173747094206155e-05, + "loss": 5.5352, + "step": 5618 + }, + { + "epoch": 0.5418514946962392, + "grad_norm": 1.512626051902771, + "learning_rate": 2.1729962108660774e-05, + "loss": 5.8172, + "step": 5619 + }, + { + "epoch": 0.5419479267116682, + "grad_norm": 1.9485856294631958, + "learning_rate": 2.172245357538006e-05, + "loss": 5.4409, + "step": 5620 + }, + { + "epoch": 0.5420443587270974, + "grad_norm": 1.6542912721633911, + "learning_rate": 2.1714945342908526e-05, + "loss": 5.551, + "step": 5621 + }, + { + "epoch": 0.5421407907425265, + "grad_norm": 1.6246281862258911, + "learning_rate": 2.1707437411935256e-05, + "loss": 5.6292, + "step": 5622 + }, + { + "epoch": 0.5422372227579556, + "grad_norm": 1.6438220739364624, + "learning_rate": 2.1699929783149334e-05, + "loss": 5.6505, + "step": 5623 + }, + { + "epoch": 0.5423336547733848, + "grad_norm": 1.451906681060791, + "learning_rate": 2.1692422457239792e-05, + "loss": 5.6213, + "step": 5624 + }, + { + "epoch": 0.5424300867888139, + "grad_norm": 1.388639211654663, + "learning_rate": 2.1684915434895644e-05, + "loss": 5.6261, + "step": 5625 + }, + { + "epoch": 0.542526518804243, + "grad_norm": 1.3010618686676025, + "learning_rate": 2.167740871680588e-05, + "loss": 5.4473, + "step": 5626 + }, + { + "epoch": 0.5426229508196722, + "grad_norm": 1.3649450540542603, + "learning_rate": 2.1669902303659458e-05, + "loss": 5.475, + "step": 5627 + }, + { + "epoch": 0.5427193828351012, + "grad_norm": 1.497362732887268, + "learning_rate": 2.1662396196145292e-05, + "loss": 5.5044, + "step": 5628 + }, + { + "epoch": 0.5428158148505304, + "grad_norm": 1.9276471138000488, + "learning_rate": 2.165489039495231e-05, + "loss": 5.6329, + "step": 5629 + }, + { + "epoch": 0.5429122468659595, + "grad_norm": 1.8021613359451294, + "learning_rate": 2.1647384900769356e-05, + "loss": 5.3936, + "step": 5630 + }, + { + "epoch": 0.5430086788813886, + "grad_norm": 1.7638517618179321, + "learning_rate": 2.1639879714285302e-05, + "loss": 5.4485, + "step": 5631 + }, + { + "epoch": 0.5431051108968178, + "grad_norm": 1.900415301322937, + "learning_rate": 2.1632374836188952e-05, + "loss": 5.4751, + "step": 5632 + }, + { + "epoch": 0.5432015429122469, + "grad_norm": 1.3103591203689575, + "learning_rate": 2.1624870267169084e-05, + "loss": 5.3879, + "step": 5633 + }, + { + "epoch": 0.543297974927676, + "grad_norm": 1.0852974653244019, + "learning_rate": 2.161736600791448e-05, + "loss": 5.4108, + "step": 5634 + }, + { + "epoch": 0.5433944069431051, + "grad_norm": 1.7793582677841187, + "learning_rate": 2.1609862059113857e-05, + "loss": 5.4186, + "step": 5635 + }, + { + "epoch": 0.5434908389585342, + "grad_norm": 1.8256845474243164, + "learning_rate": 2.1602358421455914e-05, + "loss": 5.5064, + "step": 5636 + }, + { + "epoch": 0.5435872709739633, + "grad_norm": 1.819149136543274, + "learning_rate": 2.1594855095629345e-05, + "loss": 5.3721, + "step": 5637 + }, + { + "epoch": 0.5436837029893925, + "grad_norm": 1.9043174982070923, + "learning_rate": 2.158735208232278e-05, + "loss": 5.5493, + "step": 5638 + }, + { + "epoch": 0.5437801350048216, + "grad_norm": 2.049851179122925, + "learning_rate": 2.1579849382224834e-05, + "loss": 5.5156, + "step": 5639 + }, + { + "epoch": 0.5438765670202508, + "grad_norm": 1.6881201267242432, + "learning_rate": 2.157234699602411e-05, + "loss": 5.4466, + "step": 5640 + }, + { + "epoch": 0.5439729990356799, + "grad_norm": 2.6744203567504883, + "learning_rate": 2.1564844924409154e-05, + "loss": 5.5028, + "step": 5641 + }, + { + "epoch": 0.5440694310511089, + "grad_norm": 2.4307470321655273, + "learning_rate": 2.155734316806851e-05, + "loss": 5.4598, + "step": 5642 + }, + { + "epoch": 0.5441658630665381, + "grad_norm": 1.6273126602172852, + "learning_rate": 2.1549841727690667e-05, + "loss": 5.5141, + "step": 5643 + }, + { + "epoch": 0.5442622950819672, + "grad_norm": 2.175199270248413, + "learning_rate": 2.1542340603964102e-05, + "loss": 5.485, + "step": 5644 + }, + { + "epoch": 0.5443587270973963, + "grad_norm": 2.042095184326172, + "learning_rate": 2.1534839797577268e-05, + "loss": 5.4227, + "step": 5645 + }, + { + "epoch": 0.5444551591128255, + "grad_norm": 1.6007698774337769, + "learning_rate": 2.152733930921857e-05, + "loss": 5.3162, + "step": 5646 + }, + { + "epoch": 0.5445515911282546, + "grad_norm": 1.5306603908538818, + "learning_rate": 2.1519839139576393e-05, + "loss": 5.3909, + "step": 5647 + }, + { + "epoch": 0.5446480231436837, + "grad_norm": 2.2977099418640137, + "learning_rate": 2.1512339289339096e-05, + "loss": 5.6053, + "step": 5648 + }, + { + "epoch": 0.5447444551591129, + "grad_norm": 1.5516207218170166, + "learning_rate": 2.1504839759195015e-05, + "loss": 5.5775, + "step": 5649 + }, + { + "epoch": 0.5448408871745419, + "grad_norm": 1.9998137950897217, + "learning_rate": 2.1497340549832425e-05, + "loss": 5.4709, + "step": 5650 + }, + { + "epoch": 0.5449373191899711, + "grad_norm": 2.2060883045196533, + "learning_rate": 2.148984166193962e-05, + "loss": 5.5618, + "step": 5651 + }, + { + "epoch": 0.5450337512054002, + "grad_norm": 1.4877640008926392, + "learning_rate": 2.1482343096204817e-05, + "loss": 5.5457, + "step": 5652 + }, + { + "epoch": 0.5451301832208293, + "grad_norm": 1.8618882894515991, + "learning_rate": 2.1474844853316245e-05, + "loss": 5.6367, + "step": 5653 + }, + { + "epoch": 0.5452266152362585, + "grad_norm": 2.1273231506347656, + "learning_rate": 2.1467346933962075e-05, + "loss": 5.6237, + "step": 5654 + }, + { + "epoch": 0.5453230472516876, + "grad_norm": 1.576305627822876, + "learning_rate": 2.1459849338830443e-05, + "loss": 5.6365, + "step": 5655 + }, + { + "epoch": 0.5454194792671166, + "grad_norm": 1.9970860481262207, + "learning_rate": 2.1452352068609493e-05, + "loss": 5.5993, + "step": 5656 + }, + { + "epoch": 0.5455159112825458, + "grad_norm": 1.8531895875930786, + "learning_rate": 2.14448551239873e-05, + "loss": 5.6569, + "step": 5657 + }, + { + "epoch": 0.5456123432979749, + "grad_norm": 1.5756255388259888, + "learning_rate": 2.143735850565192e-05, + "loss": 5.5877, + "step": 5658 + }, + { + "epoch": 0.545708775313404, + "grad_norm": 2.6023292541503906, + "learning_rate": 2.1429862214291406e-05, + "loss": 5.3029, + "step": 5659 + }, + { + "epoch": 0.5458052073288332, + "grad_norm": 2.0852432250976562, + "learning_rate": 2.142236625059374e-05, + "loss": 5.2002, + "step": 5660 + }, + { + "epoch": 0.5459016393442623, + "grad_norm": 1.9413737058639526, + "learning_rate": 2.1414870615246886e-05, + "loss": 5.3312, + "step": 5661 + }, + { + "epoch": 0.5459980713596915, + "grad_norm": 2.0887677669525146, + "learning_rate": 2.1407375308938807e-05, + "loss": 5.4555, + "step": 5662 + }, + { + "epoch": 0.5460945033751206, + "grad_norm": 1.6063437461853027, + "learning_rate": 2.1399880332357385e-05, + "loss": 5.4757, + "step": 5663 + }, + { + "epoch": 0.5461909353905496, + "grad_norm": 2.0414485931396484, + "learning_rate": 2.139238568619053e-05, + "loss": 5.6022, + "step": 5664 + }, + { + "epoch": 0.5462873674059788, + "grad_norm": 2.0015246868133545, + "learning_rate": 2.138489137112607e-05, + "loss": 5.6894, + "step": 5665 + }, + { + "epoch": 0.5463837994214079, + "grad_norm": 1.4228321313858032, + "learning_rate": 2.137739738785183e-05, + "loss": 5.5644, + "step": 5666 + }, + { + "epoch": 0.546480231436837, + "grad_norm": 1.7523770332336426, + "learning_rate": 2.13699037370556e-05, + "loss": 5.5823, + "step": 5667 + }, + { + "epoch": 0.5465766634522662, + "grad_norm": 1.7865937948226929, + "learning_rate": 2.136241041942514e-05, + "loss": 5.4943, + "step": 5668 + }, + { + "epoch": 0.5466730954676953, + "grad_norm": 1.8042173385620117, + "learning_rate": 2.135491743564817e-05, + "loss": 5.5432, + "step": 5669 + }, + { + "epoch": 0.5467695274831244, + "grad_norm": 1.8858678340911865, + "learning_rate": 2.1347424786412396e-05, + "loss": 5.4596, + "step": 5670 + }, + { + "epoch": 0.5468659594985535, + "grad_norm": 1.6345771551132202, + "learning_rate": 2.133993247240548e-05, + "loss": 5.464, + "step": 5671 + }, + { + "epoch": 0.5469623915139826, + "grad_norm": 1.841962218284607, + "learning_rate": 2.1332440494315047e-05, + "loss": 5.53, + "step": 5672 + }, + { + "epoch": 0.5470588235294118, + "grad_norm": 1.9260026216506958, + "learning_rate": 2.1324948852828724e-05, + "loss": 5.6257, + "step": 5673 + }, + { + "epoch": 0.5471552555448409, + "grad_norm": 1.4716160297393799, + "learning_rate": 2.1317457548634058e-05, + "loss": 5.6109, + "step": 5674 + }, + { + "epoch": 0.54725168756027, + "grad_norm": 1.919241189956665, + "learning_rate": 2.130996658241862e-05, + "loss": 5.6462, + "step": 5675 + }, + { + "epoch": 0.5473481195756992, + "grad_norm": 2.0764923095703125, + "learning_rate": 2.1302475954869904e-05, + "loss": 5.625, + "step": 5676 + }, + { + "epoch": 0.5474445515911283, + "grad_norm": 1.8427023887634277, + "learning_rate": 2.1294985666675385e-05, + "loss": 5.6086, + "step": 5677 + }, + { + "epoch": 0.5475409836065573, + "grad_norm": 2.2911078929901123, + "learning_rate": 2.1287495718522534e-05, + "loss": 5.2554, + "step": 5678 + }, + { + "epoch": 0.5476374156219865, + "grad_norm": 1.553037166595459, + "learning_rate": 2.1280006111098755e-05, + "loss": 5.663, + "step": 5679 + }, + { + "epoch": 0.5477338476374156, + "grad_norm": 1.8313497304916382, + "learning_rate": 2.1272516845091423e-05, + "loss": 5.5138, + "step": 5680 + }, + { + "epoch": 0.5478302796528447, + "grad_norm": 2.0757570266723633, + "learning_rate": 2.126502792118792e-05, + "loss": 5.6278, + "step": 5681 + }, + { + "epoch": 0.5479267116682739, + "grad_norm": 1.7472081184387207, + "learning_rate": 2.1257539340075557e-05, + "loss": 5.4886, + "step": 5682 + }, + { + "epoch": 0.548023143683703, + "grad_norm": 1.9537365436553955, + "learning_rate": 2.125005110244162e-05, + "loss": 5.7167, + "step": 5683 + }, + { + "epoch": 0.5481195756991322, + "grad_norm": 2.297106981277466, + "learning_rate": 2.1242563208973383e-05, + "loss": 5.4958, + "step": 5684 + }, + { + "epoch": 0.5482160077145612, + "grad_norm": 2.277207136154175, + "learning_rate": 2.123507566035806e-05, + "loss": 5.5834, + "step": 5685 + }, + { + "epoch": 0.5483124397299903, + "grad_norm": 1.7613626718521118, + "learning_rate": 2.1227588457282873e-05, + "loss": 5.5569, + "step": 5686 + }, + { + "epoch": 0.5484088717454195, + "grad_norm": 2.130985736846924, + "learning_rate": 2.1220101600434968e-05, + "loss": 5.5442, + "step": 5687 + }, + { + "epoch": 0.5485053037608486, + "grad_norm": 2.0918235778808594, + "learning_rate": 2.1212615090501484e-05, + "loss": 5.5396, + "step": 5688 + }, + { + "epoch": 0.5486017357762777, + "grad_norm": 1.767620325088501, + "learning_rate": 2.1205128928169526e-05, + "loss": 5.6422, + "step": 5689 + }, + { + "epoch": 0.5486981677917069, + "grad_norm": 1.8531399965286255, + "learning_rate": 2.119764311412617e-05, + "loss": 5.5101, + "step": 5690 + }, + { + "epoch": 0.548794599807136, + "grad_norm": 1.6258960962295532, + "learning_rate": 2.119015764905844e-05, + "loss": 5.6132, + "step": 5691 + }, + { + "epoch": 0.548891031822565, + "grad_norm": 1.4370068311691284, + "learning_rate": 2.1182672533653362e-05, + "loss": 5.5141, + "step": 5692 + }, + { + "epoch": 0.5489874638379942, + "grad_norm": 1.6419092416763306, + "learning_rate": 2.1175187768597898e-05, + "loss": 5.639, + "step": 5693 + }, + { + "epoch": 0.5490838958534233, + "grad_norm": 1.5860135555267334, + "learning_rate": 2.116770335457898e-05, + "loss": 5.6757, + "step": 5694 + }, + { + "epoch": 0.5491803278688525, + "grad_norm": 2.27889347076416, + "learning_rate": 2.116021929228355e-05, + "loss": 5.541, + "step": 5695 + }, + { + "epoch": 0.5492767598842816, + "grad_norm": 1.8227137327194214, + "learning_rate": 2.1152735582398452e-05, + "loss": 5.5143, + "step": 5696 + }, + { + "epoch": 0.5493731918997107, + "grad_norm": 1.8320496082305908, + "learning_rate": 2.1145252225610557e-05, + "loss": 5.6252, + "step": 5697 + }, + { + "epoch": 0.5494696239151399, + "grad_norm": 2.2401840686798096, + "learning_rate": 2.113776922260667e-05, + "loss": 5.6756, + "step": 5698 + }, + { + "epoch": 0.549566055930569, + "grad_norm": 1.952209234237671, + "learning_rate": 2.113028657407356e-05, + "loss": 5.5632, + "step": 5699 + }, + { + "epoch": 0.549662487945998, + "grad_norm": 2.1973695755004883, + "learning_rate": 2.1122804280697996e-05, + "loss": 5.4345, + "step": 5700 + }, + { + "epoch": 0.5497589199614272, + "grad_norm": 2.1821820735931396, + "learning_rate": 2.1115322343166682e-05, + "loss": 5.4799, + "step": 5701 + }, + { + "epoch": 0.5498553519768563, + "grad_norm": 1.9344255924224854, + "learning_rate": 2.1107840762166288e-05, + "loss": 5.456, + "step": 5702 + }, + { + "epoch": 0.5499517839922854, + "grad_norm": 2.870413064956665, + "learning_rate": 2.1100359538383496e-05, + "loss": 5.6509, + "step": 5703 + }, + { + "epoch": 0.5500482160077146, + "grad_norm": 2.653101682662964, + "learning_rate": 2.10928786725049e-05, + "loss": 5.5477, + "step": 5704 + }, + { + "epoch": 0.5501446480231437, + "grad_norm": 1.7625726461410522, + "learning_rate": 2.108539816521708e-05, + "loss": 5.5244, + "step": 5705 + }, + { + "epoch": 0.5502410800385729, + "grad_norm": 2.3924551010131836, + "learning_rate": 2.1077918017206617e-05, + "loss": 5.5311, + "step": 5706 + }, + { + "epoch": 0.5503375120540019, + "grad_norm": 2.8589627742767334, + "learning_rate": 2.107043822915999e-05, + "loss": 5.5442, + "step": 5707 + }, + { + "epoch": 0.550433944069431, + "grad_norm": 2.5031630992889404, + "learning_rate": 2.1062958801763725e-05, + "loss": 5.5771, + "step": 5708 + }, + { + "epoch": 0.5505303760848602, + "grad_norm": 1.7505114078521729, + "learning_rate": 2.1055479735704244e-05, + "loss": 5.5352, + "step": 5709 + }, + { + "epoch": 0.5506268081002893, + "grad_norm": 1.5568890571594238, + "learning_rate": 2.104800103166798e-05, + "loss": 5.533, + "step": 5710 + }, + { + "epoch": 0.5507232401157184, + "grad_norm": 2.157402753829956, + "learning_rate": 2.1040522690341318e-05, + "loss": 5.4762, + "step": 5711 + }, + { + "epoch": 0.5508196721311476, + "grad_norm": 2.0164878368377686, + "learning_rate": 2.1033044712410617e-05, + "loss": 5.5931, + "step": 5712 + }, + { + "epoch": 0.5509161041465767, + "grad_norm": 1.9324051141738892, + "learning_rate": 2.1025567098562177e-05, + "loss": 5.5096, + "step": 5713 + }, + { + "epoch": 0.5510125361620057, + "grad_norm": 1.7487456798553467, + "learning_rate": 2.1018089849482307e-05, + "loss": 5.6317, + "step": 5714 + }, + { + "epoch": 0.5511089681774349, + "grad_norm": 1.5970535278320312, + "learning_rate": 2.1010612965857248e-05, + "loss": 5.5693, + "step": 5715 + }, + { + "epoch": 0.551205400192864, + "grad_norm": 1.5804675817489624, + "learning_rate": 2.1003136448373215e-05, + "loss": 5.5115, + "step": 5716 + }, + { + "epoch": 0.5513018322082932, + "grad_norm": 1.811284065246582, + "learning_rate": 2.0995660297716408e-05, + "loss": 5.4427, + "step": 5717 + }, + { + "epoch": 0.5513982642237223, + "grad_norm": 1.914981484413147, + "learning_rate": 2.0988184514572956e-05, + "loss": 5.4338, + "step": 5718 + }, + { + "epoch": 0.5514946962391514, + "grad_norm": 1.5200984477996826, + "learning_rate": 2.0980709099629006e-05, + "loss": 5.6495, + "step": 5719 + }, + { + "epoch": 0.5515911282545806, + "grad_norm": 1.892048716545105, + "learning_rate": 2.097323405357063e-05, + "loss": 5.4996, + "step": 5720 + }, + { + "epoch": 0.5516875602700096, + "grad_norm": 1.627821445465088, + "learning_rate": 2.096575937708386e-05, + "loss": 5.5939, + "step": 5721 + }, + { + "epoch": 0.5517839922854387, + "grad_norm": 1.732163429260254, + "learning_rate": 2.0958285070854744e-05, + "loss": 5.4376, + "step": 5722 + }, + { + "epoch": 0.5518804243008679, + "grad_norm": 1.6898339986801147, + "learning_rate": 2.095081113556925e-05, + "loss": 5.72, + "step": 5723 + }, + { + "epoch": 0.551976856316297, + "grad_norm": 1.7873767614364624, + "learning_rate": 2.0943337571913313e-05, + "loss": 5.5828, + "step": 5724 + }, + { + "epoch": 0.5520732883317261, + "grad_norm": 1.5562093257904053, + "learning_rate": 2.0935864380572874e-05, + "loss": 5.4975, + "step": 5725 + }, + { + "epoch": 0.5521697203471553, + "grad_norm": 1.4421929121017456, + "learning_rate": 2.0928391562233795e-05, + "loss": 5.3987, + "step": 5726 + }, + { + "epoch": 0.5522661523625844, + "grad_norm": 1.5630598068237305, + "learning_rate": 2.0920919117581922e-05, + "loss": 5.5038, + "step": 5727 + }, + { + "epoch": 0.5523625843780136, + "grad_norm": 1.370771884918213, + "learning_rate": 2.091344704730308e-05, + "loss": 5.4743, + "step": 5728 + }, + { + "epoch": 0.5524590163934426, + "grad_norm": 1.438355803489685, + "learning_rate": 2.090597535208303e-05, + "loss": 5.536, + "step": 5729 + }, + { + "epoch": 0.5525554484088717, + "grad_norm": 1.7312432527542114, + "learning_rate": 2.089850403260753e-05, + "loss": 5.5248, + "step": 5730 + }, + { + "epoch": 0.5526518804243009, + "grad_norm": 1.3860219717025757, + "learning_rate": 2.089103308956228e-05, + "loss": 5.5411, + "step": 5731 + }, + { + "epoch": 0.55274831243973, + "grad_norm": 1.3214064836502075, + "learning_rate": 2.0883562523632953e-05, + "loss": 5.5284, + "step": 5732 + }, + { + "epoch": 0.5528447444551591, + "grad_norm": 1.58625066280365, + "learning_rate": 2.0876092335505193e-05, + "loss": 5.4483, + "step": 5733 + }, + { + "epoch": 0.5529411764705883, + "grad_norm": 2.1547539234161377, + "learning_rate": 2.0868622525864605e-05, + "loss": 5.3812, + "step": 5734 + }, + { + "epoch": 0.5530376084860174, + "grad_norm": 2.036003351211548, + "learning_rate": 2.0861153095396748e-05, + "loss": 5.5132, + "step": 5735 + }, + { + "epoch": 0.5531340405014464, + "grad_norm": 1.213797926902771, + "learning_rate": 2.0853684044787178e-05, + "loss": 5.5194, + "step": 5736 + }, + { + "epoch": 0.5532304725168756, + "grad_norm": 1.5444318056106567, + "learning_rate": 2.084621537472138e-05, + "loss": 5.4865, + "step": 5737 + }, + { + "epoch": 0.5533269045323047, + "grad_norm": 1.8458729982376099, + "learning_rate": 2.0838747085884814e-05, + "loss": 5.4823, + "step": 5738 + }, + { + "epoch": 0.5534233365477339, + "grad_norm": 1.570008635520935, + "learning_rate": 2.083127917896293e-05, + "loss": 5.4907, + "step": 5739 + }, + { + "epoch": 0.553519768563163, + "grad_norm": 1.2956596612930298, + "learning_rate": 2.08238116546411e-05, + "loss": 5.5085, + "step": 5740 + }, + { + "epoch": 0.5536162005785921, + "grad_norm": 1.4708337783813477, + "learning_rate": 2.081634451360471e-05, + "loss": 5.4112, + "step": 5741 + }, + { + "epoch": 0.5537126325940213, + "grad_norm": 1.4037318229675293, + "learning_rate": 2.0808877756539068e-05, + "loss": 5.3838, + "step": 5742 + }, + { + "epoch": 0.5538090646094503, + "grad_norm": 1.9899052381515503, + "learning_rate": 2.0801411384129456e-05, + "loss": 5.3658, + "step": 5743 + }, + { + "epoch": 0.5539054966248794, + "grad_norm": 1.6454194784164429, + "learning_rate": 2.0793945397061152e-05, + "loss": 5.5427, + "step": 5744 + }, + { + "epoch": 0.5540019286403086, + "grad_norm": 1.5299161672592163, + "learning_rate": 2.078647979601936e-05, + "loss": 5.5674, + "step": 5745 + }, + { + "epoch": 0.5540983606557377, + "grad_norm": 2.1034741401672363, + "learning_rate": 2.077901458168926e-05, + "loss": 5.5011, + "step": 5746 + }, + { + "epoch": 0.5541947926711668, + "grad_norm": 1.7416125535964966, + "learning_rate": 2.0771549754756016e-05, + "loss": 5.539, + "step": 5747 + }, + { + "epoch": 0.554291224686596, + "grad_norm": 1.9454588890075684, + "learning_rate": 2.076408531590472e-05, + "loss": 5.5368, + "step": 5748 + }, + { + "epoch": 0.5543876567020251, + "grad_norm": 1.516812801361084, + "learning_rate": 2.0756621265820468e-05, + "loss": 5.5647, + "step": 5749 + }, + { + "epoch": 0.5544840887174542, + "grad_norm": 1.6972177028656006, + "learning_rate": 2.0749157605188293e-05, + "loss": 5.617, + "step": 5750 + }, + { + "epoch": 0.5545805207328833, + "grad_norm": 1.5411770343780518, + "learning_rate": 2.0741694334693192e-05, + "loss": 5.6268, + "step": 5751 + }, + { + "epoch": 0.5546769527483124, + "grad_norm": 1.6948151588439941, + "learning_rate": 2.0734231455020157e-05, + "loss": 5.5468, + "step": 5752 + }, + { + "epoch": 0.5547733847637416, + "grad_norm": 1.4514299631118774, + "learning_rate": 2.07267689668541e-05, + "loss": 5.5372, + "step": 5753 + }, + { + "epoch": 0.5548698167791707, + "grad_norm": 1.7186386585235596, + "learning_rate": 2.0719306870879933e-05, + "loss": 5.5563, + "step": 5754 + }, + { + "epoch": 0.5549662487945998, + "grad_norm": 1.8996683359146118, + "learning_rate": 2.0711845167782506e-05, + "loss": 5.4921, + "step": 5755 + }, + { + "epoch": 0.555062680810029, + "grad_norm": 1.8111474514007568, + "learning_rate": 2.0704383858246662e-05, + "loss": 5.5431, + "step": 5756 + }, + { + "epoch": 0.555159112825458, + "grad_norm": 1.6732524633407593, + "learning_rate": 2.0696922942957167e-05, + "loss": 5.4773, + "step": 5757 + }, + { + "epoch": 0.5552555448408871, + "grad_norm": 2.1825757026672363, + "learning_rate": 2.06894624225988e-05, + "loss": 5.4375, + "step": 5758 + }, + { + "epoch": 0.5553519768563163, + "grad_norm": 2.0786616802215576, + "learning_rate": 2.0682002297856268e-05, + "loss": 5.473, + "step": 5759 + }, + { + "epoch": 0.5554484088717454, + "grad_norm": 1.8522083759307861, + "learning_rate": 2.0674542569414238e-05, + "loss": 5.5534, + "step": 5760 + }, + { + "epoch": 0.5555448408871746, + "grad_norm": 1.388859510421753, + "learning_rate": 2.0667083237957377e-05, + "loss": 5.4071, + "step": 5761 + }, + { + "epoch": 0.5556412729026037, + "grad_norm": 2.1966216564178467, + "learning_rate": 2.0659624304170274e-05, + "loss": 5.4975, + "step": 5762 + }, + { + "epoch": 0.5557377049180328, + "grad_norm": 1.6820154190063477, + "learning_rate": 2.065216576873752e-05, + "loss": 5.4827, + "step": 5763 + }, + { + "epoch": 0.555834136933462, + "grad_norm": 1.4093555212020874, + "learning_rate": 2.064470763234364e-05, + "loss": 5.6654, + "step": 5764 + }, + { + "epoch": 0.555930568948891, + "grad_norm": 1.4294488430023193, + "learning_rate": 2.0637249895673123e-05, + "loss": 5.5442, + "step": 5765 + }, + { + "epoch": 0.5560270009643201, + "grad_norm": 1.4442083835601807, + "learning_rate": 2.0629792559410454e-05, + "loss": 5.545, + "step": 5766 + }, + { + "epoch": 0.5561234329797493, + "grad_norm": 2.100022315979004, + "learning_rate": 2.0622335624240043e-05, + "loss": 5.5544, + "step": 5767 + }, + { + "epoch": 0.5562198649951784, + "grad_norm": 1.8040237426757812, + "learning_rate": 2.061487909084627e-05, + "loss": 5.513, + "step": 5768 + }, + { + "epoch": 0.5563162970106075, + "grad_norm": 2.0794782638549805, + "learning_rate": 2.0607422959913507e-05, + "loss": 5.3032, + "step": 5769 + }, + { + "epoch": 0.5564127290260367, + "grad_norm": 2.16959547996521, + "learning_rate": 2.059996723212605e-05, + "loss": 5.127, + "step": 5770 + }, + { + "epoch": 0.5565091610414657, + "grad_norm": 3.0118155479431152, + "learning_rate": 2.059251190816819e-05, + "loss": 5.6877, + "step": 5771 + }, + { + "epoch": 0.5566055930568949, + "grad_norm": 2.673146963119507, + "learning_rate": 2.058505698872416e-05, + "loss": 5.6007, + "step": 5772 + }, + { + "epoch": 0.556702025072324, + "grad_norm": 1.7594493627548218, + "learning_rate": 2.057760247447817e-05, + "loss": 5.5821, + "step": 5773 + }, + { + "epoch": 0.5567984570877531, + "grad_norm": 2.0210516452789307, + "learning_rate": 2.0570148366114375e-05, + "loss": 5.6419, + "step": 5774 + }, + { + "epoch": 0.5568948891031823, + "grad_norm": 1.6254016160964966, + "learning_rate": 2.056269466431692e-05, + "loss": 5.6043, + "step": 5775 + }, + { + "epoch": 0.5569913211186114, + "grad_norm": 1.5290414094924927, + "learning_rate": 2.055524136976988e-05, + "loss": 5.526, + "step": 5776 + }, + { + "epoch": 0.5570877531340405, + "grad_norm": 1.881638765335083, + "learning_rate": 2.0547788483157318e-05, + "loss": 5.5466, + "step": 5777 + }, + { + "epoch": 0.5571841851494697, + "grad_norm": 1.6366477012634277, + "learning_rate": 2.054033600516325e-05, + "loss": 5.5051, + "step": 5778 + }, + { + "epoch": 0.5572806171648987, + "grad_norm": 1.7158210277557373, + "learning_rate": 2.0532883936471652e-05, + "loss": 5.6652, + "step": 5779 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 2.835726022720337, + "learning_rate": 2.0525432277766473e-05, + "loss": 5.5576, + "step": 5780 + }, + { + "epoch": 0.557473481195757, + "grad_norm": 1.818518042564392, + "learning_rate": 2.0517981029731616e-05, + "loss": 5.5613, + "step": 5781 + }, + { + "epoch": 0.5575699132111861, + "grad_norm": 1.6020101308822632, + "learning_rate": 2.051053019305093e-05, + "loss": 5.5059, + "step": 5782 + }, + { + "epoch": 0.5576663452266153, + "grad_norm": 2.3462984561920166, + "learning_rate": 2.050307976840827e-05, + "loss": 5.6125, + "step": 5783 + }, + { + "epoch": 0.5577627772420444, + "grad_norm": 1.991250991821289, + "learning_rate": 2.04956297564874e-05, + "loss": 5.5935, + "step": 5784 + }, + { + "epoch": 0.5578592092574735, + "grad_norm": 1.5823825597763062, + "learning_rate": 2.0488180157972102e-05, + "loss": 5.516, + "step": 5785 + }, + { + "epoch": 0.5579556412729026, + "grad_norm": 2.061142921447754, + "learning_rate": 2.0480730973546074e-05, + "loss": 5.5797, + "step": 5786 + }, + { + "epoch": 0.5580520732883317, + "grad_norm": 2.2911438941955566, + "learning_rate": 2.0473282203892984e-05, + "loss": 5.5428, + "step": 5787 + }, + { + "epoch": 0.5581485053037608, + "grad_norm": 2.635768175125122, + "learning_rate": 2.0465833849696493e-05, + "loss": 5.4459, + "step": 5788 + }, + { + "epoch": 0.55824493731919, + "grad_norm": 2.7358624935150146, + "learning_rate": 2.0458385911640188e-05, + "loss": 5.4921, + "step": 5789 + }, + { + "epoch": 0.5583413693346191, + "grad_norm": 2.544019937515259, + "learning_rate": 2.0450938390407624e-05, + "loss": 5.6918, + "step": 5790 + }, + { + "epoch": 0.5584378013500482, + "grad_norm": 1.626944661140442, + "learning_rate": 2.0443491286682347e-05, + "loss": 5.5116, + "step": 5791 + }, + { + "epoch": 0.5585342333654774, + "grad_norm": 2.085580587387085, + "learning_rate": 2.043604460114783e-05, + "loss": 5.3975, + "step": 5792 + }, + { + "epoch": 0.5586306653809064, + "grad_norm": 2.7356207370758057, + "learning_rate": 2.042859833448752e-05, + "loss": 5.5483, + "step": 5793 + }, + { + "epoch": 0.5587270973963356, + "grad_norm": 1.8508514165878296, + "learning_rate": 2.0421152487384827e-05, + "loss": 5.5589, + "step": 5794 + }, + { + "epoch": 0.5588235294117647, + "grad_norm": 1.6093170642852783, + "learning_rate": 2.0413707060523123e-05, + "loss": 5.5486, + "step": 5795 + }, + { + "epoch": 0.5589199614271938, + "grad_norm": 1.5262874364852905, + "learning_rate": 2.0406262054585738e-05, + "loss": 5.4717, + "step": 5796 + }, + { + "epoch": 0.559016393442623, + "grad_norm": 1.6614208221435547, + "learning_rate": 2.039881747025598e-05, + "loss": 5.5558, + "step": 5797 + }, + { + "epoch": 0.5591128254580521, + "grad_norm": 1.5311172008514404, + "learning_rate": 2.0391373308217077e-05, + "loss": 5.6151, + "step": 5798 + }, + { + "epoch": 0.5592092574734812, + "grad_norm": 2.2753777503967285, + "learning_rate": 2.038392956915227e-05, + "loss": 5.3041, + "step": 5799 + }, + { + "epoch": 0.5593056894889104, + "grad_norm": 1.6495474576950073, + "learning_rate": 2.0376486253744727e-05, + "loss": 5.5275, + "step": 5800 + }, + { + "epoch": 0.5594021215043394, + "grad_norm": 1.696304440498352, + "learning_rate": 2.036904336267757e-05, + "loss": 5.563, + "step": 5801 + }, + { + "epoch": 0.5594985535197685, + "grad_norm": 1.7004916667938232, + "learning_rate": 2.0361600896633924e-05, + "loss": 5.4647, + "step": 5802 + }, + { + "epoch": 0.5595949855351977, + "grad_norm": 1.3892794847488403, + "learning_rate": 2.0354158856296846e-05, + "loss": 5.4105, + "step": 5803 + }, + { + "epoch": 0.5596914175506268, + "grad_norm": 1.6968567371368408, + "learning_rate": 2.0346717242349333e-05, + "loss": 5.507, + "step": 5804 + }, + { + "epoch": 0.559787849566056, + "grad_norm": 1.3504688739776611, + "learning_rate": 2.0339276055474398e-05, + "loss": 5.5396, + "step": 5805 + }, + { + "epoch": 0.5598842815814851, + "grad_norm": 1.5611612796783447, + "learning_rate": 2.0331835296354958e-05, + "loss": 5.436, + "step": 5806 + }, + { + "epoch": 0.5599807135969141, + "grad_norm": 1.8476518392562866, + "learning_rate": 2.032439496567394e-05, + "loss": 5.5647, + "step": 5807 + }, + { + "epoch": 0.5600771456123433, + "grad_norm": 1.4748663902282715, + "learning_rate": 2.0316955064114196e-05, + "loss": 5.2848, + "step": 5808 + }, + { + "epoch": 0.5601735776277724, + "grad_norm": 1.258318305015564, + "learning_rate": 2.0309515592358543e-05, + "loss": 5.5682, + "step": 5809 + }, + { + "epoch": 0.5602700096432015, + "grad_norm": 1.6293469667434692, + "learning_rate": 2.0302076551089787e-05, + "loss": 5.4371, + "step": 5810 + }, + { + "epoch": 0.5603664416586307, + "grad_norm": 1.7234817743301392, + "learning_rate": 2.029463794099066e-05, + "loss": 5.4443, + "step": 5811 + }, + { + "epoch": 0.5604628736740598, + "grad_norm": 1.59159517288208, + "learning_rate": 2.0287199762743867e-05, + "loss": 5.5224, + "step": 5812 + }, + { + "epoch": 0.5605593056894889, + "grad_norm": 1.6536073684692383, + "learning_rate": 2.0279762017032086e-05, + "loss": 5.5757, + "step": 5813 + }, + { + "epoch": 0.5606557377049181, + "grad_norm": 1.8301277160644531, + "learning_rate": 2.027232470453793e-05, + "loss": 5.5633, + "step": 5814 + }, + { + "epoch": 0.5607521697203471, + "grad_norm": 1.8961528539657593, + "learning_rate": 2.0264887825944e-05, + "loss": 5.5161, + "step": 5815 + }, + { + "epoch": 0.5608486017357763, + "grad_norm": 2.079192876815796, + "learning_rate": 2.025745138193283e-05, + "loss": 5.4584, + "step": 5816 + }, + { + "epoch": 0.5609450337512054, + "grad_norm": 1.6179825067520142, + "learning_rate": 2.025001537318694e-05, + "loss": 5.5818, + "step": 5817 + }, + { + "epoch": 0.5610414657666345, + "grad_norm": 1.8239755630493164, + "learning_rate": 2.0242579800388792e-05, + "loss": 5.3331, + "step": 5818 + }, + { + "epoch": 0.5611378977820637, + "grad_norm": 1.5830739736557007, + "learning_rate": 2.0235144664220817e-05, + "loss": 5.3233, + "step": 5819 + }, + { + "epoch": 0.5612343297974928, + "grad_norm": 1.6913461685180664, + "learning_rate": 2.0227709965365392e-05, + "loss": 5.5986, + "step": 5820 + }, + { + "epoch": 0.5613307618129219, + "grad_norm": 1.6790339946746826, + "learning_rate": 2.0220275704504883e-05, + "loss": 5.5267, + "step": 5821 + }, + { + "epoch": 0.561427193828351, + "grad_norm": 1.7599782943725586, + "learning_rate": 2.0212841882321586e-05, + "loss": 5.5347, + "step": 5822 + }, + { + "epoch": 0.5615236258437801, + "grad_norm": 1.8210713863372803, + "learning_rate": 2.020540849949776e-05, + "loss": 5.4969, + "step": 5823 + }, + { + "epoch": 0.5616200578592092, + "grad_norm": 1.388752818107605, + "learning_rate": 2.0197975556715654e-05, + "loss": 5.6013, + "step": 5824 + }, + { + "epoch": 0.5617164898746384, + "grad_norm": 2.3632113933563232, + "learning_rate": 2.0190543054657434e-05, + "loss": 5.6886, + "step": 5825 + }, + { + "epoch": 0.5618129218900675, + "grad_norm": 1.7446085214614868, + "learning_rate": 2.0183110994005246e-05, + "loss": 5.3159, + "step": 5826 + }, + { + "epoch": 0.5619093539054967, + "grad_norm": 2.5839757919311523, + "learning_rate": 2.0175679375441214e-05, + "loss": 5.3964, + "step": 5827 + }, + { + "epoch": 0.5620057859209258, + "grad_norm": 2.316575765609741, + "learning_rate": 2.016824819964738e-05, + "loss": 5.4846, + "step": 5828 + }, + { + "epoch": 0.5621022179363548, + "grad_norm": 2.031137704849243, + "learning_rate": 2.016081746730579e-05, + "loss": 5.3576, + "step": 5829 + }, + { + "epoch": 0.562198649951784, + "grad_norm": 2.0208542346954346, + "learning_rate": 2.0153387179098415e-05, + "loss": 5.455, + "step": 5830 + }, + { + "epoch": 0.5622950819672131, + "grad_norm": 2.693835496902466, + "learning_rate": 2.0145957335707193e-05, + "loss": 5.5396, + "step": 5831 + }, + { + "epoch": 0.5623915139826422, + "grad_norm": 1.993638038635254, + "learning_rate": 2.013852793781404e-05, + "loss": 5.464, + "step": 5832 + }, + { + "epoch": 0.5624879459980714, + "grad_norm": 1.8055614233016968, + "learning_rate": 2.0131098986100808e-05, + "loss": 5.6388, + "step": 5833 + }, + { + "epoch": 0.5625843780135005, + "grad_norm": 2.418452262878418, + "learning_rate": 2.0123670481249307e-05, + "loss": 5.5707, + "step": 5834 + }, + { + "epoch": 0.5626808100289296, + "grad_norm": 2.5128581523895264, + "learning_rate": 2.0116242423941342e-05, + "loss": 5.4949, + "step": 5835 + }, + { + "epoch": 0.5627772420443587, + "grad_norm": 1.5268948078155518, + "learning_rate": 2.010881481485863e-05, + "loss": 5.4245, + "step": 5836 + }, + { + "epoch": 0.5628736740597878, + "grad_norm": 2.614649534225464, + "learning_rate": 2.0101387654682873e-05, + "loss": 5.5153, + "step": 5837 + }, + { + "epoch": 0.562970106075217, + "grad_norm": 3.1236705780029297, + "learning_rate": 2.0093960944095728e-05, + "loss": 5.4583, + "step": 5838 + }, + { + "epoch": 0.5630665380906461, + "grad_norm": 1.9762530326843262, + "learning_rate": 2.008653468377881e-05, + "loss": 5.4898, + "step": 5839 + }, + { + "epoch": 0.5631629701060752, + "grad_norm": 1.7289601564407349, + "learning_rate": 2.007910887441369e-05, + "loss": 5.359, + "step": 5840 + }, + { + "epoch": 0.5632594021215044, + "grad_norm": 2.4474964141845703, + "learning_rate": 2.007168351668191e-05, + "loss": 5.4411, + "step": 5841 + }, + { + "epoch": 0.5633558341369335, + "grad_norm": 1.7955546379089355, + "learning_rate": 2.0064258611264937e-05, + "loss": 5.4728, + "step": 5842 + }, + { + "epoch": 0.5634522661523625, + "grad_norm": 1.4739017486572266, + "learning_rate": 2.005683415884424e-05, + "loss": 5.589, + "step": 5843 + }, + { + "epoch": 0.5635486981677917, + "grad_norm": 1.627679705619812, + "learning_rate": 2.004941016010123e-05, + "loss": 5.5869, + "step": 5844 + }, + { + "epoch": 0.5636451301832208, + "grad_norm": 1.3053466081619263, + "learning_rate": 2.0041986615717245e-05, + "loss": 5.5243, + "step": 5845 + }, + { + "epoch": 0.5637415621986499, + "grad_norm": 1.842591404914856, + "learning_rate": 2.0034563526373638e-05, + "loss": 5.4174, + "step": 5846 + }, + { + "epoch": 0.5638379942140791, + "grad_norm": 1.5907248258590698, + "learning_rate": 2.002714089275168e-05, + "loss": 5.4347, + "step": 5847 + }, + { + "epoch": 0.5639344262295082, + "grad_norm": 1.5882896184921265, + "learning_rate": 2.0019718715532597e-05, + "loss": 5.4542, + "step": 5848 + }, + { + "epoch": 0.5640308582449374, + "grad_norm": 1.7525537014007568, + "learning_rate": 2.0012296995397613e-05, + "loss": 5.4359, + "step": 5849 + }, + { + "epoch": 0.5641272902603665, + "grad_norm": 1.6460729837417603, + "learning_rate": 2.0004875733027863e-05, + "loss": 5.4897, + "step": 5850 + }, + { + "epoch": 0.5642237222757955, + "grad_norm": 1.7738380432128906, + "learning_rate": 1.9997454929104477e-05, + "loss": 5.4054, + "step": 5851 + }, + { + "epoch": 0.5643201542912247, + "grad_norm": 1.7868574857711792, + "learning_rate": 1.9990034584308524e-05, + "loss": 5.4796, + "step": 5852 + }, + { + "epoch": 0.5644165863066538, + "grad_norm": 1.3878684043884277, + "learning_rate": 1.998261469932102e-05, + "loss": 5.6152, + "step": 5853 + }, + { + "epoch": 0.5645130183220829, + "grad_norm": 1.1990079879760742, + "learning_rate": 1.997519527482297e-05, + "loss": 5.5906, + "step": 5854 + }, + { + "epoch": 0.5646094503375121, + "grad_norm": 1.6445817947387695, + "learning_rate": 1.9967776311495312e-05, + "loss": 5.5734, + "step": 5855 + }, + { + "epoch": 0.5647058823529412, + "grad_norm": 2.1676523685455322, + "learning_rate": 1.9960357810018948e-05, + "loss": 5.4331, + "step": 5856 + }, + { + "epoch": 0.5648023143683703, + "grad_norm": 1.5227984189987183, + "learning_rate": 1.995293977107475e-05, + "loss": 5.5482, + "step": 5857 + }, + { + "epoch": 0.5648987463837994, + "grad_norm": 1.4600393772125244, + "learning_rate": 1.994552219534352e-05, + "loss": 5.6394, + "step": 5858 + }, + { + "epoch": 0.5649951783992285, + "grad_norm": 1.59364914894104, + "learning_rate": 1.9938105083506043e-05, + "loss": 5.439, + "step": 5859 + }, + { + "epoch": 0.5650916104146577, + "grad_norm": 1.6435110569000244, + "learning_rate": 1.993068843624305e-05, + "loss": 5.3831, + "step": 5860 + }, + { + "epoch": 0.5651880424300868, + "grad_norm": 1.643133282661438, + "learning_rate": 1.992327225423523e-05, + "loss": 5.3627, + "step": 5861 + }, + { + "epoch": 0.5652844744455159, + "grad_norm": 1.8148629665374756, + "learning_rate": 1.991585653816324e-05, + "loss": 5.4532, + "step": 5862 + }, + { + "epoch": 0.5653809064609451, + "grad_norm": 2.333003520965576, + "learning_rate": 1.9908441288707683e-05, + "loss": 5.4243, + "step": 5863 + }, + { + "epoch": 0.5654773384763742, + "grad_norm": 2.517366647720337, + "learning_rate": 1.9901026506549107e-05, + "loss": 5.1157, + "step": 5864 + }, + { + "epoch": 0.5655737704918032, + "grad_norm": 1.8328700065612793, + "learning_rate": 1.989361219236805e-05, + "loss": 5.1094, + "step": 5865 + }, + { + "epoch": 0.5656702025072324, + "grad_norm": 2.9924581050872803, + "learning_rate": 1.9886198346844987e-05, + "loss": 5.0565, + "step": 5866 + }, + { + "epoch": 0.5657666345226615, + "grad_norm": 1.527276873588562, + "learning_rate": 1.987878497066033e-05, + "loss": 5.214, + "step": 5867 + }, + { + "epoch": 0.5658630665380906, + "grad_norm": 2.7964608669281006, + "learning_rate": 1.98713720644945e-05, + "loss": 5.5743, + "step": 5868 + }, + { + "epoch": 0.5659594985535198, + "grad_norm": 2.3223090171813965, + "learning_rate": 1.986395962902782e-05, + "loss": 5.6124, + "step": 5869 + }, + { + "epoch": 0.5660559305689489, + "grad_norm": 1.2507699728012085, + "learning_rate": 1.9856547664940607e-05, + "loss": 5.6217, + "step": 5870 + }, + { + "epoch": 0.5661523625843781, + "grad_norm": 1.7840633392333984, + "learning_rate": 1.9849136172913117e-05, + "loss": 5.6703, + "step": 5871 + }, + { + "epoch": 0.5662487945998071, + "grad_norm": 1.798821210861206, + "learning_rate": 1.9841725153625568e-05, + "loss": 5.6193, + "step": 5872 + }, + { + "epoch": 0.5663452266152362, + "grad_norm": 1.32755708694458, + "learning_rate": 1.9834314607758143e-05, + "loss": 5.4718, + "step": 5873 + }, + { + "epoch": 0.5664416586306654, + "grad_norm": 2.9008796215057373, + "learning_rate": 1.9826904535990966e-05, + "loss": 5.5369, + "step": 5874 + }, + { + "epoch": 0.5665380906460945, + "grad_norm": 1.720582127571106, + "learning_rate": 1.981949493900411e-05, + "loss": 5.4909, + "step": 5875 + }, + { + "epoch": 0.5666345226615236, + "grad_norm": 1.766432762145996, + "learning_rate": 1.9812085817477642e-05, + "loss": 5.3864, + "step": 5876 + }, + { + "epoch": 0.5667309546769528, + "grad_norm": 1.4780709743499756, + "learning_rate": 1.9804677172091552e-05, + "loss": 5.4393, + "step": 5877 + }, + { + "epoch": 0.5668273866923819, + "grad_norm": 1.3771491050720215, + "learning_rate": 1.9797269003525794e-05, + "loss": 5.4213, + "step": 5878 + }, + { + "epoch": 0.566923818707811, + "grad_norm": 1.7910962104797363, + "learning_rate": 1.9789861312460285e-05, + "loss": 5.4238, + "step": 5879 + }, + { + "epoch": 0.5670202507232401, + "grad_norm": 1.6892684698104858, + "learning_rate": 1.9782454099574896e-05, + "loss": 5.4184, + "step": 5880 + }, + { + "epoch": 0.5671166827386692, + "grad_norm": 1.7776445150375366, + "learning_rate": 1.9775047365549443e-05, + "loss": 5.5377, + "step": 5881 + }, + { + "epoch": 0.5672131147540984, + "grad_norm": 1.5214787721633911, + "learning_rate": 1.976764111106371e-05, + "loss": 5.5988, + "step": 5882 + }, + { + "epoch": 0.5673095467695275, + "grad_norm": 1.5516780614852905, + "learning_rate": 1.976023533679744e-05, + "loss": 5.5492, + "step": 5883 + }, + { + "epoch": 0.5674059787849566, + "grad_norm": 1.253037691116333, + "learning_rate": 1.975283004343032e-05, + "loss": 5.515, + "step": 5884 + }, + { + "epoch": 0.5675024108003858, + "grad_norm": 2.052346706390381, + "learning_rate": 1.9745425231642007e-05, + "loss": 5.4758, + "step": 5885 + }, + { + "epoch": 0.5675988428158149, + "grad_norm": 1.9730643033981323, + "learning_rate": 1.9738020902112086e-05, + "loss": 5.6155, + "step": 5886 + }, + { + "epoch": 0.5676952748312439, + "grad_norm": 1.2626330852508545, + "learning_rate": 1.973061705552014e-05, + "loss": 5.6457, + "step": 5887 + }, + { + "epoch": 0.5677917068466731, + "grad_norm": 1.6288560628890991, + "learning_rate": 1.9723213692545678e-05, + "loss": 5.5615, + "step": 5888 + }, + { + "epoch": 0.5678881388621022, + "grad_norm": 1.7554421424865723, + "learning_rate": 1.9715810813868154e-05, + "loss": 5.5561, + "step": 5889 + }, + { + "epoch": 0.5679845708775313, + "grad_norm": 1.4662624597549438, + "learning_rate": 1.9708408420167025e-05, + "loss": 5.5212, + "step": 5890 + }, + { + "epoch": 0.5680810028929605, + "grad_norm": 1.6575603485107422, + "learning_rate": 1.9701006512121656e-05, + "loss": 5.5282, + "step": 5891 + }, + { + "epoch": 0.5681774349083896, + "grad_norm": 1.4899966716766357, + "learning_rate": 1.9693605090411376e-05, + "loss": 5.3382, + "step": 5892 + }, + { + "epoch": 0.5682738669238188, + "grad_norm": 1.4930181503295898, + "learning_rate": 1.96862041557155e-05, + "loss": 5.2672, + "step": 5893 + }, + { + "epoch": 0.5683702989392478, + "grad_norm": 1.6437140703201294, + "learning_rate": 1.967880370871326e-05, + "loss": 5.565, + "step": 5894 + }, + { + "epoch": 0.5684667309546769, + "grad_norm": 1.7194445133209229, + "learning_rate": 1.967140375008387e-05, + "loss": 5.5783, + "step": 5895 + }, + { + "epoch": 0.5685631629701061, + "grad_norm": 1.9239718914031982, + "learning_rate": 1.966400428050649e-05, + "loss": 5.5281, + "step": 5896 + }, + { + "epoch": 0.5686595949855352, + "grad_norm": 1.9907366037368774, + "learning_rate": 1.9656605300660217e-05, + "loss": 5.4627, + "step": 5897 + }, + { + "epoch": 0.5687560270009643, + "grad_norm": 1.5052396059036255, + "learning_rate": 1.9649206811224146e-05, + "loss": 5.4731, + "step": 5898 + }, + { + "epoch": 0.5688524590163935, + "grad_norm": 2.1493096351623535, + "learning_rate": 1.9641808812877283e-05, + "loss": 5.648, + "step": 5899 + }, + { + "epoch": 0.5689488910318226, + "grad_norm": 2.0632262229919434, + "learning_rate": 1.9634411306298613e-05, + "loss": 5.5916, + "step": 5900 + }, + { + "epoch": 0.5690453230472516, + "grad_norm": 1.6353429555892944, + "learning_rate": 1.9627014292167066e-05, + "loss": 5.5002, + "step": 5901 + }, + { + "epoch": 0.5691417550626808, + "grad_norm": 1.8366808891296387, + "learning_rate": 1.9619617771161537e-05, + "loss": 5.6062, + "step": 5902 + }, + { + "epoch": 0.5692381870781099, + "grad_norm": 2.685453414916992, + "learning_rate": 1.961222174396086e-05, + "loss": 5.4567, + "step": 5903 + }, + { + "epoch": 0.5693346190935391, + "grad_norm": 2.025393486022949, + "learning_rate": 1.960482621124385e-05, + "loss": 5.5535, + "step": 5904 + }, + { + "epoch": 0.5694310511089682, + "grad_norm": 1.3918795585632324, + "learning_rate": 1.9597431173689247e-05, + "loss": 5.5743, + "step": 5905 + }, + { + "epoch": 0.5695274831243973, + "grad_norm": 2.3622868061065674, + "learning_rate": 1.959003663197576e-05, + "loss": 5.5393, + "step": 5906 + }, + { + "epoch": 0.5696239151398265, + "grad_norm": 2.1915061473846436, + "learning_rate": 1.9582642586782057e-05, + "loss": 5.4459, + "step": 5907 + }, + { + "epoch": 0.5697203471552555, + "grad_norm": 1.6845998764038086, + "learning_rate": 1.957524903878674e-05, + "loss": 5.6061, + "step": 5908 + }, + { + "epoch": 0.5698167791706846, + "grad_norm": 1.4877657890319824, + "learning_rate": 1.95678559886684e-05, + "loss": 5.5238, + "step": 5909 + }, + { + "epoch": 0.5699132111861138, + "grad_norm": 1.5818194150924683, + "learning_rate": 1.9560463437105547e-05, + "loss": 5.6132, + "step": 5910 + }, + { + "epoch": 0.5700096432015429, + "grad_norm": 1.7845408916473389, + "learning_rate": 1.9553071384776658e-05, + "loss": 5.5398, + "step": 5911 + }, + { + "epoch": 0.570106075216972, + "grad_norm": 1.4054689407348633, + "learning_rate": 1.9545679832360183e-05, + "loss": 5.5497, + "step": 5912 + }, + { + "epoch": 0.5702025072324012, + "grad_norm": 1.2101666927337646, + "learning_rate": 1.9538288780534504e-05, + "loss": 5.4536, + "step": 5913 + }, + { + "epoch": 0.5702989392478303, + "grad_norm": 2.1805968284606934, + "learning_rate": 1.9530898229977943e-05, + "loss": 5.4329, + "step": 5914 + }, + { + "epoch": 0.5703953712632595, + "grad_norm": 1.6841928958892822, + "learning_rate": 1.952350818136882e-05, + "loss": 5.5281, + "step": 5915 + }, + { + "epoch": 0.5704918032786885, + "grad_norm": 1.4390488862991333, + "learning_rate": 1.951611863538537e-05, + "loss": 5.5063, + "step": 5916 + }, + { + "epoch": 0.5705882352941176, + "grad_norm": 1.5720031261444092, + "learning_rate": 1.950872959270581e-05, + "loss": 5.4903, + "step": 5917 + }, + { + "epoch": 0.5706846673095468, + "grad_norm": 1.3652846813201904, + "learning_rate": 1.9501341054008292e-05, + "loss": 5.5391, + "step": 5918 + }, + { + "epoch": 0.5707810993249759, + "grad_norm": 1.1742324829101562, + "learning_rate": 1.949395301997091e-05, + "loss": 5.5315, + "step": 5919 + }, + { + "epoch": 0.570877531340405, + "grad_norm": 1.6097947359085083, + "learning_rate": 1.948656549127176e-05, + "loss": 5.5527, + "step": 5920 + }, + { + "epoch": 0.5709739633558342, + "grad_norm": 1.3604191541671753, + "learning_rate": 1.947917846858883e-05, + "loss": 5.5368, + "step": 5921 + }, + { + "epoch": 0.5710703953712633, + "grad_norm": 1.0695505142211914, + "learning_rate": 1.9471791952600108e-05, + "loss": 5.5608, + "step": 5922 + }, + { + "epoch": 0.5711668273866923, + "grad_norm": 1.1117829084396362, + "learning_rate": 1.9464405943983515e-05, + "loss": 5.6054, + "step": 5923 + }, + { + "epoch": 0.5712632594021215, + "grad_norm": 1.217893362045288, + "learning_rate": 1.9457020443416942e-05, + "loss": 5.4406, + "step": 5924 + }, + { + "epoch": 0.5713596914175506, + "grad_norm": 1.0640689134597778, + "learning_rate": 1.9449635451578197e-05, + "loss": 5.4954, + "step": 5925 + }, + { + "epoch": 0.5714561234329798, + "grad_norm": 1.2514137029647827, + "learning_rate": 1.9442250969145088e-05, + "loss": 5.4922, + "step": 5926 + }, + { + "epoch": 0.5715525554484089, + "grad_norm": 1.2094272375106812, + "learning_rate": 1.9434866996795334e-05, + "loss": 5.4417, + "step": 5927 + }, + { + "epoch": 0.571648987463838, + "grad_norm": 1.0172996520996094, + "learning_rate": 1.9427483535206648e-05, + "loss": 5.5196, + "step": 5928 + }, + { + "epoch": 0.5717454194792672, + "grad_norm": 1.3084901571273804, + "learning_rate": 1.9420100585056665e-05, + "loss": 5.5096, + "step": 5929 + }, + { + "epoch": 0.5718418514946962, + "grad_norm": 1.3677525520324707, + "learning_rate": 1.9412718147022972e-05, + "loss": 5.4852, + "step": 5930 + }, + { + "epoch": 0.5719382835101253, + "grad_norm": 1.246281623840332, + "learning_rate": 1.9405336221783143e-05, + "loss": 5.5263, + "step": 5931 + }, + { + "epoch": 0.5720347155255545, + "grad_norm": 1.3201968669891357, + "learning_rate": 1.9397954810014667e-05, + "loss": 5.4841, + "step": 5932 + }, + { + "epoch": 0.5721311475409836, + "grad_norm": 1.5388469696044922, + "learning_rate": 1.9390573912394993e-05, + "loss": 5.4786, + "step": 5933 + }, + { + "epoch": 0.5722275795564127, + "grad_norm": 1.4951269626617432, + "learning_rate": 1.9383193529601555e-05, + "loss": 5.4914, + "step": 5934 + }, + { + "epoch": 0.5723240115718419, + "grad_norm": 1.155659794807434, + "learning_rate": 1.9375813662311698e-05, + "loss": 5.445, + "step": 5935 + }, + { + "epoch": 0.572420443587271, + "grad_norm": 1.3400917053222656, + "learning_rate": 1.9368434311202733e-05, + "loss": 5.4437, + "step": 5936 + }, + { + "epoch": 0.5725168756027001, + "grad_norm": 1.333624243736267, + "learning_rate": 1.9361055476951944e-05, + "loss": 5.4828, + "step": 5937 + }, + { + "epoch": 0.5726133076181292, + "grad_norm": 1.2534990310668945, + "learning_rate": 1.9353677160236534e-05, + "loss": 5.5587, + "step": 5938 + }, + { + "epoch": 0.5727097396335583, + "grad_norm": 1.4263795614242554, + "learning_rate": 1.9346299361733693e-05, + "loss": 5.5472, + "step": 5939 + }, + { + "epoch": 0.5728061716489875, + "grad_norm": 1.302563190460205, + "learning_rate": 1.933892208212054e-05, + "loss": 5.4245, + "step": 5940 + }, + { + "epoch": 0.5729026036644166, + "grad_norm": 1.4858332872390747, + "learning_rate": 1.933154532207414e-05, + "loss": 5.3831, + "step": 5941 + }, + { + "epoch": 0.5729990356798457, + "grad_norm": 1.2829346656799316, + "learning_rate": 1.9324169082271547e-05, + "loss": 5.4411, + "step": 5942 + }, + { + "epoch": 0.5730954676952749, + "grad_norm": 1.5391427278518677, + "learning_rate": 1.931679336338972e-05, + "loss": 5.7154, + "step": 5943 + }, + { + "epoch": 0.573191899710704, + "grad_norm": 1.2623523473739624, + "learning_rate": 1.9309418166105605e-05, + "loss": 5.5251, + "step": 5944 + }, + { + "epoch": 0.573288331726133, + "grad_norm": 1.3417352437973022, + "learning_rate": 1.9302043491096086e-05, + "loss": 5.5579, + "step": 5945 + }, + { + "epoch": 0.5733847637415622, + "grad_norm": 1.6850026845932007, + "learning_rate": 1.9294669339038007e-05, + "loss": 5.4906, + "step": 5946 + }, + { + "epoch": 0.5734811957569913, + "grad_norm": 1.6025274991989136, + "learning_rate": 1.9287295710608148e-05, + "loss": 5.565, + "step": 5947 + }, + { + "epoch": 0.5735776277724205, + "grad_norm": 1.4442328214645386, + "learning_rate": 1.9279922606483263e-05, + "loss": 5.5031, + "step": 5948 + }, + { + "epoch": 0.5736740597878496, + "grad_norm": 1.3069158792495728, + "learning_rate": 1.927255002734003e-05, + "loss": 5.4529, + "step": 5949 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 1.1679794788360596, + "learning_rate": 1.9265177973855118e-05, + "loss": 5.5391, + "step": 5950 + }, + { + "epoch": 0.5738669238187079, + "grad_norm": 1.2253974676132202, + "learning_rate": 1.9257806446705116e-05, + "loss": 5.4924, + "step": 5951 + }, + { + "epoch": 0.5739633558341369, + "grad_norm": 1.1219854354858398, + "learning_rate": 1.9250435446566558e-05, + "loss": 5.5769, + "step": 5952 + }, + { + "epoch": 0.574059787849566, + "grad_norm": 1.1286811828613281, + "learning_rate": 1.9243064974115967e-05, + "loss": 5.5479, + "step": 5953 + }, + { + "epoch": 0.5741562198649952, + "grad_norm": 2.1164841651916504, + "learning_rate": 1.923569503002979e-05, + "loss": 5.346, + "step": 5954 + }, + { + "epoch": 0.5742526518804243, + "grad_norm": 1.5024505853652954, + "learning_rate": 1.9228325614984418e-05, + "loss": 5.5504, + "step": 5955 + }, + { + "epoch": 0.5743490838958534, + "grad_norm": 1.4149144887924194, + "learning_rate": 1.9220956729656225e-05, + "loss": 5.3884, + "step": 5956 + }, + { + "epoch": 0.5744455159112826, + "grad_norm": 1.4461830854415894, + "learning_rate": 1.9213588374721513e-05, + "loss": 5.534, + "step": 5957 + }, + { + "epoch": 0.5745419479267116, + "grad_norm": 1.2853792905807495, + "learning_rate": 1.9206220550856524e-05, + "loss": 5.4807, + "step": 5958 + }, + { + "epoch": 0.5746383799421408, + "grad_norm": 1.3854079246520996, + "learning_rate": 1.9198853258737498e-05, + "loss": 5.4819, + "step": 5959 + }, + { + "epoch": 0.5747348119575699, + "grad_norm": 1.4985045194625854, + "learning_rate": 1.9191486499040563e-05, + "loss": 5.4811, + "step": 5960 + }, + { + "epoch": 0.574831243972999, + "grad_norm": 1.1652898788452148, + "learning_rate": 1.9184120272441862e-05, + "loss": 5.631, + "step": 5961 + }, + { + "epoch": 0.5749276759884282, + "grad_norm": 1.1510229110717773, + "learning_rate": 1.9176754579617446e-05, + "loss": 5.3627, + "step": 5962 + }, + { + "epoch": 0.5750241080038573, + "grad_norm": 1.2718931436538696, + "learning_rate": 1.9169389421243315e-05, + "loss": 5.5017, + "step": 5963 + }, + { + "epoch": 0.5751205400192864, + "grad_norm": 1.2968295812606812, + "learning_rate": 1.9162024797995455e-05, + "loss": 5.6145, + "step": 5964 + }, + { + "epoch": 0.5752169720347156, + "grad_norm": 1.0541164875030518, + "learning_rate": 1.915466071054977e-05, + "loss": 5.4302, + "step": 5965 + }, + { + "epoch": 0.5753134040501446, + "grad_norm": 1.3464387655258179, + "learning_rate": 1.9147297159582133e-05, + "loss": 5.5683, + "step": 5966 + }, + { + "epoch": 0.5754098360655737, + "grad_norm": 1.4266247749328613, + "learning_rate": 1.9139934145768355e-05, + "loss": 5.4875, + "step": 5967 + }, + { + "epoch": 0.5755062680810029, + "grad_norm": 1.1479979753494263, + "learning_rate": 1.9132571669784216e-05, + "loss": 5.5879, + "step": 5968 + }, + { + "epoch": 0.575602700096432, + "grad_norm": 1.222793698310852, + "learning_rate": 1.912520973230542e-05, + "loss": 5.4696, + "step": 5969 + }, + { + "epoch": 0.5756991321118612, + "grad_norm": 1.4966634511947632, + "learning_rate": 1.911784833400765e-05, + "loss": 5.4838, + "step": 5970 + }, + { + "epoch": 0.5757955641272903, + "grad_norm": 1.487646222114563, + "learning_rate": 1.911048747556651e-05, + "loss": 5.5214, + "step": 5971 + }, + { + "epoch": 0.5758919961427194, + "grad_norm": 1.2104181051254272, + "learning_rate": 1.9103127157657593e-05, + "loss": 5.4586, + "step": 5972 + }, + { + "epoch": 0.5759884281581485, + "grad_norm": 1.2961307764053345, + "learning_rate": 1.909576738095641e-05, + "loss": 5.5324, + "step": 5973 + }, + { + "epoch": 0.5760848601735776, + "grad_norm": 1.4437739849090576, + "learning_rate": 1.9088408146138415e-05, + "loss": 5.5665, + "step": 5974 + }, + { + "epoch": 0.5761812921890067, + "grad_norm": 1.4321048259735107, + "learning_rate": 1.9081049453879054e-05, + "loss": 5.4041, + "step": 5975 + }, + { + "epoch": 0.5762777242044359, + "grad_norm": 1.3341301679611206, + "learning_rate": 1.9073691304853692e-05, + "loss": 5.507, + "step": 5976 + }, + { + "epoch": 0.576374156219865, + "grad_norm": 1.3819067478179932, + "learning_rate": 1.906633369973764e-05, + "loss": 5.4791, + "step": 5977 + }, + { + "epoch": 0.5764705882352941, + "grad_norm": 2.2238683700561523, + "learning_rate": 1.9058976639206183e-05, + "loss": 5.325, + "step": 5978 + }, + { + "epoch": 0.5765670202507233, + "grad_norm": 1.617710828781128, + "learning_rate": 1.9051620123934537e-05, + "loss": 5.5177, + "step": 5979 + }, + { + "epoch": 0.5766634522661523, + "grad_norm": 1.52555251121521, + "learning_rate": 1.904426415459787e-05, + "loss": 5.4031, + "step": 5980 + }, + { + "epoch": 0.5767598842815815, + "grad_norm": 1.2481204271316528, + "learning_rate": 1.9036908731871313e-05, + "loss": 5.3423, + "step": 5981 + }, + { + "epoch": 0.5768563162970106, + "grad_norm": 1.244782567024231, + "learning_rate": 1.9029553856429927e-05, + "loss": 5.4152, + "step": 5982 + }, + { + "epoch": 0.5769527483124397, + "grad_norm": 1.2621004581451416, + "learning_rate": 1.902219952894875e-05, + "loss": 5.4932, + "step": 5983 + }, + { + "epoch": 0.5770491803278689, + "grad_norm": 1.3408865928649902, + "learning_rate": 1.9014845750102735e-05, + "loss": 5.5155, + "step": 5984 + }, + { + "epoch": 0.577145612343298, + "grad_norm": 1.1472150087356567, + "learning_rate": 1.9007492520566814e-05, + "loss": 5.4701, + "step": 5985 + }, + { + "epoch": 0.5772420443587271, + "grad_norm": 1.3565809726715088, + "learning_rate": 1.9000139841015857e-05, + "loss": 5.4898, + "step": 5986 + }, + { + "epoch": 0.5773384763741563, + "grad_norm": 1.1973391771316528, + "learning_rate": 1.8992787712124678e-05, + "loss": 5.5599, + "step": 5987 + }, + { + "epoch": 0.5774349083895853, + "grad_norm": 1.42896568775177, + "learning_rate": 1.8985436134568044e-05, + "loss": 5.4699, + "step": 5988 + }, + { + "epoch": 0.5775313404050144, + "grad_norm": 1.7417579889297485, + "learning_rate": 1.8978085109020682e-05, + "loss": 5.3149, + "step": 5989 + }, + { + "epoch": 0.5776277724204436, + "grad_norm": 1.7243412733078003, + "learning_rate": 1.8970734636157262e-05, + "loss": 5.5267, + "step": 5990 + }, + { + "epoch": 0.5777242044358727, + "grad_norm": 1.6065423488616943, + "learning_rate": 1.8963384716652386e-05, + "loss": 5.5217, + "step": 5991 + }, + { + "epoch": 0.5778206364513019, + "grad_norm": 1.6775509119033813, + "learning_rate": 1.895603535118064e-05, + "loss": 5.5303, + "step": 5992 + }, + { + "epoch": 0.577917068466731, + "grad_norm": 1.446999192237854, + "learning_rate": 1.8948686540416523e-05, + "loss": 5.4976, + "step": 5993 + }, + { + "epoch": 0.57801350048216, + "grad_norm": 1.9779279232025146, + "learning_rate": 1.8941338285034517e-05, + "loss": 5.3537, + "step": 5994 + }, + { + "epoch": 0.5781099324975892, + "grad_norm": 1.6556795835494995, + "learning_rate": 1.893399058570902e-05, + "loss": 5.4486, + "step": 5995 + }, + { + "epoch": 0.5782063645130183, + "grad_norm": 1.3756059408187866, + "learning_rate": 1.8926643443114397e-05, + "loss": 5.5403, + "step": 5996 + }, + { + "epoch": 0.5783027965284474, + "grad_norm": 2.3047938346862793, + "learning_rate": 1.8919296857924974e-05, + "loss": 5.4604, + "step": 5997 + }, + { + "epoch": 0.5783992285438766, + "grad_norm": 1.7848163843154907, + "learning_rate": 1.8911950830815e-05, + "loss": 5.4155, + "step": 5998 + }, + { + "epoch": 0.5784956605593057, + "grad_norm": 1.560414433479309, + "learning_rate": 1.8904605362458675e-05, + "loss": 5.4833, + "step": 5999 + }, + { + "epoch": 0.5785920925747348, + "grad_norm": 1.1889039278030396, + "learning_rate": 1.889726045353018e-05, + "loss": 5.4455, + "step": 6000 + }, + { + "epoch": 0.578688524590164, + "grad_norm": 1.3988704681396484, + "learning_rate": 1.8889916104703607e-05, + "loss": 5.3392, + "step": 6001 + }, + { + "epoch": 0.578784956605593, + "grad_norm": 1.5016883611679077, + "learning_rate": 1.8882572316653002e-05, + "loss": 5.4856, + "step": 6002 + }, + { + "epoch": 0.5788813886210222, + "grad_norm": 1.3071198463439941, + "learning_rate": 1.8875229090052394e-05, + "loss": 5.3797, + "step": 6003 + }, + { + "epoch": 0.5789778206364513, + "grad_norm": 1.326368808746338, + "learning_rate": 1.886788642557571e-05, + "loss": 5.5797, + "step": 6004 + }, + { + "epoch": 0.5790742526518804, + "grad_norm": 2.2121381759643555, + "learning_rate": 1.886054432389687e-05, + "loss": 5.4744, + "step": 6005 + }, + { + "epoch": 0.5791706846673096, + "grad_norm": 1.5290131568908691, + "learning_rate": 1.8853202785689716e-05, + "loss": 5.3078, + "step": 6006 + }, + { + "epoch": 0.5792671166827387, + "grad_norm": 1.621959924697876, + "learning_rate": 1.884586181162804e-05, + "loss": 5.4644, + "step": 6007 + }, + { + "epoch": 0.5793635486981678, + "grad_norm": 1.972824215888977, + "learning_rate": 1.8838521402385594e-05, + "loss": 5.505, + "step": 6008 + }, + { + "epoch": 0.579459980713597, + "grad_norm": 1.4404278993606567, + "learning_rate": 1.8831181558636076e-05, + "loss": 5.5016, + "step": 6009 + }, + { + "epoch": 0.579556412729026, + "grad_norm": 1.5814313888549805, + "learning_rate": 1.8823842281053117e-05, + "loss": 5.524, + "step": 6010 + }, + { + "epoch": 0.5796528447444551, + "grad_norm": 2.121965169906616, + "learning_rate": 1.8816503570310312e-05, + "loss": 5.4075, + "step": 6011 + }, + { + "epoch": 0.5797492767598843, + "grad_norm": 1.2068990468978882, + "learning_rate": 1.8809165427081205e-05, + "loss": 5.5376, + "step": 6012 + }, + { + "epoch": 0.5798457087753134, + "grad_norm": 1.9930944442749023, + "learning_rate": 1.880182785203926e-05, + "loss": 5.4368, + "step": 6013 + }, + { + "epoch": 0.5799421407907426, + "grad_norm": 1.5082305669784546, + "learning_rate": 1.8794490845857944e-05, + "loss": 5.4791, + "step": 6014 + }, + { + "epoch": 0.5800385728061717, + "grad_norm": 1.1993721723556519, + "learning_rate": 1.87871544092106e-05, + "loss": 5.4036, + "step": 6015 + }, + { + "epoch": 0.5801350048216007, + "grad_norm": 1.6269294023513794, + "learning_rate": 1.8779818542770597e-05, + "loss": 5.5168, + "step": 6016 + }, + { + "epoch": 0.5802314368370299, + "grad_norm": 1.6751035451889038, + "learning_rate": 1.877248324721119e-05, + "loss": 5.4986, + "step": 6017 + }, + { + "epoch": 0.580327868852459, + "grad_norm": 1.551653504371643, + "learning_rate": 1.8765148523205596e-05, + "loss": 5.4602, + "step": 6018 + }, + { + "epoch": 0.5804243008678881, + "grad_norm": 1.3562257289886475, + "learning_rate": 1.8757814371427e-05, + "loss": 5.4827, + "step": 6019 + }, + { + "epoch": 0.5805207328833173, + "grad_norm": 1.4276469945907593, + "learning_rate": 1.8750480792548523e-05, + "loss": 5.6257, + "step": 6020 + }, + { + "epoch": 0.5806171648987464, + "grad_norm": 1.1731067895889282, + "learning_rate": 1.874314778724322e-05, + "loss": 5.4676, + "step": 6021 + }, + { + "epoch": 0.5807135969141755, + "grad_norm": 1.1772187948226929, + "learning_rate": 1.873581535618412e-05, + "loss": 5.5828, + "step": 6022 + }, + { + "epoch": 0.5808100289296046, + "grad_norm": 1.6679750680923462, + "learning_rate": 1.8728483500044175e-05, + "loss": 5.5317, + "step": 6023 + }, + { + "epoch": 0.5809064609450337, + "grad_norm": 1.2460819482803345, + "learning_rate": 1.8721152219496288e-05, + "loss": 5.4998, + "step": 6024 + }, + { + "epoch": 0.5810028929604629, + "grad_norm": 1.1164294481277466, + "learning_rate": 1.871382151521333e-05, + "loss": 5.4228, + "step": 6025 + }, + { + "epoch": 0.581099324975892, + "grad_norm": 1.7902488708496094, + "learning_rate": 1.8706491387868086e-05, + "loss": 5.4818, + "step": 6026 + }, + { + "epoch": 0.5811957569913211, + "grad_norm": 1.9749417304992676, + "learning_rate": 1.8699161838133327e-05, + "loss": 5.3511, + "step": 6027 + }, + { + "epoch": 0.5812921890067503, + "grad_norm": 1.304259181022644, + "learning_rate": 1.8691832866681737e-05, + "loss": 5.401, + "step": 6028 + }, + { + "epoch": 0.5813886210221794, + "grad_norm": 1.3926588296890259, + "learning_rate": 1.868450447418596e-05, + "loss": 5.4826, + "step": 6029 + }, + { + "epoch": 0.5814850530376084, + "grad_norm": 1.6020190715789795, + "learning_rate": 1.867717666131859e-05, + "loss": 5.4014, + "step": 6030 + }, + { + "epoch": 0.5815814850530376, + "grad_norm": 2.04394268989563, + "learning_rate": 1.8669849428752167e-05, + "loss": 5.4929, + "step": 6031 + }, + { + "epoch": 0.5816779170684667, + "grad_norm": 1.6800826787948608, + "learning_rate": 1.8662522777159163e-05, + "loss": 5.4605, + "step": 6032 + }, + { + "epoch": 0.5817743490838958, + "grad_norm": 1.942000150680542, + "learning_rate": 1.865519670721203e-05, + "loss": 5.4921, + "step": 6033 + }, + { + "epoch": 0.581870781099325, + "grad_norm": 1.994470238685608, + "learning_rate": 1.8647871219583136e-05, + "loss": 5.4652, + "step": 6034 + }, + { + "epoch": 0.5819672131147541, + "grad_norm": 1.922692894935608, + "learning_rate": 1.864054631494479e-05, + "loss": 5.5696, + "step": 6035 + }, + { + "epoch": 0.5820636451301833, + "grad_norm": 1.2915987968444824, + "learning_rate": 1.8633221993969285e-05, + "loss": 5.5229, + "step": 6036 + }, + { + "epoch": 0.5821600771456124, + "grad_norm": 1.7214107513427734, + "learning_rate": 1.862589825732882e-05, + "loss": 5.4499, + "step": 6037 + }, + { + "epoch": 0.5822565091610414, + "grad_norm": 1.5182173252105713, + "learning_rate": 1.861857510569558e-05, + "loss": 5.4865, + "step": 6038 + }, + { + "epoch": 0.5823529411764706, + "grad_norm": 1.2089074850082397, + "learning_rate": 1.8611252539741657e-05, + "loss": 5.5357, + "step": 6039 + }, + { + "epoch": 0.5824493731918997, + "grad_norm": 1.7985960245132446, + "learning_rate": 1.860393056013911e-05, + "loss": 5.3943, + "step": 6040 + }, + { + "epoch": 0.5825458052073288, + "grad_norm": 2.2481777667999268, + "learning_rate": 1.8596609167559948e-05, + "loss": 5.405, + "step": 6041 + }, + { + "epoch": 0.582642237222758, + "grad_norm": 1.3343992233276367, + "learning_rate": 1.858928836267612e-05, + "loss": 5.3027, + "step": 6042 + }, + { + "epoch": 0.5827386692381871, + "grad_norm": 1.4155364036560059, + "learning_rate": 1.85819681461595e-05, + "loss": 5.5669, + "step": 6043 + }, + { + "epoch": 0.5828351012536162, + "grad_norm": 1.9720124006271362, + "learning_rate": 1.8574648518681958e-05, + "loss": 5.4376, + "step": 6044 + }, + { + "epoch": 0.5829315332690453, + "grad_norm": 1.998409628868103, + "learning_rate": 1.856732948091527e-05, + "loss": 5.6016, + "step": 6045 + }, + { + "epoch": 0.5830279652844744, + "grad_norm": 1.6262331008911133, + "learning_rate": 1.856001103353115e-05, + "loss": 5.4068, + "step": 6046 + }, + { + "epoch": 0.5831243972999036, + "grad_norm": 2.1366655826568604, + "learning_rate": 1.8552693177201302e-05, + "loss": 5.483, + "step": 6047 + }, + { + "epoch": 0.5832208293153327, + "grad_norm": 1.7159425020217896, + "learning_rate": 1.8545375912597328e-05, + "loss": 5.4068, + "step": 6048 + }, + { + "epoch": 0.5833172613307618, + "grad_norm": 1.6522672176361084, + "learning_rate": 1.853805924039082e-05, + "loss": 5.3711, + "step": 6049 + }, + { + "epoch": 0.583413693346191, + "grad_norm": 1.565377116203308, + "learning_rate": 1.8530743161253277e-05, + "loss": 5.3189, + "step": 6050 + }, + { + "epoch": 0.5835101253616201, + "grad_norm": 1.3745588064193726, + "learning_rate": 1.8523427675856164e-05, + "loss": 5.4971, + "step": 6051 + }, + { + "epoch": 0.5836065573770491, + "grad_norm": 1.7674938440322876, + "learning_rate": 1.8516112784870887e-05, + "loss": 5.4638, + "step": 6052 + }, + { + "epoch": 0.5837029893924783, + "grad_norm": 1.554784893989563, + "learning_rate": 1.8508798488968803e-05, + "loss": 5.3641, + "step": 6053 + }, + { + "epoch": 0.5837994214079074, + "grad_norm": 1.7772223949432373, + "learning_rate": 1.8501484788821193e-05, + "loss": 5.5649, + "step": 6054 + }, + { + "epoch": 0.5838958534233365, + "grad_norm": 1.5964611768722534, + "learning_rate": 1.849417168509932e-05, + "loss": 5.443, + "step": 6055 + }, + { + "epoch": 0.5839922854387657, + "grad_norm": 1.4821470975875854, + "learning_rate": 1.8486859178474367e-05, + "loss": 5.4359, + "step": 6056 + }, + { + "epoch": 0.5840887174541948, + "grad_norm": 1.5303301811218262, + "learning_rate": 1.8479547269617448e-05, + "loss": 5.4871, + "step": 6057 + }, + { + "epoch": 0.584185149469624, + "grad_norm": 1.39759361743927, + "learning_rate": 1.8472235959199667e-05, + "loss": 5.6046, + "step": 6058 + }, + { + "epoch": 0.584281581485053, + "grad_norm": 1.2056808471679688, + "learning_rate": 1.8464925247892027e-05, + "loss": 5.5913, + "step": 6059 + }, + { + "epoch": 0.5843780135004821, + "grad_norm": 1.7624610662460327, + "learning_rate": 1.8457615136365513e-05, + "loss": 5.5413, + "step": 6060 + }, + { + "epoch": 0.5844744455159113, + "grad_norm": 1.310864806175232, + "learning_rate": 1.8450305625291027e-05, + "loss": 5.5082, + "step": 6061 + }, + { + "epoch": 0.5845708775313404, + "grad_norm": 1.1844468116760254, + "learning_rate": 1.844299671533942e-05, + "loss": 5.5236, + "step": 6062 + }, + { + "epoch": 0.5846673095467695, + "grad_norm": 1.8160865306854248, + "learning_rate": 1.843568840718152e-05, + "loss": 5.4793, + "step": 6063 + }, + { + "epoch": 0.5847637415621987, + "grad_norm": 1.5560431480407715, + "learning_rate": 1.8428380701488052e-05, + "loss": 5.5047, + "step": 6064 + }, + { + "epoch": 0.5848601735776278, + "grad_norm": 1.3979140520095825, + "learning_rate": 1.8421073598929704e-05, + "loss": 5.4426, + "step": 6065 + }, + { + "epoch": 0.5849566055930568, + "grad_norm": 1.603426218032837, + "learning_rate": 1.841376710017714e-05, + "loss": 5.2477, + "step": 6066 + }, + { + "epoch": 0.585053037608486, + "grad_norm": 1.6608887910842896, + "learning_rate": 1.840646120590092e-05, + "loss": 5.5292, + "step": 6067 + }, + { + "epoch": 0.5851494696239151, + "grad_norm": 1.5281050205230713, + "learning_rate": 1.839915591677157e-05, + "loss": 5.475, + "step": 6068 + }, + { + "epoch": 0.5852459016393443, + "grad_norm": 1.841585397720337, + "learning_rate": 1.8391851233459572e-05, + "loss": 5.5038, + "step": 6069 + }, + { + "epoch": 0.5853423336547734, + "grad_norm": 1.8424456119537354, + "learning_rate": 1.8384547156635323e-05, + "loss": 5.4063, + "step": 6070 + }, + { + "epoch": 0.5854387656702025, + "grad_norm": 1.2521098852157593, + "learning_rate": 1.8377243686969208e-05, + "loss": 5.4026, + "step": 6071 + }, + { + "epoch": 0.5855351976856317, + "grad_norm": 1.6159803867340088, + "learning_rate": 1.836994082513151e-05, + "loss": 5.4409, + "step": 6072 + }, + { + "epoch": 0.5856316297010608, + "grad_norm": 1.8403388261795044, + "learning_rate": 1.836263857179248e-05, + "loss": 5.5158, + "step": 6073 + }, + { + "epoch": 0.5857280617164898, + "grad_norm": 1.377699851989746, + "learning_rate": 1.8355336927622317e-05, + "loss": 5.5023, + "step": 6074 + }, + { + "epoch": 0.585824493731919, + "grad_norm": 1.7490620613098145, + "learning_rate": 1.8348035893291155e-05, + "loss": 5.4538, + "step": 6075 + }, + { + "epoch": 0.5859209257473481, + "grad_norm": 2.017423629760742, + "learning_rate": 1.8340735469469063e-05, + "loss": 5.5207, + "step": 6076 + }, + { + "epoch": 0.5860173577627772, + "grad_norm": 1.6757283210754395, + "learning_rate": 1.8333435656826077e-05, + "loss": 5.4584, + "step": 6077 + }, + { + "epoch": 0.5861137897782064, + "grad_norm": 1.8966790437698364, + "learning_rate": 1.8326136456032167e-05, + "loss": 5.5368, + "step": 6078 + }, + { + "epoch": 0.5862102217936355, + "grad_norm": 2.3000502586364746, + "learning_rate": 1.8318837867757227e-05, + "loss": 5.3847, + "step": 6079 + }, + { + "epoch": 0.5863066538090647, + "grad_norm": 1.4454617500305176, + "learning_rate": 1.8311539892671136e-05, + "loss": 5.3572, + "step": 6080 + }, + { + "epoch": 0.5864030858244937, + "grad_norm": 2.2929413318634033, + "learning_rate": 1.8304242531443672e-05, + "loss": 5.291, + "step": 6081 + }, + { + "epoch": 0.5864995178399228, + "grad_norm": 2.457939624786377, + "learning_rate": 1.8296945784744595e-05, + "loss": 5.4192, + "step": 6082 + }, + { + "epoch": 0.586595949855352, + "grad_norm": 1.6461719274520874, + "learning_rate": 1.8289649653243585e-05, + "loss": 5.4624, + "step": 6083 + }, + { + "epoch": 0.5866923818707811, + "grad_norm": 1.6485458612442017, + "learning_rate": 1.828235413761026e-05, + "loss": 5.4428, + "step": 6084 + }, + { + "epoch": 0.5867888138862102, + "grad_norm": 1.788165807723999, + "learning_rate": 1.8275059238514217e-05, + "loss": 5.5997, + "step": 6085 + }, + { + "epoch": 0.5868852459016394, + "grad_norm": 1.8166589736938477, + "learning_rate": 1.8267764956624956e-05, + "loss": 5.5629, + "step": 6086 + }, + { + "epoch": 0.5869816779170685, + "grad_norm": 1.5007758140563965, + "learning_rate": 1.8260471292611936e-05, + "loss": 5.4827, + "step": 6087 + }, + { + "epoch": 0.5870781099324975, + "grad_norm": 2.595991373062134, + "learning_rate": 1.825317824714457e-05, + "loss": 5.4355, + "step": 6088 + }, + { + "epoch": 0.5871745419479267, + "grad_norm": 2.2150161266326904, + "learning_rate": 1.82458858208922e-05, + "loss": 5.402, + "step": 6089 + }, + { + "epoch": 0.5872709739633558, + "grad_norm": 1.5300043821334839, + "learning_rate": 1.8238594014524113e-05, + "loss": 5.4895, + "step": 6090 + }, + { + "epoch": 0.587367405978785, + "grad_norm": 1.730645775794983, + "learning_rate": 1.8231302828709553e-05, + "loss": 5.5331, + "step": 6091 + }, + { + "epoch": 0.5874638379942141, + "grad_norm": 2.4678897857666016, + "learning_rate": 1.822401226411768e-05, + "loss": 5.3225, + "step": 6092 + }, + { + "epoch": 0.5875602700096432, + "grad_norm": 2.114797830581665, + "learning_rate": 1.8216722321417627e-05, + "loss": 5.5563, + "step": 6093 + }, + { + "epoch": 0.5876567020250724, + "grad_norm": 1.2387619018554688, + "learning_rate": 1.8209433001278446e-05, + "loss": 5.4487, + "step": 6094 + }, + { + "epoch": 0.5877531340405014, + "grad_norm": 1.550891637802124, + "learning_rate": 1.820214430436915e-05, + "loss": 5.4098, + "step": 6095 + }, + { + "epoch": 0.5878495660559305, + "grad_norm": 2.517488479614258, + "learning_rate": 1.819485623135868e-05, + "loss": 5.3982, + "step": 6096 + }, + { + "epoch": 0.5879459980713597, + "grad_norm": 1.3008791208267212, + "learning_rate": 1.8187568782915935e-05, + "loss": 5.4104, + "step": 6097 + }, + { + "epoch": 0.5880424300867888, + "grad_norm": 1.6175845861434937, + "learning_rate": 1.8180281959709733e-05, + "loss": 5.6427, + "step": 6098 + }, + { + "epoch": 0.5881388621022179, + "grad_norm": 1.5459113121032715, + "learning_rate": 1.8172995762408867e-05, + "loss": 5.5332, + "step": 6099 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.8604143857955933, + "learning_rate": 1.816571019168205e-05, + "loss": 5.258, + "step": 6100 + }, + { + "epoch": 0.5883317261330762, + "grad_norm": 1.6677064895629883, + "learning_rate": 1.815842524819793e-05, + "loss": 5.4934, + "step": 6101 + }, + { + "epoch": 0.5884281581485054, + "grad_norm": 1.4997105598449707, + "learning_rate": 1.8151140932625128e-05, + "loss": 5.375, + "step": 6102 + }, + { + "epoch": 0.5885245901639344, + "grad_norm": 1.2645982503890991, + "learning_rate": 1.8143857245632172e-05, + "loss": 5.3211, + "step": 6103 + }, + { + "epoch": 0.5886210221793635, + "grad_norm": 1.4778810739517212, + "learning_rate": 1.813657418788757e-05, + "loss": 5.3526, + "step": 6104 + }, + { + "epoch": 0.5887174541947927, + "grad_norm": 1.6820436716079712, + "learning_rate": 1.812929176005974e-05, + "loss": 5.472, + "step": 6105 + }, + { + "epoch": 0.5888138862102218, + "grad_norm": 1.337783694267273, + "learning_rate": 1.812200996281705e-05, + "loss": 5.3661, + "step": 6106 + }, + { + "epoch": 0.5889103182256509, + "grad_norm": 1.6102596521377563, + "learning_rate": 1.8114728796827825e-05, + "loss": 5.5322, + "step": 6107 + }, + { + "epoch": 0.5890067502410801, + "grad_norm": 1.6939831972122192, + "learning_rate": 1.810744826276032e-05, + "loss": 5.6192, + "step": 6108 + }, + { + "epoch": 0.5891031822565092, + "grad_norm": 1.1783596277236938, + "learning_rate": 1.8100168361282722e-05, + "loss": 5.4691, + "step": 6109 + }, + { + "epoch": 0.5891996142719382, + "grad_norm": 1.8529826402664185, + "learning_rate": 1.8092889093063185e-05, + "loss": 5.498, + "step": 6110 + }, + { + "epoch": 0.5892960462873674, + "grad_norm": 1.4053915739059448, + "learning_rate": 1.808561045876978e-05, + "loss": 5.4635, + "step": 6111 + }, + { + "epoch": 0.5893924783027965, + "grad_norm": 1.1547596454620361, + "learning_rate": 1.807833245907054e-05, + "loss": 5.5488, + "step": 6112 + }, + { + "epoch": 0.5894889103182257, + "grad_norm": 1.4137475490570068, + "learning_rate": 1.8071055094633425e-05, + "loss": 5.4512, + "step": 6113 + }, + { + "epoch": 0.5895853423336548, + "grad_norm": 1.571673035621643, + "learning_rate": 1.806377836612635e-05, + "loss": 5.4433, + "step": 6114 + }, + { + "epoch": 0.5896817743490839, + "grad_norm": 1.5371805429458618, + "learning_rate": 1.8056502274217157e-05, + "loss": 5.4137, + "step": 6115 + }, + { + "epoch": 0.5897782063645131, + "grad_norm": 1.4119529724121094, + "learning_rate": 1.8049226819573644e-05, + "loss": 5.4095, + "step": 6116 + }, + { + "epoch": 0.5898746383799421, + "grad_norm": 1.4779685735702515, + "learning_rate": 1.8041952002863534e-05, + "loss": 5.456, + "step": 6117 + }, + { + "epoch": 0.5899710703953712, + "grad_norm": 1.5766195058822632, + "learning_rate": 1.8034677824754503e-05, + "loss": 5.5062, + "step": 6118 + }, + { + "epoch": 0.5900675024108004, + "grad_norm": 1.5018513202667236, + "learning_rate": 1.8027404285914176e-05, + "loss": 5.4463, + "step": 6119 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 1.2385401725769043, + "learning_rate": 1.802013138701009e-05, + "loss": 5.3992, + "step": 6120 + }, + { + "epoch": 0.5902603664416586, + "grad_norm": 1.4122414588928223, + "learning_rate": 1.8012859128709766e-05, + "loss": 5.41, + "step": 6121 + }, + { + "epoch": 0.5903567984570878, + "grad_norm": 1.4361339807510376, + "learning_rate": 1.8005587511680633e-05, + "loss": 5.5313, + "step": 6122 + }, + { + "epoch": 0.5904532304725169, + "grad_norm": 1.2636134624481201, + "learning_rate": 1.7998316536590053e-05, + "loss": 5.6025, + "step": 6123 + }, + { + "epoch": 0.590549662487946, + "grad_norm": 1.1884486675262451, + "learning_rate": 1.799104620410538e-05, + "loss": 5.4863, + "step": 6124 + }, + { + "epoch": 0.5906460945033751, + "grad_norm": 1.2294427156448364, + "learning_rate": 1.7983776514893846e-05, + "loss": 5.6186, + "step": 6125 + }, + { + "epoch": 0.5907425265188042, + "grad_norm": 1.3377097845077515, + "learning_rate": 1.797650746962268e-05, + "loss": 5.5256, + "step": 6126 + }, + { + "epoch": 0.5908389585342334, + "grad_norm": 1.5238394737243652, + "learning_rate": 1.7969239068959016e-05, + "loss": 5.4236, + "step": 6127 + }, + { + "epoch": 0.5909353905496625, + "grad_norm": 1.2492682933807373, + "learning_rate": 1.796197131356992e-05, + "loss": 5.5744, + "step": 6128 + }, + { + "epoch": 0.5910318225650916, + "grad_norm": 1.7891788482666016, + "learning_rate": 1.7954704204122452e-05, + "loss": 5.4676, + "step": 6129 + }, + { + "epoch": 0.5911282545805208, + "grad_norm": 1.2537591457366943, + "learning_rate": 1.794743774128356e-05, + "loss": 5.4641, + "step": 6130 + }, + { + "epoch": 0.5912246865959498, + "grad_norm": 2.1510117053985596, + "learning_rate": 1.7940171925720135e-05, + "loss": 5.5401, + "step": 6131 + }, + { + "epoch": 0.5913211186113789, + "grad_norm": 1.3666678667068481, + "learning_rate": 1.7932906758099054e-05, + "loss": 5.5667, + "step": 6132 + }, + { + "epoch": 0.5914175506268081, + "grad_norm": 1.207355260848999, + "learning_rate": 1.7925642239087092e-05, + "loss": 5.5603, + "step": 6133 + }, + { + "epoch": 0.5915139826422372, + "grad_norm": 1.6172536611557007, + "learning_rate": 1.7918378369350974e-05, + "loss": 5.6091, + "step": 6134 + }, + { + "epoch": 0.5916104146576664, + "grad_norm": 1.4232652187347412, + "learning_rate": 1.791111514955737e-05, + "loss": 5.4785, + "step": 6135 + }, + { + "epoch": 0.5917068466730955, + "grad_norm": 1.6656534671783447, + "learning_rate": 1.79038525803729e-05, + "loss": 5.29, + "step": 6136 + }, + { + "epoch": 0.5918032786885246, + "grad_norm": 1.3435776233673096, + "learning_rate": 1.7896590662464106e-05, + "loss": 5.408, + "step": 6137 + }, + { + "epoch": 0.5918997107039538, + "grad_norm": 1.3548812866210938, + "learning_rate": 1.788932939649748e-05, + "loss": 5.2697, + "step": 6138 + }, + { + "epoch": 0.5919961427193828, + "grad_norm": 1.4079102277755737, + "learning_rate": 1.7882068783139438e-05, + "loss": 5.3759, + "step": 6139 + }, + { + "epoch": 0.5920925747348119, + "grad_norm": 1.6358516216278076, + "learning_rate": 1.7874808823056378e-05, + "loss": 5.4136, + "step": 6140 + }, + { + "epoch": 0.5921890067502411, + "grad_norm": 1.412662386894226, + "learning_rate": 1.786754951691459e-05, + "loss": 5.5268, + "step": 6141 + }, + { + "epoch": 0.5922854387656702, + "grad_norm": 1.8743548393249512, + "learning_rate": 1.786029086538032e-05, + "loss": 5.3967, + "step": 6142 + }, + { + "epoch": 0.5923818707810993, + "grad_norm": 1.5441651344299316, + "learning_rate": 1.785303286911978e-05, + "loss": 5.4044, + "step": 6143 + }, + { + "epoch": 0.5924783027965285, + "grad_norm": 1.5535194873809814, + "learning_rate": 1.784577552879908e-05, + "loss": 5.3773, + "step": 6144 + }, + { + "epoch": 0.5925747348119575, + "grad_norm": 1.610957384109497, + "learning_rate": 1.7838518845084288e-05, + "loss": 5.3033, + "step": 6145 + }, + { + "epoch": 0.5926711668273867, + "grad_norm": 1.4669300317764282, + "learning_rate": 1.7831262818641435e-05, + "loss": 5.394, + "step": 6146 + }, + { + "epoch": 0.5927675988428158, + "grad_norm": 1.6743180751800537, + "learning_rate": 1.782400745013645e-05, + "loss": 5.3266, + "step": 6147 + }, + { + "epoch": 0.5928640308582449, + "grad_norm": 2.1793315410614014, + "learning_rate": 1.7816752740235236e-05, + "loss": 5.5473, + "step": 6148 + }, + { + "epoch": 0.5929604628736741, + "grad_norm": 1.6283445358276367, + "learning_rate": 1.7809498689603614e-05, + "loss": 5.5077, + "step": 6149 + }, + { + "epoch": 0.5930568948891032, + "grad_norm": 1.7678378820419312, + "learning_rate": 1.7802245298907343e-05, + "loss": 5.5365, + "step": 6150 + }, + { + "epoch": 0.5931533269045323, + "grad_norm": 1.4044644832611084, + "learning_rate": 1.7794992568812148e-05, + "loss": 5.4775, + "step": 6151 + }, + { + "epoch": 0.5932497589199615, + "grad_norm": 1.7300128936767578, + "learning_rate": 1.778774049998367e-05, + "loss": 5.4417, + "step": 6152 + }, + { + "epoch": 0.5933461909353905, + "grad_norm": 1.9885516166687012, + "learning_rate": 1.7780489093087478e-05, + "loss": 5.3673, + "step": 6153 + }, + { + "epoch": 0.5934426229508196, + "grad_norm": 1.5567032098770142, + "learning_rate": 1.777323834878912e-05, + "loss": 5.4794, + "step": 6154 + }, + { + "epoch": 0.5935390549662488, + "grad_norm": 1.5432039499282837, + "learning_rate": 1.7765988267754053e-05, + "loss": 5.4865, + "step": 6155 + }, + { + "epoch": 0.5936354869816779, + "grad_norm": 2.1348252296447754, + "learning_rate": 1.7758738850647674e-05, + "loss": 5.406, + "step": 6156 + }, + { + "epoch": 0.5937319189971071, + "grad_norm": 2.494757652282715, + "learning_rate": 1.775149009813533e-05, + "loss": 5.256, + "step": 6157 + }, + { + "epoch": 0.5938283510125362, + "grad_norm": 1.3874493837356567, + "learning_rate": 1.774424201088231e-05, + "loss": 5.3924, + "step": 6158 + }, + { + "epoch": 0.5939247830279653, + "grad_norm": 1.9572970867156982, + "learning_rate": 1.7736994589553824e-05, + "loss": 5.5476, + "step": 6159 + }, + { + "epoch": 0.5940212150433944, + "grad_norm": 1.5751736164093018, + "learning_rate": 1.7729747834815042e-05, + "loss": 5.3932, + "step": 6160 + }, + { + "epoch": 0.5941176470588235, + "grad_norm": 1.6498366594314575, + "learning_rate": 1.772250174733104e-05, + "loss": 5.3391, + "step": 6161 + }, + { + "epoch": 0.5942140790742526, + "grad_norm": 1.5453556776046753, + "learning_rate": 1.7715256327766886e-05, + "loss": 5.4373, + "step": 6162 + }, + { + "epoch": 0.5943105110896818, + "grad_norm": 1.551379680633545, + "learning_rate": 1.7708011576787537e-05, + "loss": 5.3269, + "step": 6163 + }, + { + "epoch": 0.5944069431051109, + "grad_norm": 1.230100393295288, + "learning_rate": 1.77007674950579e-05, + "loss": 5.4465, + "step": 6164 + }, + { + "epoch": 0.59450337512054, + "grad_norm": 1.5938235521316528, + "learning_rate": 1.7693524083242852e-05, + "loss": 5.4422, + "step": 6165 + }, + { + "epoch": 0.5945998071359692, + "grad_norm": 1.4630805253982544, + "learning_rate": 1.7686281342007173e-05, + "loss": 5.4496, + "step": 6166 + }, + { + "epoch": 0.5946962391513982, + "grad_norm": 1.5160472393035889, + "learning_rate": 1.7679039272015572e-05, + "loss": 5.3992, + "step": 6167 + }, + { + "epoch": 0.5947926711668274, + "grad_norm": 1.219497799873352, + "learning_rate": 1.7671797873932753e-05, + "loss": 5.3628, + "step": 6168 + }, + { + "epoch": 0.5948891031822565, + "grad_norm": 1.3898307085037231, + "learning_rate": 1.7664557148423296e-05, + "loss": 5.4087, + "step": 6169 + }, + { + "epoch": 0.5949855351976856, + "grad_norm": 1.7037101984024048, + "learning_rate": 1.765731709615176e-05, + "loss": 5.5508, + "step": 6170 + }, + { + "epoch": 0.5950819672131148, + "grad_norm": 1.173486590385437, + "learning_rate": 1.7650077717782627e-05, + "loss": 5.5472, + "step": 6171 + }, + { + "epoch": 0.5951783992285439, + "grad_norm": 1.9487930536270142, + "learning_rate": 1.7642839013980302e-05, + "loss": 5.5231, + "step": 6172 + }, + { + "epoch": 0.595274831243973, + "grad_norm": 1.931274175643921, + "learning_rate": 1.763560098540917e-05, + "loss": 5.5005, + "step": 6173 + }, + { + "epoch": 0.5953712632594022, + "grad_norm": 1.298840880393982, + "learning_rate": 1.7628363632733514e-05, + "loss": 5.3509, + "step": 6174 + }, + { + "epoch": 0.5954676952748312, + "grad_norm": 1.2140783071517944, + "learning_rate": 1.762112695661756e-05, + "loss": 5.4838, + "step": 6175 + }, + { + "epoch": 0.5955641272902603, + "grad_norm": 1.7709014415740967, + "learning_rate": 1.7613890957725503e-05, + "loss": 5.4652, + "step": 6176 + }, + { + "epoch": 0.5956605593056895, + "grad_norm": 1.541282296180725, + "learning_rate": 1.760665563672144e-05, + "loss": 5.5031, + "step": 6177 + }, + { + "epoch": 0.5957569913211186, + "grad_norm": 1.2983325719833374, + "learning_rate": 1.7599420994269423e-05, + "loss": 5.5132, + "step": 6178 + }, + { + "epoch": 0.5958534233365478, + "grad_norm": 1.9902175664901733, + "learning_rate": 1.7592187031033436e-05, + "loss": 5.4216, + "step": 6179 + }, + { + "epoch": 0.5959498553519769, + "grad_norm": 1.6914349794387817, + "learning_rate": 1.758495374767741e-05, + "loss": 5.3624, + "step": 6180 + }, + { + "epoch": 0.596046287367406, + "grad_norm": 1.4090077877044678, + "learning_rate": 1.7577721144865196e-05, + "loss": 5.3969, + "step": 6181 + }, + { + "epoch": 0.5961427193828351, + "grad_norm": 1.8906888961791992, + "learning_rate": 1.757048922326061e-05, + "loss": 5.4732, + "step": 6182 + }, + { + "epoch": 0.5962391513982642, + "grad_norm": 2.089873790740967, + "learning_rate": 1.7563257983527364e-05, + "loss": 5.3563, + "step": 6183 + }, + { + "epoch": 0.5963355834136933, + "grad_norm": 2.2719221115112305, + "learning_rate": 1.7556027426329165e-05, + "loss": 5.4578, + "step": 6184 + }, + { + "epoch": 0.5964320154291225, + "grad_norm": 2.2318155765533447, + "learning_rate": 1.7548797552329597e-05, + "loss": 5.4001, + "step": 6185 + }, + { + "epoch": 0.5965284474445516, + "grad_norm": 1.4540657997131348, + "learning_rate": 1.7541568362192217e-05, + "loss": 5.4871, + "step": 6186 + }, + { + "epoch": 0.5966248794599807, + "grad_norm": 1.4276723861694336, + "learning_rate": 1.753433985658052e-05, + "loss": 5.4578, + "step": 6187 + }, + { + "epoch": 0.5967213114754099, + "grad_norm": 1.6709520816802979, + "learning_rate": 1.7527112036157922e-05, + "loss": 5.4644, + "step": 6188 + }, + { + "epoch": 0.5968177434908389, + "grad_norm": 1.4859769344329834, + "learning_rate": 1.7519884901587772e-05, + "loss": 5.4809, + "step": 6189 + }, + { + "epoch": 0.5969141755062681, + "grad_norm": 1.3378748893737793, + "learning_rate": 1.751265845353339e-05, + "loss": 5.5079, + "step": 6190 + }, + { + "epoch": 0.5970106075216972, + "grad_norm": 1.4509365558624268, + "learning_rate": 1.7505432692657992e-05, + "loss": 5.4612, + "step": 6191 + }, + { + "epoch": 0.5971070395371263, + "grad_norm": 1.4244834184646606, + "learning_rate": 1.7498207619624764e-05, + "loss": 5.4347, + "step": 6192 + }, + { + "epoch": 0.5972034715525555, + "grad_norm": 1.9695894718170166, + "learning_rate": 1.749098323509681e-05, + "loss": 5.4849, + "step": 6193 + }, + { + "epoch": 0.5972999035679846, + "grad_norm": 1.7021846771240234, + "learning_rate": 1.748375953973716e-05, + "loss": 5.4523, + "step": 6194 + }, + { + "epoch": 0.5973963355834137, + "grad_norm": 1.6321333646774292, + "learning_rate": 1.747653653420882e-05, + "loss": 5.4905, + "step": 6195 + }, + { + "epoch": 0.5974927675988428, + "grad_norm": 1.53363835811615, + "learning_rate": 1.74693142191747e-05, + "loss": 5.504, + "step": 6196 + }, + { + "epoch": 0.5975891996142719, + "grad_norm": 1.2879685163497925, + "learning_rate": 1.7462092595297637e-05, + "loss": 5.5146, + "step": 6197 + }, + { + "epoch": 0.597685631629701, + "grad_norm": 1.6915605068206787, + "learning_rate": 1.745487166324045e-05, + "loss": 5.4671, + "step": 6198 + }, + { + "epoch": 0.5977820636451302, + "grad_norm": 1.5115859508514404, + "learning_rate": 1.744765142366585e-05, + "loss": 5.43, + "step": 6199 + }, + { + "epoch": 0.5978784956605593, + "grad_norm": 1.2537376880645752, + "learning_rate": 1.744043187723651e-05, + "loss": 5.4897, + "step": 6200 + }, + { + "epoch": 0.5979749276759885, + "grad_norm": 1.5639172792434692, + "learning_rate": 1.7433213024615025e-05, + "loss": 5.4769, + "step": 6201 + }, + { + "epoch": 0.5980713596914176, + "grad_norm": 1.7419780492782593, + "learning_rate": 1.7425994866463936e-05, + "loss": 5.5377, + "step": 6202 + }, + { + "epoch": 0.5981677917068466, + "grad_norm": 1.2085069417953491, + "learning_rate": 1.7418777403445717e-05, + "loss": 5.5948, + "step": 6203 + }, + { + "epoch": 0.5982642237222758, + "grad_norm": 1.5825237035751343, + "learning_rate": 1.741156063622278e-05, + "loss": 5.4283, + "step": 6204 + }, + { + "epoch": 0.5983606557377049, + "grad_norm": 2.0695486068725586, + "learning_rate": 1.740434456545746e-05, + "loss": 5.4914, + "step": 6205 + }, + { + "epoch": 0.598457087753134, + "grad_norm": 1.310482382774353, + "learning_rate": 1.7397129191812057e-05, + "loss": 5.4603, + "step": 6206 + }, + { + "epoch": 0.5985535197685632, + "grad_norm": 1.9964823722839355, + "learning_rate": 1.7389914515948774e-05, + "loss": 5.4146, + "step": 6207 + }, + { + "epoch": 0.5986499517839923, + "grad_norm": 2.2147915363311768, + "learning_rate": 1.7382700538529767e-05, + "loss": 5.5689, + "step": 6208 + }, + { + "epoch": 0.5987463837994214, + "grad_norm": 2.313178777694702, + "learning_rate": 1.7375487260217135e-05, + "loss": 5.5577, + "step": 6209 + }, + { + "epoch": 0.5988428158148505, + "grad_norm": 2.424652338027954, + "learning_rate": 1.73682746816729e-05, + "loss": 5.5897, + "step": 6210 + }, + { + "epoch": 0.5989392478302796, + "grad_norm": 2.386793613433838, + "learning_rate": 1.736106280355901e-05, + "loss": 5.5882, + "step": 6211 + }, + { + "epoch": 0.5990356798457088, + "grad_norm": 1.767552375793457, + "learning_rate": 1.735385162653739e-05, + "loss": 5.5189, + "step": 6212 + }, + { + "epoch": 0.5991321118611379, + "grad_norm": 1.7000213861465454, + "learning_rate": 1.734664115126984e-05, + "loss": 5.4765, + "step": 6213 + }, + { + "epoch": 0.599228543876567, + "grad_norm": 2.356579065322876, + "learning_rate": 1.7339431378418153e-05, + "loss": 5.5255, + "step": 6214 + }, + { + "epoch": 0.5993249758919962, + "grad_norm": 2.1003646850585938, + "learning_rate": 1.733222230864403e-05, + "loss": 5.4976, + "step": 6215 + }, + { + "epoch": 0.5994214079074253, + "grad_norm": 1.9043583869934082, + "learning_rate": 1.73250139426091e-05, + "loss": 5.3723, + "step": 6216 + }, + { + "epoch": 0.5995178399228543, + "grad_norm": 2.0945193767547607, + "learning_rate": 1.7317806280974945e-05, + "loss": 5.5092, + "step": 6217 + }, + { + "epoch": 0.5996142719382835, + "grad_norm": 1.967417597770691, + "learning_rate": 1.7310599324403075e-05, + "loss": 5.546, + "step": 6218 + }, + { + "epoch": 0.5997107039537126, + "grad_norm": 2.0280368328094482, + "learning_rate": 1.7303393073554937e-05, + "loss": 5.5819, + "step": 6219 + }, + { + "epoch": 0.5998071359691417, + "grad_norm": 1.833479642868042, + "learning_rate": 1.7296187529091904e-05, + "loss": 5.4954, + "step": 6220 + }, + { + "epoch": 0.5999035679845709, + "grad_norm": 1.808374047279358, + "learning_rate": 1.7288982691675305e-05, + "loss": 5.4252, + "step": 6221 + }, + { + "epoch": 0.6, + "grad_norm": 2.3886728286743164, + "learning_rate": 1.7281778561966378e-05, + "loss": 5.26, + "step": 6222 + }, + { + "epoch": 0.6000964320154292, + "grad_norm": 2.1506919860839844, + "learning_rate": 1.7274575140626318e-05, + "loss": 5.6494, + "step": 6223 + }, + { + "epoch": 0.6001928640308583, + "grad_norm": 1.8776265382766724, + "learning_rate": 1.7267372428316244e-05, + "loss": 5.5674, + "step": 6224 + }, + { + "epoch": 0.6002892960462873, + "grad_norm": 2.233628273010254, + "learning_rate": 1.7260170425697203e-05, + "loss": 5.3805, + "step": 6225 + }, + { + "epoch": 0.6003857280617165, + "grad_norm": 2.024728775024414, + "learning_rate": 1.7252969133430208e-05, + "loss": 5.4615, + "step": 6226 + }, + { + "epoch": 0.6004821600771456, + "grad_norm": 1.7472560405731201, + "learning_rate": 1.7245768552176156e-05, + "loss": 5.5031, + "step": 6227 + }, + { + "epoch": 0.6005785920925747, + "grad_norm": 1.7278062105178833, + "learning_rate": 1.7238568682595934e-05, + "loss": 5.3705, + "step": 6228 + }, + { + "epoch": 0.6006750241080039, + "grad_norm": 1.5679148435592651, + "learning_rate": 1.7231369525350326e-05, + "loss": 5.3573, + "step": 6229 + }, + { + "epoch": 0.600771456123433, + "grad_norm": 2.0116419792175293, + "learning_rate": 1.7224171081100048e-05, + "loss": 5.5665, + "step": 6230 + }, + { + "epoch": 0.600867888138862, + "grad_norm": 1.9903664588928223, + "learning_rate": 1.7216973350505795e-05, + "loss": 5.6128, + "step": 6231 + }, + { + "epoch": 0.6009643201542912, + "grad_norm": 1.5868442058563232, + "learning_rate": 1.720977633422814e-05, + "loss": 5.4696, + "step": 6232 + }, + { + "epoch": 0.6010607521697203, + "grad_norm": 1.7613911628723145, + "learning_rate": 1.7202580032927622e-05, + "loss": 5.5796, + "step": 6233 + }, + { + "epoch": 0.6011571841851495, + "grad_norm": 1.6214139461517334, + "learning_rate": 1.7195384447264717e-05, + "loss": 5.4322, + "step": 6234 + }, + { + "epoch": 0.6012536162005786, + "grad_norm": 1.8765355348587036, + "learning_rate": 1.718818957789981e-05, + "loss": 5.5229, + "step": 6235 + }, + { + "epoch": 0.6013500482160077, + "grad_norm": 1.5339460372924805, + "learning_rate": 1.7180995425493262e-05, + "loss": 5.4162, + "step": 6236 + }, + { + "epoch": 0.6014464802314369, + "grad_norm": 1.516581654548645, + "learning_rate": 1.717380199070533e-05, + "loss": 5.5524, + "step": 6237 + }, + { + "epoch": 0.601542912246866, + "grad_norm": 2.2232155799865723, + "learning_rate": 1.716660927419621e-05, + "loss": 5.4485, + "step": 6238 + }, + { + "epoch": 0.601639344262295, + "grad_norm": 2.098419666290283, + "learning_rate": 1.7159417276626056e-05, + "loss": 5.4388, + "step": 6239 + }, + { + "epoch": 0.6017357762777242, + "grad_norm": 1.4342005252838135, + "learning_rate": 1.7152225998654935e-05, + "loss": 5.4715, + "step": 6240 + }, + { + "epoch": 0.6018322082931533, + "grad_norm": 1.624247431755066, + "learning_rate": 1.7145035440942847e-05, + "loss": 5.5195, + "step": 6241 + }, + { + "epoch": 0.6019286403085824, + "grad_norm": 2.173987627029419, + "learning_rate": 1.713784560414974e-05, + "loss": 5.4104, + "step": 6242 + }, + { + "epoch": 0.6020250723240116, + "grad_norm": 1.6898550987243652, + "learning_rate": 1.7130656488935493e-05, + "loss": 5.5314, + "step": 6243 + }, + { + "epoch": 0.6021215043394407, + "grad_norm": 1.7153130769729614, + "learning_rate": 1.7123468095959897e-05, + "loss": 5.5573, + "step": 6244 + }, + { + "epoch": 0.6022179363548699, + "grad_norm": 1.989421010017395, + "learning_rate": 1.7116280425882714e-05, + "loss": 5.3281, + "step": 6245 + }, + { + "epoch": 0.602314368370299, + "grad_norm": 2.162959098815918, + "learning_rate": 1.710909347936361e-05, + "loss": 5.4569, + "step": 6246 + }, + { + "epoch": 0.602410800385728, + "grad_norm": 2.1100754737854004, + "learning_rate": 1.710190725706219e-05, + "loss": 5.4687, + "step": 6247 + }, + { + "epoch": 0.6025072324011572, + "grad_norm": 2.1507620811462402, + "learning_rate": 1.7094721759638004e-05, + "loss": 5.4355, + "step": 6248 + }, + { + "epoch": 0.6026036644165863, + "grad_norm": 2.469527006149292, + "learning_rate": 1.7087536987750514e-05, + "loss": 5.5368, + "step": 6249 + }, + { + "epoch": 0.6027000964320154, + "grad_norm": 1.814831018447876, + "learning_rate": 1.7080352942059157e-05, + "loss": 5.4384, + "step": 6250 + }, + { + "epoch": 0.6027965284474446, + "grad_norm": 1.527823567390442, + "learning_rate": 1.7073169623223257e-05, + "loss": 5.5573, + "step": 6251 + }, + { + "epoch": 0.6028929604628737, + "grad_norm": 1.6397954225540161, + "learning_rate": 1.706598703190208e-05, + "loss": 5.4872, + "step": 6252 + }, + { + "epoch": 0.6029893924783027, + "grad_norm": 1.7331373691558838, + "learning_rate": 1.705880516875486e-05, + "loss": 5.5342, + "step": 6253 + }, + { + "epoch": 0.6030858244937319, + "grad_norm": 1.491118311882019, + "learning_rate": 1.705162403444073e-05, + "loss": 5.5165, + "step": 6254 + }, + { + "epoch": 0.603182256509161, + "grad_norm": 1.597209095954895, + "learning_rate": 1.704444362961875e-05, + "loss": 5.6104, + "step": 6255 + }, + { + "epoch": 0.6032786885245902, + "grad_norm": 1.9568842649459839, + "learning_rate": 1.7037263954947953e-05, + "loss": 5.1953, + "step": 6256 + }, + { + "epoch": 0.6033751205400193, + "grad_norm": 1.6808255910873413, + "learning_rate": 1.703008501108726e-05, + "loss": 5.5495, + "step": 6257 + }, + { + "epoch": 0.6034715525554484, + "grad_norm": 1.1855933666229248, + "learning_rate": 1.7022906798695564e-05, + "loss": 5.5109, + "step": 6258 + }, + { + "epoch": 0.6035679845708776, + "grad_norm": 1.8863792419433594, + "learning_rate": 1.7015729318431665e-05, + "loss": 5.6447, + "step": 6259 + }, + { + "epoch": 0.6036644165863067, + "grad_norm": 1.8678009510040283, + "learning_rate": 1.700855257095429e-05, + "loss": 5.5183, + "step": 6260 + }, + { + "epoch": 0.6037608486017357, + "grad_norm": 1.4227031469345093, + "learning_rate": 1.7001376556922137e-05, + "loss": 5.4482, + "step": 6261 + }, + { + "epoch": 0.6038572806171649, + "grad_norm": 1.527625560760498, + "learning_rate": 1.6994201276993794e-05, + "loss": 5.4642, + "step": 6262 + }, + { + "epoch": 0.603953712632594, + "grad_norm": 1.9133617877960205, + "learning_rate": 1.6987026731827803e-05, + "loss": 5.5141, + "step": 6263 + }, + { + "epoch": 0.6040501446480231, + "grad_norm": 1.7403829097747803, + "learning_rate": 1.6979852922082633e-05, + "loss": 5.4536, + "step": 6264 + }, + { + "epoch": 0.6041465766634523, + "grad_norm": 1.7684069871902466, + "learning_rate": 1.69726798484167e-05, + "loss": 5.392, + "step": 6265 + }, + { + "epoch": 0.6042430086788814, + "grad_norm": 1.5122978687286377, + "learning_rate": 1.696550751148832e-05, + "loss": 5.4119, + "step": 6266 + }, + { + "epoch": 0.6043394406943106, + "grad_norm": 1.605761170387268, + "learning_rate": 1.6958335911955782e-05, + "loss": 5.4172, + "step": 6267 + }, + { + "epoch": 0.6044358727097396, + "grad_norm": 1.7753323316574097, + "learning_rate": 1.6951165050477267e-05, + "loss": 5.4819, + "step": 6268 + }, + { + "epoch": 0.6045323047251687, + "grad_norm": 1.3747479915618896, + "learning_rate": 1.6943994927710924e-05, + "loss": 5.4199, + "step": 6269 + }, + { + "epoch": 0.6046287367405979, + "grad_norm": 1.946043848991394, + "learning_rate": 1.6936825544314812e-05, + "loss": 5.2988, + "step": 6270 + }, + { + "epoch": 0.604725168756027, + "grad_norm": 1.7127050161361694, + "learning_rate": 1.692965690094692e-05, + "loss": 5.2784, + "step": 6271 + }, + { + "epoch": 0.6048216007714561, + "grad_norm": 1.472115159034729, + "learning_rate": 1.6922488998265194e-05, + "loss": 5.4408, + "step": 6272 + }, + { + "epoch": 0.6049180327868853, + "grad_norm": 1.4284151792526245, + "learning_rate": 1.6915321836927483e-05, + "loss": 5.4564, + "step": 6273 + }, + { + "epoch": 0.6050144648023144, + "grad_norm": 1.6903237104415894, + "learning_rate": 1.6908155417591575e-05, + "loss": 5.5285, + "step": 6274 + }, + { + "epoch": 0.6051108968177434, + "grad_norm": 1.524588704109192, + "learning_rate": 1.6900989740915212e-05, + "loss": 5.441, + "step": 6275 + }, + { + "epoch": 0.6052073288331726, + "grad_norm": 1.76333487033844, + "learning_rate": 1.6893824807556048e-05, + "loss": 5.3729, + "step": 6276 + }, + { + "epoch": 0.6053037608486017, + "grad_norm": 1.612895131111145, + "learning_rate": 1.688666061817165e-05, + "loss": 5.4284, + "step": 6277 + }, + { + "epoch": 0.6054001928640309, + "grad_norm": 1.490536093711853, + "learning_rate": 1.6879497173419566e-05, + "loss": 5.3452, + "step": 6278 + }, + { + "epoch": 0.60549662487946, + "grad_norm": 1.307349443435669, + "learning_rate": 1.6872334473957225e-05, + "loss": 5.3889, + "step": 6279 + }, + { + "epoch": 0.6055930568948891, + "grad_norm": 2.0149242877960205, + "learning_rate": 1.6865172520442036e-05, + "loss": 5.3802, + "step": 6280 + }, + { + "epoch": 0.6056894889103183, + "grad_norm": 1.584661602973938, + "learning_rate": 1.68580113135313e-05, + "loss": 5.4546, + "step": 6281 + }, + { + "epoch": 0.6057859209257473, + "grad_norm": 1.48549222946167, + "learning_rate": 1.6850850853882254e-05, + "loss": 5.4576, + "step": 6282 + }, + { + "epoch": 0.6058823529411764, + "grad_norm": 2.0016870498657227, + "learning_rate": 1.6843691142152096e-05, + "loss": 5.6316, + "step": 6283 + }, + { + "epoch": 0.6059787849566056, + "grad_norm": 1.9653514623641968, + "learning_rate": 1.683653217899792e-05, + "loss": 5.1824, + "step": 6284 + }, + { + "epoch": 0.6060752169720347, + "grad_norm": 1.820481300354004, + "learning_rate": 1.6829373965076778e-05, + "loss": 5.396, + "step": 6285 + }, + { + "epoch": 0.6061716489874638, + "grad_norm": 1.5956141948699951, + "learning_rate": 1.6822216501045633e-05, + "loss": 5.5299, + "step": 6286 + }, + { + "epoch": 0.606268081002893, + "grad_norm": 1.3374767303466797, + "learning_rate": 1.6815059787561397e-05, + "loss": 5.5628, + "step": 6287 + }, + { + "epoch": 0.6063645130183221, + "grad_norm": 1.434019684791565, + "learning_rate": 1.680790382528089e-05, + "loss": 5.482, + "step": 6288 + }, + { + "epoch": 0.6064609450337513, + "grad_norm": 1.2015116214752197, + "learning_rate": 1.68007486148609e-05, + "loss": 5.5549, + "step": 6289 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 1.367771863937378, + "learning_rate": 1.6793594156958097e-05, + "loss": 5.594, + "step": 6290 + }, + { + "epoch": 0.6066538090646094, + "grad_norm": 1.3612245321273804, + "learning_rate": 1.6786440452229134e-05, + "loss": 5.5929, + "step": 6291 + }, + { + "epoch": 0.6067502410800386, + "grad_norm": 1.0407084226608276, + "learning_rate": 1.6779287501330553e-05, + "loss": 5.5675, + "step": 6292 + }, + { + "epoch": 0.6068466730954677, + "grad_norm": 1.1188079118728638, + "learning_rate": 1.677213530491884e-05, + "loss": 5.5406, + "step": 6293 + }, + { + "epoch": 0.6069431051108968, + "grad_norm": 1.2558422088623047, + "learning_rate": 1.6764983863650436e-05, + "loss": 5.5031, + "step": 6294 + }, + { + "epoch": 0.607039537126326, + "grad_norm": 1.0352121591567993, + "learning_rate": 1.675783317818167e-05, + "loss": 5.4627, + "step": 6295 + }, + { + "epoch": 0.607135969141755, + "grad_norm": 1.0222293138504028, + "learning_rate": 1.6750683249168825e-05, + "loss": 5.453, + "step": 6296 + }, + { + "epoch": 0.6072324011571841, + "grad_norm": 1.410583734512329, + "learning_rate": 1.6743534077268127e-05, + "loss": 5.5665, + "step": 6297 + }, + { + "epoch": 0.6073288331726133, + "grad_norm": 1.5696990489959717, + "learning_rate": 1.6736385663135708e-05, + "loss": 5.3332, + "step": 6298 + }, + { + "epoch": 0.6074252651880424, + "grad_norm": 1.3132541179656982, + "learning_rate": 1.6729238007427635e-05, + "loss": 5.549, + "step": 6299 + }, + { + "epoch": 0.6075216972034716, + "grad_norm": 1.3033958673477173, + "learning_rate": 1.6722091110799925e-05, + "loss": 5.3175, + "step": 6300 + }, + { + "epoch": 0.6076181292189007, + "grad_norm": 1.258044719696045, + "learning_rate": 1.6714944973908496e-05, + "loss": 5.4959, + "step": 6301 + }, + { + "epoch": 0.6077145612343298, + "grad_norm": 1.3532670736312866, + "learning_rate": 1.670779959740923e-05, + "loss": 5.4884, + "step": 6302 + }, + { + "epoch": 0.607810993249759, + "grad_norm": 1.503623604774475, + "learning_rate": 1.670065498195791e-05, + "loss": 5.4586, + "step": 6303 + }, + { + "epoch": 0.607907425265188, + "grad_norm": 1.3135759830474854, + "learning_rate": 1.6693511128210253e-05, + "loss": 5.3794, + "step": 6304 + }, + { + "epoch": 0.6080038572806171, + "grad_norm": 1.3303444385528564, + "learning_rate": 1.668636803682193e-05, + "loss": 5.3665, + "step": 6305 + }, + { + "epoch": 0.6081002892960463, + "grad_norm": 1.5493271350860596, + "learning_rate": 1.667922570844851e-05, + "loss": 5.4479, + "step": 6306 + }, + { + "epoch": 0.6081967213114754, + "grad_norm": 1.3986568450927734, + "learning_rate": 1.6672084143745515e-05, + "loss": 5.4278, + "step": 6307 + }, + { + "epoch": 0.6082931533269045, + "grad_norm": 1.350487470626831, + "learning_rate": 1.6664943343368384e-05, + "loss": 5.4335, + "step": 6308 + }, + { + "epoch": 0.6083895853423337, + "grad_norm": 1.2213155031204224, + "learning_rate": 1.6657803307972502e-05, + "loss": 5.5877, + "step": 6309 + }, + { + "epoch": 0.6084860173577628, + "grad_norm": 1.4692251682281494, + "learning_rate": 1.6650664038213154e-05, + "loss": 5.4378, + "step": 6310 + }, + { + "epoch": 0.608582449373192, + "grad_norm": 1.83259916305542, + "learning_rate": 1.6643525534745597e-05, + "loss": 5.4568, + "step": 6311 + }, + { + "epoch": 0.608678881388621, + "grad_norm": 1.5187543630599976, + "learning_rate": 1.6636387798224967e-05, + "loss": 5.3777, + "step": 6312 + }, + { + "epoch": 0.6087753134040501, + "grad_norm": 1.845274806022644, + "learning_rate": 1.6629250829306384e-05, + "loss": 5.5597, + "step": 6313 + }, + { + "epoch": 0.6088717454194793, + "grad_norm": 1.9609813690185547, + "learning_rate": 1.6622114628644854e-05, + "loss": 5.3358, + "step": 6314 + }, + { + "epoch": 0.6089681774349084, + "grad_norm": 1.3944987058639526, + "learning_rate": 1.6614979196895324e-05, + "loss": 5.4932, + "step": 6315 + }, + { + "epoch": 0.6090646094503375, + "grad_norm": 1.7051979303359985, + "learning_rate": 1.6607844534712697e-05, + "loss": 5.5665, + "step": 6316 + }, + { + "epoch": 0.6091610414657667, + "grad_norm": 1.7053176164627075, + "learning_rate": 1.6600710642751764e-05, + "loss": 5.402, + "step": 6317 + }, + { + "epoch": 0.6092574734811957, + "grad_norm": 1.6313395500183105, + "learning_rate": 1.659357752166726e-05, + "loss": 5.452, + "step": 6318 + }, + { + "epoch": 0.6093539054966248, + "grad_norm": 1.476577877998352, + "learning_rate": 1.6586445172113877e-05, + "loss": 5.4645, + "step": 6319 + }, + { + "epoch": 0.609450337512054, + "grad_norm": 2.1832103729248047, + "learning_rate": 1.6579313594746198e-05, + "loss": 5.5491, + "step": 6320 + }, + { + "epoch": 0.6095467695274831, + "grad_norm": 1.322946310043335, + "learning_rate": 1.657218279021875e-05, + "loss": 5.5393, + "step": 6321 + }, + { + "epoch": 0.6096432015429123, + "grad_norm": 1.1095694303512573, + "learning_rate": 1.6565052759185995e-05, + "loss": 5.4489, + "step": 6322 + }, + { + "epoch": 0.6097396335583414, + "grad_norm": 1.5337810516357422, + "learning_rate": 1.6557923502302307e-05, + "loss": 5.2455, + "step": 6323 + }, + { + "epoch": 0.6098360655737705, + "grad_norm": 2.0775387287139893, + "learning_rate": 1.655079502022202e-05, + "loss": 5.4908, + "step": 6324 + }, + { + "epoch": 0.6099324975891997, + "grad_norm": 1.4606736898422241, + "learning_rate": 1.6543667313599365e-05, + "loss": 5.3365, + "step": 6325 + }, + { + "epoch": 0.6100289296046287, + "grad_norm": 1.4911667108535767, + "learning_rate": 1.6536540383088515e-05, + "loss": 5.409, + "step": 6326 + }, + { + "epoch": 0.6101253616200578, + "grad_norm": 1.4064422845840454, + "learning_rate": 1.6529414229343576e-05, + "loss": 5.5142, + "step": 6327 + }, + { + "epoch": 0.610221793635487, + "grad_norm": 1.3764253854751587, + "learning_rate": 1.6522288853018566e-05, + "loss": 5.4784, + "step": 6328 + }, + { + "epoch": 0.6103182256509161, + "grad_norm": 1.1944921016693115, + "learning_rate": 1.651516425476745e-05, + "loss": 5.445, + "step": 6329 + }, + { + "epoch": 0.6104146576663452, + "grad_norm": 1.4174102544784546, + "learning_rate": 1.650804043524412e-05, + "loss": 5.339, + "step": 6330 + }, + { + "epoch": 0.6105110896817744, + "grad_norm": 1.9128037691116333, + "learning_rate": 1.650091739510239e-05, + "loss": 5.5101, + "step": 6331 + }, + { + "epoch": 0.6106075216972034, + "grad_norm": 2.2763497829437256, + "learning_rate": 1.6493795134995992e-05, + "loss": 5.4161, + "step": 6332 + }, + { + "epoch": 0.6107039537126326, + "grad_norm": 2.151259422302246, + "learning_rate": 1.6486673655578617e-05, + "loss": 5.4847, + "step": 6333 + }, + { + "epoch": 0.6108003857280617, + "grad_norm": 1.6133860349655151, + "learning_rate": 1.6479552957503844e-05, + "loss": 5.495, + "step": 6334 + }, + { + "epoch": 0.6108968177434908, + "grad_norm": 2.3457283973693848, + "learning_rate": 1.6472433041425222e-05, + "loss": 5.5077, + "step": 6335 + }, + { + "epoch": 0.61099324975892, + "grad_norm": 2.860154867172241, + "learning_rate": 1.6465313907996204e-05, + "loss": 5.402, + "step": 6336 + }, + { + "epoch": 0.6110896817743491, + "grad_norm": 1.828184723854065, + "learning_rate": 1.645819555787016e-05, + "loss": 5.2979, + "step": 6337 + }, + { + "epoch": 0.6111861137897782, + "grad_norm": 1.2384029626846313, + "learning_rate": 1.645107799170042e-05, + "loss": 5.3844, + "step": 6338 + }, + { + "epoch": 0.6112825458052074, + "grad_norm": 2.1908328533172607, + "learning_rate": 1.6443961210140228e-05, + "loss": 5.5728, + "step": 6339 + }, + { + "epoch": 0.6113789778206364, + "grad_norm": 1.8434118032455444, + "learning_rate": 1.643684521384273e-05, + "loss": 5.5259, + "step": 6340 + }, + { + "epoch": 0.6114754098360655, + "grad_norm": 1.3927327394485474, + "learning_rate": 1.642973000346105e-05, + "loss": 5.3529, + "step": 6341 + }, + { + "epoch": 0.6115718418514947, + "grad_norm": 1.8431344032287598, + "learning_rate": 1.6422615579648202e-05, + "loss": 5.3612, + "step": 6342 + }, + { + "epoch": 0.6116682738669238, + "grad_norm": 1.7631804943084717, + "learning_rate": 1.6415501943057127e-05, + "loss": 5.3165, + "step": 6343 + }, + { + "epoch": 0.611764705882353, + "grad_norm": 2.256368637084961, + "learning_rate": 1.640838909434073e-05, + "loss": 5.3048, + "step": 6344 + }, + { + "epoch": 0.6118611378977821, + "grad_norm": 2.0317962169647217, + "learning_rate": 1.6401277034151798e-05, + "loss": 5.3983, + "step": 6345 + }, + { + "epoch": 0.6119575699132112, + "grad_norm": 1.462544560432434, + "learning_rate": 1.6394165763143083e-05, + "loss": 5.449, + "step": 6346 + }, + { + "epoch": 0.6120540019286403, + "grad_norm": 1.6894539594650269, + "learning_rate": 1.638705528196724e-05, + "loss": 5.3802, + "step": 6347 + }, + { + "epoch": 0.6121504339440694, + "grad_norm": 1.5203042030334473, + "learning_rate": 1.6379945591276863e-05, + "loss": 5.4121, + "step": 6348 + }, + { + "epoch": 0.6122468659594985, + "grad_norm": 1.582154631614685, + "learning_rate": 1.6372836691724468e-05, + "loss": 5.4023, + "step": 6349 + }, + { + "epoch": 0.6123432979749277, + "grad_norm": 1.5406423807144165, + "learning_rate": 1.636572858396251e-05, + "loss": 5.3721, + "step": 6350 + }, + { + "epoch": 0.6124397299903568, + "grad_norm": 1.8762234449386597, + "learning_rate": 1.6358621268643347e-05, + "loss": 5.4315, + "step": 6351 + }, + { + "epoch": 0.6125361620057859, + "grad_norm": 1.5209747552871704, + "learning_rate": 1.6351514746419293e-05, + "loss": 5.4905, + "step": 6352 + }, + { + "epoch": 0.6126325940212151, + "grad_norm": 1.390978217124939, + "learning_rate": 1.6344409017942574e-05, + "loss": 5.3866, + "step": 6353 + }, + { + "epoch": 0.6127290260366441, + "grad_norm": 1.7482848167419434, + "learning_rate": 1.6337304083865333e-05, + "loss": 5.4337, + "step": 6354 + }, + { + "epoch": 0.6128254580520733, + "grad_norm": 1.6318365335464478, + "learning_rate": 1.6330199944839668e-05, + "loss": 5.331, + "step": 6355 + }, + { + "epoch": 0.6129218900675024, + "grad_norm": 1.6373534202575684, + "learning_rate": 1.6323096601517573e-05, + "loss": 5.3999, + "step": 6356 + }, + { + "epoch": 0.6130183220829315, + "grad_norm": 1.7716082334518433, + "learning_rate": 1.6315994054551003e-05, + "loss": 5.463, + "step": 6357 + }, + { + "epoch": 0.6131147540983607, + "grad_norm": 1.6466774940490723, + "learning_rate": 1.630889230459181e-05, + "loss": 5.4867, + "step": 6358 + }, + { + "epoch": 0.6132111861137898, + "grad_norm": 1.6753699779510498, + "learning_rate": 1.6301791352291773e-05, + "loss": 5.3381, + "step": 6359 + }, + { + "epoch": 0.6133076181292189, + "grad_norm": 1.377506136894226, + "learning_rate": 1.6294691198302634e-05, + "loss": 5.3081, + "step": 6360 + }, + { + "epoch": 0.613404050144648, + "grad_norm": 1.4655506610870361, + "learning_rate": 1.628759184327602e-05, + "loss": 5.2963, + "step": 6361 + }, + { + "epoch": 0.6135004821600771, + "grad_norm": 1.5903615951538086, + "learning_rate": 1.6280493287863492e-05, + "loss": 5.4556, + "step": 6362 + }, + { + "epoch": 0.6135969141755062, + "grad_norm": 1.52695894241333, + "learning_rate": 1.6273395532716566e-05, + "loss": 5.4355, + "step": 6363 + }, + { + "epoch": 0.6136933461909354, + "grad_norm": 1.4211838245391846, + "learning_rate": 1.6266298578486662e-05, + "loss": 5.4752, + "step": 6364 + }, + { + "epoch": 0.6137897782063645, + "grad_norm": 1.9399594068527222, + "learning_rate": 1.6259202425825117e-05, + "loss": 5.4147, + "step": 6365 + }, + { + "epoch": 0.6138862102217937, + "grad_norm": 1.597341775894165, + "learning_rate": 1.625210707538322e-05, + "loss": 5.5989, + "step": 6366 + }, + { + "epoch": 0.6139826422372228, + "grad_norm": 1.6842056512832642, + "learning_rate": 1.624501252781216e-05, + "loss": 5.5103, + "step": 6367 + }, + { + "epoch": 0.6140790742526518, + "grad_norm": 1.5454353094100952, + "learning_rate": 1.6237918783763084e-05, + "loss": 5.3482, + "step": 6368 + }, + { + "epoch": 0.614175506268081, + "grad_norm": 1.6137319803237915, + "learning_rate": 1.6230825843887035e-05, + "loss": 5.3615, + "step": 6369 + }, + { + "epoch": 0.6142719382835101, + "grad_norm": 1.6250489950180054, + "learning_rate": 1.6223733708834994e-05, + "loss": 5.2739, + "step": 6370 + }, + { + "epoch": 0.6143683702989392, + "grad_norm": 1.505279779434204, + "learning_rate": 1.6216642379257875e-05, + "loss": 5.4526, + "step": 6371 + }, + { + "epoch": 0.6144648023143684, + "grad_norm": 1.6258294582366943, + "learning_rate": 1.620955185580651e-05, + "loss": 5.4589, + "step": 6372 + }, + { + "epoch": 0.6145612343297975, + "grad_norm": 1.2493916749954224, + "learning_rate": 1.6202462139131645e-05, + "loss": 5.5329, + "step": 6373 + }, + { + "epoch": 0.6146576663452266, + "grad_norm": 1.4797614812850952, + "learning_rate": 1.6195373229883993e-05, + "loss": 5.4046, + "step": 6374 + }, + { + "epoch": 0.6147540983606558, + "grad_norm": 1.5083616971969604, + "learning_rate": 1.6188285128714142e-05, + "loss": 5.4339, + "step": 6375 + }, + { + "epoch": 0.6148505303760848, + "grad_norm": 1.6497890949249268, + "learning_rate": 1.618119783627263e-05, + "loss": 5.4499, + "step": 6376 + }, + { + "epoch": 0.614946962391514, + "grad_norm": 1.6018253564834595, + "learning_rate": 1.6174111353209935e-05, + "loss": 5.4973, + "step": 6377 + }, + { + "epoch": 0.6150433944069431, + "grad_norm": 1.936540126800537, + "learning_rate": 1.616702568017643e-05, + "loss": 5.4155, + "step": 6378 + }, + { + "epoch": 0.6151398264223722, + "grad_norm": 1.65013587474823, + "learning_rate": 1.6159940817822444e-05, + "loss": 5.4655, + "step": 6379 + }, + { + "epoch": 0.6152362584378014, + "grad_norm": 1.5616995096206665, + "learning_rate": 1.615285676679821e-05, + "loss": 5.3965, + "step": 6380 + }, + { + "epoch": 0.6153326904532305, + "grad_norm": 1.4362242221832275, + "learning_rate": 1.6145773527753883e-05, + "loss": 5.4573, + "step": 6381 + }, + { + "epoch": 0.6154291224686596, + "grad_norm": 1.8678089380264282, + "learning_rate": 1.6138691101339575e-05, + "loss": 5.5104, + "step": 6382 + }, + { + "epoch": 0.6155255544840887, + "grad_norm": 1.8829981088638306, + "learning_rate": 1.613160948820529e-05, + "loss": 5.3816, + "step": 6383 + }, + { + "epoch": 0.6156219864995178, + "grad_norm": 1.6287648677825928, + "learning_rate": 1.612452868900096e-05, + "loss": 5.473, + "step": 6384 + }, + { + "epoch": 0.6157184185149469, + "grad_norm": 1.5764888525009155, + "learning_rate": 1.6117448704376475e-05, + "loss": 5.4616, + "step": 6385 + }, + { + "epoch": 0.6158148505303761, + "grad_norm": 1.4074509143829346, + "learning_rate": 1.611036953498161e-05, + "loss": 5.3055, + "step": 6386 + }, + { + "epoch": 0.6159112825458052, + "grad_norm": 1.2886801958084106, + "learning_rate": 1.6103291181466086e-05, + "loss": 5.4392, + "step": 6387 + }, + { + "epoch": 0.6160077145612344, + "grad_norm": 1.968427062034607, + "learning_rate": 1.6096213644479552e-05, + "loss": 5.2921, + "step": 6388 + }, + { + "epoch": 0.6161041465766635, + "grad_norm": 1.7130018472671509, + "learning_rate": 1.608913692467156e-05, + "loss": 5.6112, + "step": 6389 + }, + { + "epoch": 0.6162005785920925, + "grad_norm": 1.4192399978637695, + "learning_rate": 1.6082061022691626e-05, + "loss": 5.5824, + "step": 6390 + }, + { + "epoch": 0.6162970106075217, + "grad_norm": 1.3708062171936035, + "learning_rate": 1.6074985939189153e-05, + "loss": 5.4235, + "step": 6391 + }, + { + "epoch": 0.6163934426229508, + "grad_norm": 1.6701611280441284, + "learning_rate": 1.606791167481348e-05, + "loss": 5.3731, + "step": 6392 + }, + { + "epoch": 0.6164898746383799, + "grad_norm": 1.9572240114212036, + "learning_rate": 1.6060838230213883e-05, + "loss": 5.5981, + "step": 6393 + }, + { + "epoch": 0.6165863066538091, + "grad_norm": 1.5590169429779053, + "learning_rate": 1.6053765606039554e-05, + "loss": 5.3428, + "step": 6394 + }, + { + "epoch": 0.6166827386692382, + "grad_norm": 2.435472011566162, + "learning_rate": 1.60466938029396e-05, + "loss": 5.5118, + "step": 6395 + }, + { + "epoch": 0.6167791706846673, + "grad_norm": 2.1689984798431396, + "learning_rate": 1.6039622821563078e-05, + "loss": 5.6524, + "step": 6396 + }, + { + "epoch": 0.6168756027000964, + "grad_norm": 1.5441373586654663, + "learning_rate": 1.603255266255894e-05, + "loss": 5.5496, + "step": 6397 + }, + { + "epoch": 0.6169720347155255, + "grad_norm": 2.7724902629852295, + "learning_rate": 1.6025483326576078e-05, + "loss": 5.5991, + "step": 6398 + }, + { + "epoch": 0.6170684667309547, + "grad_norm": 2.3479344844818115, + "learning_rate": 1.6018414814263317e-05, + "loss": 5.501, + "step": 6399 + }, + { + "epoch": 0.6171648987463838, + "grad_norm": 1.906103253364563, + "learning_rate": 1.6011347126269382e-05, + "loss": 5.4778, + "step": 6400 + }, + { + "epoch": 0.6172613307618129, + "grad_norm": 1.3826022148132324, + "learning_rate": 1.6004280263242953e-05, + "loss": 5.3076, + "step": 6401 + }, + { + "epoch": 0.6173577627772421, + "grad_norm": 1.6923601627349854, + "learning_rate": 1.599721422583261e-05, + "loss": 5.1593, + "step": 6402 + }, + { + "epoch": 0.6174541947926712, + "grad_norm": 1.6251702308654785, + "learning_rate": 1.5990149014686852e-05, + "loss": 5.1279, + "step": 6403 + }, + { + "epoch": 0.6175506268081002, + "grad_norm": 1.8317747116088867, + "learning_rate": 1.5983084630454143e-05, + "loss": 5.4476, + "step": 6404 + }, + { + "epoch": 0.6176470588235294, + "grad_norm": 2.1475915908813477, + "learning_rate": 1.5976021073782826e-05, + "loss": 5.2696, + "step": 6405 + }, + { + "epoch": 0.6177434908389585, + "grad_norm": 2.3341760635375977, + "learning_rate": 1.5968958345321178e-05, + "loss": 5.1785, + "step": 6406 + }, + { + "epoch": 0.6178399228543876, + "grad_norm": 2.5320932865142822, + "learning_rate": 1.5961896445717427e-05, + "loss": 5.175, + "step": 6407 + }, + { + "epoch": 0.6179363548698168, + "grad_norm": 1.7467237710952759, + "learning_rate": 1.59548353756197e-05, + "loss": 5.423, + "step": 6408 + }, + { + "epoch": 0.6180327868852459, + "grad_norm": 2.2994115352630615, + "learning_rate": 1.5947775135676035e-05, + "loss": 5.3356, + "step": 6409 + }, + { + "epoch": 0.6181292189006751, + "grad_norm": 3.1574814319610596, + "learning_rate": 1.594071572653444e-05, + "loss": 5.3182, + "step": 6410 + }, + { + "epoch": 0.6182256509161042, + "grad_norm": 2.2994000911712646, + "learning_rate": 1.593365714884279e-05, + "loss": 5.3886, + "step": 6411 + }, + { + "epoch": 0.6183220829315332, + "grad_norm": 1.6100066900253296, + "learning_rate": 1.5926599403248942e-05, + "loss": 5.5011, + "step": 6412 + }, + { + "epoch": 0.6184185149469624, + "grad_norm": 1.9736613035202026, + "learning_rate": 1.591954249040063e-05, + "loss": 5.4452, + "step": 6413 + }, + { + "epoch": 0.6185149469623915, + "grad_norm": 2.398627281188965, + "learning_rate": 1.5912486410945528e-05, + "loss": 5.4144, + "step": 6414 + }, + { + "epoch": 0.6186113789778206, + "grad_norm": 1.7701852321624756, + "learning_rate": 1.590543116553124e-05, + "loss": 5.4384, + "step": 6415 + }, + { + "epoch": 0.6187078109932498, + "grad_norm": 1.4234226942062378, + "learning_rate": 1.589837675480529e-05, + "loss": 5.47, + "step": 6416 + }, + { + "epoch": 0.6188042430086789, + "grad_norm": 1.5574982166290283, + "learning_rate": 1.589132317941511e-05, + "loss": 5.4851, + "step": 6417 + }, + { + "epoch": 0.618900675024108, + "grad_norm": 1.4619815349578857, + "learning_rate": 1.588427044000809e-05, + "loss": 5.2916, + "step": 6418 + }, + { + "epoch": 0.6189971070395371, + "grad_norm": 1.155175805091858, + "learning_rate": 1.5877218537231504e-05, + "loss": 5.3569, + "step": 6419 + }, + { + "epoch": 0.6190935390549662, + "grad_norm": 1.7565966844558716, + "learning_rate": 1.5870167471732566e-05, + "loss": 5.3283, + "step": 6420 + }, + { + "epoch": 0.6191899710703954, + "grad_norm": 2.527730703353882, + "learning_rate": 1.5863117244158433e-05, + "loss": 5.4671, + "step": 6421 + }, + { + "epoch": 0.6192864030858245, + "grad_norm": 1.353524923324585, + "learning_rate": 1.585606785515614e-05, + "loss": 5.408, + "step": 6422 + }, + { + "epoch": 0.6193828351012536, + "grad_norm": 2.0853168964385986, + "learning_rate": 1.5849019305372696e-05, + "loss": 5.3446, + "step": 6423 + }, + { + "epoch": 0.6194792671166828, + "grad_norm": 1.6082592010498047, + "learning_rate": 1.5841971595454996e-05, + "loss": 5.3611, + "step": 6424 + }, + { + "epoch": 0.6195756991321119, + "grad_norm": 1.5181070566177368, + "learning_rate": 1.583492472604986e-05, + "loss": 5.3616, + "step": 6425 + }, + { + "epoch": 0.6196721311475409, + "grad_norm": 1.7294325828552246, + "learning_rate": 1.5827878697804066e-05, + "loss": 5.29, + "step": 6426 + }, + { + "epoch": 0.6197685631629701, + "grad_norm": 1.5413637161254883, + "learning_rate": 1.5820833511364276e-05, + "loss": 5.3336, + "step": 6427 + }, + { + "epoch": 0.6198649951783992, + "grad_norm": 2.2127392292022705, + "learning_rate": 1.5813789167377076e-05, + "loss": 5.3413, + "step": 6428 + }, + { + "epoch": 0.6199614271938283, + "grad_norm": 2.4476873874664307, + "learning_rate": 1.5806745666489008e-05, + "loss": 5.4338, + "step": 6429 + }, + { + "epoch": 0.6200578592092575, + "grad_norm": 1.8382172584533691, + "learning_rate": 1.5799703009346506e-05, + "loss": 5.5127, + "step": 6430 + }, + { + "epoch": 0.6201542912246866, + "grad_norm": 1.5685551166534424, + "learning_rate": 1.579266119659594e-05, + "loss": 5.5315, + "step": 6431 + }, + { + "epoch": 0.6202507232401158, + "grad_norm": 1.9152294397354126, + "learning_rate": 1.5785620228883598e-05, + "loss": 5.3756, + "step": 6432 + }, + { + "epoch": 0.6203471552555448, + "grad_norm": 2.166825771331787, + "learning_rate": 1.5778580106855683e-05, + "loss": 5.3646, + "step": 6433 + }, + { + "epoch": 0.6204435872709739, + "grad_norm": 2.195241689682007, + "learning_rate": 1.5771540831158343e-05, + "loss": 5.2088, + "step": 6434 + }, + { + "epoch": 0.6205400192864031, + "grad_norm": 2.142092227935791, + "learning_rate": 1.5764502402437625e-05, + "loss": 5.337, + "step": 6435 + }, + { + "epoch": 0.6206364513018322, + "grad_norm": 2.0858073234558105, + "learning_rate": 1.5757464821339508e-05, + "loss": 5.5278, + "step": 6436 + }, + { + "epoch": 0.6207328833172613, + "grad_norm": 2.509373426437378, + "learning_rate": 1.5750428088509895e-05, + "loss": 5.4022, + "step": 6437 + }, + { + "epoch": 0.6208293153326905, + "grad_norm": 1.6591358184814453, + "learning_rate": 1.5743392204594615e-05, + "loss": 5.4421, + "step": 6438 + }, + { + "epoch": 0.6209257473481196, + "grad_norm": 2.3116047382354736, + "learning_rate": 1.5736357170239392e-05, + "loss": 5.548, + "step": 6439 + }, + { + "epoch": 0.6210221793635486, + "grad_norm": 1.6378272771835327, + "learning_rate": 1.5729322986089918e-05, + "loss": 5.5864, + "step": 6440 + }, + { + "epoch": 0.6211186113789778, + "grad_norm": 1.4794867038726807, + "learning_rate": 1.5722289652791765e-05, + "loss": 5.4858, + "step": 6441 + }, + { + "epoch": 0.6212150433944069, + "grad_norm": 2.5345771312713623, + "learning_rate": 1.5715257170990444e-05, + "loss": 5.3572, + "step": 6442 + }, + { + "epoch": 0.6213114754098361, + "grad_norm": 1.8904120922088623, + "learning_rate": 1.57082255413314e-05, + "loss": 5.2643, + "step": 6443 + }, + { + "epoch": 0.6214079074252652, + "grad_norm": 1.8435518741607666, + "learning_rate": 1.570119476445997e-05, + "loss": 5.6182, + "step": 6444 + }, + { + "epoch": 0.6215043394406943, + "grad_norm": 2.1597883701324463, + "learning_rate": 1.5694164841021454e-05, + "loss": 5.5357, + "step": 6445 + }, + { + "epoch": 0.6216007714561235, + "grad_norm": 2.3162524700164795, + "learning_rate": 1.5687135771661028e-05, + "loss": 5.5775, + "step": 6446 + }, + { + "epoch": 0.6216972034715526, + "grad_norm": 2.41178035736084, + "learning_rate": 1.568010755702381e-05, + "loss": 5.301, + "step": 6447 + }, + { + "epoch": 0.6217936354869816, + "grad_norm": 2.022035837173462, + "learning_rate": 1.5673080197754864e-05, + "loss": 5.3338, + "step": 6448 + }, + { + "epoch": 0.6218900675024108, + "grad_norm": 2.4460694789886475, + "learning_rate": 1.5666053694499133e-05, + "loss": 5.5711, + "step": 6449 + }, + { + "epoch": 0.6219864995178399, + "grad_norm": 3.1294541358947754, + "learning_rate": 1.5659028047901496e-05, + "loss": 5.5371, + "step": 6450 + }, + { + "epoch": 0.622082931533269, + "grad_norm": 2.4555742740631104, + "learning_rate": 1.5652003258606782e-05, + "loss": 5.2306, + "step": 6451 + }, + { + "epoch": 0.6221793635486982, + "grad_norm": 2.220236301422119, + "learning_rate": 1.5644979327259696e-05, + "loss": 5.2972, + "step": 6452 + }, + { + "epoch": 0.6222757955641273, + "grad_norm": 2.157735824584961, + "learning_rate": 1.5637956254504894e-05, + "loss": 5.4974, + "step": 6453 + }, + { + "epoch": 0.6223722275795565, + "grad_norm": 2.2504587173461914, + "learning_rate": 1.5630934040986945e-05, + "loss": 5.4489, + "step": 6454 + }, + { + "epoch": 0.6224686595949855, + "grad_norm": 2.1994616985321045, + "learning_rate": 1.5623912687350334e-05, + "loss": 5.5932, + "step": 6455 + }, + { + "epoch": 0.6225650916104146, + "grad_norm": 2.171830177307129, + "learning_rate": 1.5616892194239486e-05, + "loss": 5.2789, + "step": 6456 + }, + { + "epoch": 0.6226615236258438, + "grad_norm": 1.905173659324646, + "learning_rate": 1.5609872562298717e-05, + "loss": 5.4249, + "step": 6457 + }, + { + "epoch": 0.6227579556412729, + "grad_norm": 1.4882245063781738, + "learning_rate": 1.5602853792172284e-05, + "loss": 5.5102, + "step": 6458 + }, + { + "epoch": 0.622854387656702, + "grad_norm": 1.454981803894043, + "learning_rate": 1.5595835884504367e-05, + "loss": 5.4335, + "step": 6459 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 1.2996755838394165, + "learning_rate": 1.558881883993906e-05, + "loss": 5.453, + "step": 6460 + }, + { + "epoch": 0.6230472516875603, + "grad_norm": 1.1284549236297607, + "learning_rate": 1.558180265912037e-05, + "loss": 5.4312, + "step": 6461 + }, + { + "epoch": 0.6231436837029893, + "grad_norm": 1.260876178741455, + "learning_rate": 1.5574787342692247e-05, + "loss": 5.3917, + "step": 6462 + }, + { + "epoch": 0.6232401157184185, + "grad_norm": 1.7895413637161255, + "learning_rate": 1.5567772891298547e-05, + "loss": 5.2166, + "step": 6463 + }, + { + "epoch": 0.6233365477338476, + "grad_norm": 1.4672083854675293, + "learning_rate": 1.556075930558303e-05, + "loss": 5.3881, + "step": 6464 + }, + { + "epoch": 0.6234329797492768, + "grad_norm": 2.0599653720855713, + "learning_rate": 1.555374658618941e-05, + "loss": 5.4388, + "step": 6465 + }, + { + "epoch": 0.6235294117647059, + "grad_norm": 2.7055838108062744, + "learning_rate": 1.55467347337613e-05, + "loss": 5.1741, + "step": 6466 + }, + { + "epoch": 0.623625843780135, + "grad_norm": 1.430856466293335, + "learning_rate": 1.5539723748942245e-05, + "loss": 5.4274, + "step": 6467 + }, + { + "epoch": 0.6237222757955642, + "grad_norm": 1.417527437210083, + "learning_rate": 1.5532713632375703e-05, + "loss": 5.5929, + "step": 6468 + }, + { + "epoch": 0.6238187078109932, + "grad_norm": 1.1876096725463867, + "learning_rate": 1.552570438470504e-05, + "loss": 5.3082, + "step": 6469 + }, + { + "epoch": 0.6239151398264223, + "grad_norm": 1.0741328001022339, + "learning_rate": 1.551869600657358e-05, + "loss": 5.3856, + "step": 6470 + }, + { + "epoch": 0.6240115718418515, + "grad_norm": 1.6933164596557617, + "learning_rate": 1.551168849862453e-05, + "loss": 5.4563, + "step": 6471 + }, + { + "epoch": 0.6241080038572806, + "grad_norm": 1.797311782836914, + "learning_rate": 1.5504681861501018e-05, + "loss": 5.5387, + "step": 6472 + }, + { + "epoch": 0.6242044358727097, + "grad_norm": 1.171932339668274, + "learning_rate": 1.5497676095846135e-05, + "loss": 5.3874, + "step": 6473 + }, + { + "epoch": 0.6243008678881389, + "grad_norm": 1.3890776634216309, + "learning_rate": 1.5490671202302836e-05, + "loss": 5.4831, + "step": 6474 + }, + { + "epoch": 0.624397299903568, + "grad_norm": 1.3984431028366089, + "learning_rate": 1.5483667181514026e-05, + "loss": 5.4653, + "step": 6475 + }, + { + "epoch": 0.6244937319189972, + "grad_norm": 1.7339576482772827, + "learning_rate": 1.5476664034122534e-05, + "loss": 5.5194, + "step": 6476 + }, + { + "epoch": 0.6245901639344262, + "grad_norm": 1.270532250404358, + "learning_rate": 1.5469661760771097e-05, + "loss": 5.4585, + "step": 6477 + }, + { + "epoch": 0.6246865959498553, + "grad_norm": 1.5781912803649902, + "learning_rate": 1.546266036210237e-05, + "loss": 5.4477, + "step": 6478 + }, + { + "epoch": 0.6247830279652845, + "grad_norm": 1.501891016960144, + "learning_rate": 1.5455659838758943e-05, + "loss": 5.2643, + "step": 6479 + }, + { + "epoch": 0.6248794599807136, + "grad_norm": 1.4353559017181396, + "learning_rate": 1.5448660191383297e-05, + "loss": 5.4261, + "step": 6480 + }, + { + "epoch": 0.6249758919961427, + "grad_norm": 1.8051460981369019, + "learning_rate": 1.5441661420617874e-05, + "loss": 5.4938, + "step": 6481 + }, + { + "epoch": 0.6250723240115719, + "grad_norm": 1.566380262374878, + "learning_rate": 1.5434663527105003e-05, + "loss": 5.2478, + "step": 6482 + }, + { + "epoch": 0.625168756027001, + "grad_norm": 1.1593761444091797, + "learning_rate": 1.5427666511486925e-05, + "loss": 5.3755, + "step": 6483 + }, + { + "epoch": 0.6252651880424301, + "grad_norm": 1.546067476272583, + "learning_rate": 1.542067037440585e-05, + "loss": 5.5583, + "step": 6484 + }, + { + "epoch": 0.6253616200578592, + "grad_norm": 1.7289092540740967, + "learning_rate": 1.5413675116503845e-05, + "loss": 5.3579, + "step": 6485 + }, + { + "epoch": 0.6254580520732883, + "grad_norm": 1.498066782951355, + "learning_rate": 1.540668073842295e-05, + "loss": 5.3602, + "step": 6486 + }, + { + "epoch": 0.6255544840887175, + "grad_norm": 1.4589663743972778, + "learning_rate": 1.539968724080509e-05, + "loss": 5.5026, + "step": 6487 + }, + { + "epoch": 0.6256509161041466, + "grad_norm": 1.6804972887039185, + "learning_rate": 1.5392694624292104e-05, + "loss": 5.5359, + "step": 6488 + }, + { + "epoch": 0.6257473481195757, + "grad_norm": 1.9410617351531982, + "learning_rate": 1.5385702889525795e-05, + "loss": 5.4074, + "step": 6489 + }, + { + "epoch": 0.6258437801350049, + "grad_norm": 2.0437495708465576, + "learning_rate": 1.537871203714784e-05, + "loss": 5.4576, + "step": 6490 + }, + { + "epoch": 0.6259402121504339, + "grad_norm": 1.4725559949874878, + "learning_rate": 1.537172206779984e-05, + "loss": 5.3631, + "step": 6491 + }, + { + "epoch": 0.626036644165863, + "grad_norm": 1.593124270439148, + "learning_rate": 1.5364732982123352e-05, + "loss": 5.5354, + "step": 6492 + }, + { + "epoch": 0.6261330761812922, + "grad_norm": 1.6080152988433838, + "learning_rate": 1.5357744780759813e-05, + "loss": 5.4879, + "step": 6493 + }, + { + "epoch": 0.6262295081967213, + "grad_norm": 1.4671961069107056, + "learning_rate": 1.5350757464350578e-05, + "loss": 5.2104, + "step": 6494 + }, + { + "epoch": 0.6263259402121505, + "grad_norm": 1.6770308017730713, + "learning_rate": 1.534377103353696e-05, + "loss": 4.6823, + "step": 6495 + }, + { + "epoch": 0.6264223722275796, + "grad_norm": 1.741676688194275, + "learning_rate": 1.5336785488960137e-05, + "loss": 4.6496, + "step": 6496 + }, + { + "epoch": 0.6265188042430087, + "grad_norm": 1.5443552732467651, + "learning_rate": 1.532980083126126e-05, + "loss": 4.8295, + "step": 6497 + }, + { + "epoch": 0.6266152362584378, + "grad_norm": 1.9144738912582397, + "learning_rate": 1.532281706108135e-05, + "loss": 5.5177, + "step": 6498 + }, + { + "epoch": 0.6267116682738669, + "grad_norm": 1.9217685461044312, + "learning_rate": 1.531583417906139e-05, + "loss": 5.4962, + "step": 6499 + }, + { + "epoch": 0.626808100289296, + "grad_norm": 2.505446672439575, + "learning_rate": 1.5308852185842244e-05, + "loss": 5.5711, + "step": 6500 + }, + { + "epoch": 0.6269045323047252, + "grad_norm": 2.258319616317749, + "learning_rate": 1.530187108206472e-05, + "loss": 5.4956, + "step": 6501 + }, + { + "epoch": 0.6270009643201543, + "grad_norm": 1.7433933019638062, + "learning_rate": 1.5294890868369525e-05, + "loss": 5.4482, + "step": 6502 + }, + { + "epoch": 0.6270973963355834, + "grad_norm": 2.503955841064453, + "learning_rate": 1.528791154539731e-05, + "loss": 5.7188, + "step": 6503 + }, + { + "epoch": 0.6271938283510126, + "grad_norm": 2.18326997756958, + "learning_rate": 1.528093311378862e-05, + "loss": 5.4954, + "step": 6504 + }, + { + "epoch": 0.6272902603664416, + "grad_norm": 2.19061017036438, + "learning_rate": 1.5273955574183916e-05, + "loss": 5.2869, + "step": 6505 + }, + { + "epoch": 0.6273866923818708, + "grad_norm": 1.3406908512115479, + "learning_rate": 1.5266978927223607e-05, + "loss": 5.2887, + "step": 6506 + }, + { + "epoch": 0.6274831243972999, + "grad_norm": 1.8854799270629883, + "learning_rate": 1.5260003173547985e-05, + "loss": 5.4449, + "step": 6507 + }, + { + "epoch": 0.627579556412729, + "grad_norm": 2.187152624130249, + "learning_rate": 1.525302831379729e-05, + "loss": 5.3021, + "step": 6508 + }, + { + "epoch": 0.6276759884281582, + "grad_norm": 1.725217580795288, + "learning_rate": 1.5246054348611667e-05, + "loss": 5.419, + "step": 6509 + }, + { + "epoch": 0.6277724204435873, + "grad_norm": 2.0472159385681152, + "learning_rate": 1.5239081278631157e-05, + "loss": 5.5092, + "step": 6510 + }, + { + "epoch": 0.6278688524590164, + "grad_norm": 2.1798009872436523, + "learning_rate": 1.5232109104495762e-05, + "loss": 5.3906, + "step": 6511 + }, + { + "epoch": 0.6279652844744456, + "grad_norm": 1.6721842288970947, + "learning_rate": 1.5225137826845371e-05, + "loss": 5.5192, + "step": 6512 + }, + { + "epoch": 0.6280617164898746, + "grad_norm": 1.2797894477844238, + "learning_rate": 1.5218167446319792e-05, + "loss": 5.4865, + "step": 6513 + }, + { + "epoch": 0.6281581485053037, + "grad_norm": 1.8843780755996704, + "learning_rate": 1.5211197963558771e-05, + "loss": 5.4243, + "step": 6514 + }, + { + "epoch": 0.6282545805207329, + "grad_norm": 1.6385908126831055, + "learning_rate": 1.5204229379201956e-05, + "loss": 5.4475, + "step": 6515 + }, + { + "epoch": 0.628351012536162, + "grad_norm": 1.569313406944275, + "learning_rate": 1.5197261693888903e-05, + "loss": 5.478, + "step": 6516 + }, + { + "epoch": 0.6284474445515912, + "grad_norm": 1.9421712160110474, + "learning_rate": 1.5190294908259117e-05, + "loss": 5.3808, + "step": 6517 + }, + { + "epoch": 0.6285438765670203, + "grad_norm": 1.9806369543075562, + "learning_rate": 1.5183329022951976e-05, + "loss": 5.2594, + "step": 6518 + }, + { + "epoch": 0.6286403085824493, + "grad_norm": 1.8274942636489868, + "learning_rate": 1.5176364038606827e-05, + "loss": 5.4537, + "step": 6519 + }, + { + "epoch": 0.6287367405978785, + "grad_norm": 2.0055620670318604, + "learning_rate": 1.5169399955862893e-05, + "loss": 5.426, + "step": 6520 + }, + { + "epoch": 0.6288331726133076, + "grad_norm": 1.5526260137557983, + "learning_rate": 1.5162436775359329e-05, + "loss": 5.468, + "step": 6521 + }, + { + "epoch": 0.6289296046287367, + "grad_norm": 1.2271679639816284, + "learning_rate": 1.5155474497735211e-05, + "loss": 5.5024, + "step": 6522 + }, + { + "epoch": 0.6290260366441659, + "grad_norm": 1.855043888092041, + "learning_rate": 1.5148513123629532e-05, + "loss": 5.5569, + "step": 6523 + }, + { + "epoch": 0.629122468659595, + "grad_norm": 2.5637149810791016, + "learning_rate": 1.5141552653681181e-05, + "loss": 5.3307, + "step": 6524 + }, + { + "epoch": 0.6292189006750241, + "grad_norm": 1.837220549583435, + "learning_rate": 1.5134593088529005e-05, + "loss": 5.2807, + "step": 6525 + }, + { + "epoch": 0.6293153326904533, + "grad_norm": 1.4816912412643433, + "learning_rate": 1.5127634428811734e-05, + "loss": 5.4548, + "step": 6526 + }, + { + "epoch": 0.6294117647058823, + "grad_norm": 2.380789279937744, + "learning_rate": 1.5120676675168016e-05, + "loss": 5.3891, + "step": 6527 + }, + { + "epoch": 0.6295081967213115, + "grad_norm": 2.1898014545440674, + "learning_rate": 1.5113719828236438e-05, + "loss": 5.3704, + "step": 6528 + }, + { + "epoch": 0.6296046287367406, + "grad_norm": 1.17307710647583, + "learning_rate": 1.5106763888655478e-05, + "loss": 5.4755, + "step": 6529 + }, + { + "epoch": 0.6297010607521697, + "grad_norm": 1.730305790901184, + "learning_rate": 1.5099808857063567e-05, + "loss": 5.4476, + "step": 6530 + }, + { + "epoch": 0.6297974927675989, + "grad_norm": 1.5957461595535278, + "learning_rate": 1.509285473409901e-05, + "loss": 5.408, + "step": 6531 + }, + { + "epoch": 0.629893924783028, + "grad_norm": 1.4219510555267334, + "learning_rate": 1.508590152040004e-05, + "loss": 5.238, + "step": 6532 + }, + { + "epoch": 0.629990356798457, + "grad_norm": 1.5620317459106445, + "learning_rate": 1.5078949216604837e-05, + "loss": 5.124, + "step": 6533 + }, + { + "epoch": 0.6300867888138862, + "grad_norm": 1.5534536838531494, + "learning_rate": 1.5071997823351464e-05, + "loss": 5.3671, + "step": 6534 + }, + { + "epoch": 0.6301832208293153, + "grad_norm": 1.589764952659607, + "learning_rate": 1.5065047341277905e-05, + "loss": 5.4325, + "step": 6535 + }, + { + "epoch": 0.6302796528447444, + "grad_norm": 1.4474903345108032, + "learning_rate": 1.5058097771022078e-05, + "loss": 5.4796, + "step": 6536 + }, + { + "epoch": 0.6303760848601736, + "grad_norm": 1.3656198978424072, + "learning_rate": 1.5051149113221803e-05, + "loss": 5.4626, + "step": 6537 + }, + { + "epoch": 0.6304725168756027, + "grad_norm": 1.2303439378738403, + "learning_rate": 1.504420136851481e-05, + "loss": 5.4187, + "step": 6538 + }, + { + "epoch": 0.6305689488910319, + "grad_norm": 1.3147395849227905, + "learning_rate": 1.5037254537538767e-05, + "loss": 5.5328, + "step": 6539 + }, + { + "epoch": 0.630665380906461, + "grad_norm": 1.5953218936920166, + "learning_rate": 1.5030308620931233e-05, + "loss": 5.4185, + "step": 6540 + }, + { + "epoch": 0.63076181292189, + "grad_norm": 1.3850622177124023, + "learning_rate": 1.5023363619329715e-05, + "loss": 5.4267, + "step": 6541 + }, + { + "epoch": 0.6308582449373192, + "grad_norm": 1.384965419769287, + "learning_rate": 1.5016419533371595e-05, + "loss": 5.4485, + "step": 6542 + }, + { + "epoch": 0.6309546769527483, + "grad_norm": 1.3974838256835938, + "learning_rate": 1.5009476363694206e-05, + "loss": 5.637, + "step": 6543 + }, + { + "epoch": 0.6310511089681774, + "grad_norm": 1.3221979141235352, + "learning_rate": 1.5002534110934779e-05, + "loss": 5.5591, + "step": 6544 + }, + { + "epoch": 0.6311475409836066, + "grad_norm": 1.8492226600646973, + "learning_rate": 1.4995592775730472e-05, + "loss": 5.1641, + "step": 6545 + }, + { + "epoch": 0.6312439729990357, + "grad_norm": 1.5296130180358887, + "learning_rate": 1.4988652358718336e-05, + "loss": 5.3759, + "step": 6546 + }, + { + "epoch": 0.6313404050144648, + "grad_norm": 1.4027665853500366, + "learning_rate": 1.4981712860535375e-05, + "loss": 5.3509, + "step": 6547 + }, + { + "epoch": 0.631436837029894, + "grad_norm": 1.6090564727783203, + "learning_rate": 1.4974774281818476e-05, + "loss": 5.1973, + "step": 6548 + }, + { + "epoch": 0.631533269045323, + "grad_norm": 1.4029812812805176, + "learning_rate": 1.4967836623204445e-05, + "loss": 5.5104, + "step": 6549 + }, + { + "epoch": 0.6316297010607522, + "grad_norm": 1.67266047000885, + "learning_rate": 1.4960899885330033e-05, + "loss": 5.5064, + "step": 6550 + }, + { + "epoch": 0.6317261330761813, + "grad_norm": 1.5109072923660278, + "learning_rate": 1.495396406883186e-05, + "loss": 5.3687, + "step": 6551 + }, + { + "epoch": 0.6318225650916104, + "grad_norm": 1.449612021446228, + "learning_rate": 1.4947029174346511e-05, + "loss": 5.4579, + "step": 6552 + }, + { + "epoch": 0.6319189971070396, + "grad_norm": 1.5278459787368774, + "learning_rate": 1.4940095202510454e-05, + "loss": 5.2693, + "step": 6553 + }, + { + "epoch": 0.6320154291224687, + "grad_norm": 1.516418218612671, + "learning_rate": 1.4933162153960065e-05, + "loss": 5.4342, + "step": 6554 + }, + { + "epoch": 0.6321118611378977, + "grad_norm": 1.8197999000549316, + "learning_rate": 1.4926230029331672e-05, + "loss": 5.4288, + "step": 6555 + }, + { + "epoch": 0.6322082931533269, + "grad_norm": 1.5804927349090576, + "learning_rate": 1.491929882926149e-05, + "loss": 5.2501, + "step": 6556 + }, + { + "epoch": 0.632304725168756, + "grad_norm": 1.5845649242401123, + "learning_rate": 1.4912368554385648e-05, + "loss": 5.5095, + "step": 6557 + }, + { + "epoch": 0.6324011571841851, + "grad_norm": 2.4055192470550537, + "learning_rate": 1.490543920534021e-05, + "loss": 5.5606, + "step": 6558 + }, + { + "epoch": 0.6324975891996143, + "grad_norm": 2.3915326595306396, + "learning_rate": 1.4898510782761133e-05, + "loss": 5.5327, + "step": 6559 + }, + { + "epoch": 0.6325940212150434, + "grad_norm": 1.7815419435501099, + "learning_rate": 1.48915832872843e-05, + "loss": 5.3868, + "step": 6560 + }, + { + "epoch": 0.6326904532304726, + "grad_norm": 1.424340844154358, + "learning_rate": 1.4884656719545515e-05, + "loss": 5.3476, + "step": 6561 + }, + { + "epoch": 0.6327868852459017, + "grad_norm": 2.178927183151245, + "learning_rate": 1.4877731080180487e-05, + "loss": 5.4267, + "step": 6562 + }, + { + "epoch": 0.6328833172613307, + "grad_norm": 1.7637895345687866, + "learning_rate": 1.4870806369824847e-05, + "loss": 5.2345, + "step": 6563 + }, + { + "epoch": 0.6329797492767599, + "grad_norm": 1.6176906824111938, + "learning_rate": 1.4863882589114125e-05, + "loss": 5.4906, + "step": 6564 + }, + { + "epoch": 0.633076181292189, + "grad_norm": 1.4829843044281006, + "learning_rate": 1.4856959738683784e-05, + "loss": 5.549, + "step": 6565 + }, + { + "epoch": 0.6331726133076181, + "grad_norm": 2.352847099304199, + "learning_rate": 1.4850037819169193e-05, + "loss": 5.2032, + "step": 6566 + }, + { + "epoch": 0.6332690453230473, + "grad_norm": 2.7382404804229736, + "learning_rate": 1.4843116831205644e-05, + "loss": 5.2849, + "step": 6567 + }, + { + "epoch": 0.6333654773384764, + "grad_norm": 1.695103406906128, + "learning_rate": 1.483619677542832e-05, + "loss": 5.3095, + "step": 6568 + }, + { + "epoch": 0.6334619093539055, + "grad_norm": 1.79180109500885, + "learning_rate": 1.4829277652472358e-05, + "loss": 5.1985, + "step": 6569 + }, + { + "epoch": 0.6335583413693346, + "grad_norm": 2.163712978363037, + "learning_rate": 1.4822359462972776e-05, + "loss": 5.381, + "step": 6570 + }, + { + "epoch": 0.6336547733847637, + "grad_norm": 1.935970664024353, + "learning_rate": 1.4815442207564506e-05, + "loss": 5.2986, + "step": 6571 + }, + { + "epoch": 0.6337512054001929, + "grad_norm": 1.8368762731552124, + "learning_rate": 1.4808525886882424e-05, + "loss": 5.3877, + "step": 6572 + }, + { + "epoch": 0.633847637415622, + "grad_norm": 1.5711045265197754, + "learning_rate": 1.4801610501561281e-05, + "loss": 5.4105, + "step": 6573 + }, + { + "epoch": 0.6339440694310511, + "grad_norm": 1.4127174615859985, + "learning_rate": 1.4794696052235785e-05, + "loss": 5.1792, + "step": 6574 + }, + { + "epoch": 0.6340405014464803, + "grad_norm": 1.696278691291809, + "learning_rate": 1.4787782539540523e-05, + "loss": 5.441, + "step": 6575 + }, + { + "epoch": 0.6341369334619094, + "grad_norm": 1.7678734064102173, + "learning_rate": 1.4780869964110003e-05, + "loss": 5.2366, + "step": 6576 + }, + { + "epoch": 0.6342333654773384, + "grad_norm": 1.8128236532211304, + "learning_rate": 1.4773958326578668e-05, + "loss": 5.3419, + "step": 6577 + }, + { + "epoch": 0.6343297974927676, + "grad_norm": 1.3404176235198975, + "learning_rate": 1.4767047627580854e-05, + "loss": 5.5458, + "step": 6578 + }, + { + "epoch": 0.6344262295081967, + "grad_norm": 1.4969855546951294, + "learning_rate": 1.4760137867750801e-05, + "loss": 5.3993, + "step": 6579 + }, + { + "epoch": 0.6345226615236258, + "grad_norm": 1.7059202194213867, + "learning_rate": 1.4753229047722703e-05, + "loss": 5.3538, + "step": 6580 + }, + { + "epoch": 0.634619093539055, + "grad_norm": 1.6331156492233276, + "learning_rate": 1.4746321168130628e-05, + "loss": 5.3742, + "step": 6581 + }, + { + "epoch": 0.6347155255544841, + "grad_norm": 1.645334243774414, + "learning_rate": 1.4739414229608572e-05, + "loss": 5.4391, + "step": 6582 + }, + { + "epoch": 0.6348119575699133, + "grad_norm": 1.5522898435592651, + "learning_rate": 1.4732508232790451e-05, + "loss": 5.4801, + "step": 6583 + }, + { + "epoch": 0.6349083895853423, + "grad_norm": 1.7403534650802612, + "learning_rate": 1.4725603178310088e-05, + "loss": 5.5741, + "step": 6584 + }, + { + "epoch": 0.6350048216007714, + "grad_norm": 1.8295011520385742, + "learning_rate": 1.471869906680122e-05, + "loss": 5.2206, + "step": 6585 + }, + { + "epoch": 0.6351012536162006, + "grad_norm": 2.5143911838531494, + "learning_rate": 1.4711795898897501e-05, + "loss": 5.4777, + "step": 6586 + }, + { + "epoch": 0.6351976856316297, + "grad_norm": 1.9091746807098389, + "learning_rate": 1.4704893675232486e-05, + "loss": 5.4613, + "step": 6587 + }, + { + "epoch": 0.6352941176470588, + "grad_norm": 2.3034825325012207, + "learning_rate": 1.4697992396439658e-05, + "loss": 5.2434, + "step": 6588 + }, + { + "epoch": 0.635390549662488, + "grad_norm": 1.5759515762329102, + "learning_rate": 1.4691092063152417e-05, + "loss": 5.4645, + "step": 6589 + }, + { + "epoch": 0.6354869816779171, + "grad_norm": 1.6588753461837769, + "learning_rate": 1.4684192676004045e-05, + "loss": 5.4472, + "step": 6590 + }, + { + "epoch": 0.6355834136933461, + "grad_norm": 1.4451946020126343, + "learning_rate": 1.4677294235627784e-05, + "loss": 5.4268, + "step": 6591 + }, + { + "epoch": 0.6356798457087753, + "grad_norm": 1.4911231994628906, + "learning_rate": 1.4670396742656753e-05, + "loss": 5.6432, + "step": 6592 + }, + { + "epoch": 0.6357762777242044, + "grad_norm": 1.6357476711273193, + "learning_rate": 1.4663500197723985e-05, + "loss": 5.5508, + "step": 6593 + }, + { + "epoch": 0.6358727097396336, + "grad_norm": 1.9197920560836792, + "learning_rate": 1.4656604601462453e-05, + "loss": 5.5146, + "step": 6594 + }, + { + "epoch": 0.6359691417550627, + "grad_norm": 2.0131514072418213, + "learning_rate": 1.4649709954505014e-05, + "loss": 5.4773, + "step": 6595 + }, + { + "epoch": 0.6360655737704918, + "grad_norm": 1.8240416049957275, + "learning_rate": 1.4642816257484465e-05, + "loss": 5.5244, + "step": 6596 + }, + { + "epoch": 0.636162005785921, + "grad_norm": 1.7143973112106323, + "learning_rate": 1.4635923511033494e-05, + "loss": 5.3517, + "step": 6597 + }, + { + "epoch": 0.63625843780135, + "grad_norm": 1.7343049049377441, + "learning_rate": 1.4629031715784697e-05, + "loss": 5.3811, + "step": 6598 + }, + { + "epoch": 0.6363548698167791, + "grad_norm": 1.4973729848861694, + "learning_rate": 1.4622140872370613e-05, + "loss": 5.3703, + "step": 6599 + }, + { + "epoch": 0.6364513018322083, + "grad_norm": 1.3107988834381104, + "learning_rate": 1.4615250981423668e-05, + "loss": 5.2268, + "step": 6600 + }, + { + "epoch": 0.6365477338476374, + "grad_norm": 1.2797225713729858, + "learning_rate": 1.4608362043576193e-05, + "loss": 5.4841, + "step": 6601 + }, + { + "epoch": 0.6366441658630665, + "grad_norm": 1.3365650177001953, + "learning_rate": 1.4601474059460474e-05, + "loss": 5.4425, + "step": 6602 + }, + { + "epoch": 0.6367405978784957, + "grad_norm": 1.4855409860610962, + "learning_rate": 1.459458702970866e-05, + "loss": 5.4946, + "step": 6603 + }, + { + "epoch": 0.6368370298939248, + "grad_norm": 1.472511649131775, + "learning_rate": 1.458770095495284e-05, + "loss": 5.4339, + "step": 6604 + }, + { + "epoch": 0.636933461909354, + "grad_norm": 1.2025728225708008, + "learning_rate": 1.4580815835825013e-05, + "loss": 5.2982, + "step": 6605 + }, + { + "epoch": 0.637029893924783, + "grad_norm": 1.4125866889953613, + "learning_rate": 1.4573931672957086e-05, + "loss": 5.4139, + "step": 6606 + }, + { + "epoch": 0.6371263259402121, + "grad_norm": 1.6839008331298828, + "learning_rate": 1.4567048466980876e-05, + "loss": 5.4263, + "step": 6607 + }, + { + "epoch": 0.6372227579556413, + "grad_norm": 1.3362162113189697, + "learning_rate": 1.4560166218528126e-05, + "loss": 5.3069, + "step": 6608 + }, + { + "epoch": 0.6373191899710704, + "grad_norm": 1.3990992307662964, + "learning_rate": 1.4553284928230452e-05, + "loss": 5.482, + "step": 6609 + }, + { + "epoch": 0.6374156219864995, + "grad_norm": 1.2178795337677002, + "learning_rate": 1.4546404596719449e-05, + "loss": 5.4345, + "step": 6610 + }, + { + "epoch": 0.6375120540019287, + "grad_norm": 1.5265252590179443, + "learning_rate": 1.4539525224626555e-05, + "loss": 5.201, + "step": 6611 + }, + { + "epoch": 0.6376084860173578, + "grad_norm": 1.403456449508667, + "learning_rate": 1.4532646812583161e-05, + "loss": 5.3863, + "step": 6612 + }, + { + "epoch": 0.6377049180327868, + "grad_norm": 1.1334820985794067, + "learning_rate": 1.4525769361220562e-05, + "loss": 5.4904, + "step": 6613 + }, + { + "epoch": 0.637801350048216, + "grad_norm": 1.3964533805847168, + "learning_rate": 1.451889287116996e-05, + "loss": 5.2714, + "step": 6614 + }, + { + "epoch": 0.6378977820636451, + "grad_norm": 1.1649833917617798, + "learning_rate": 1.4512017343062468e-05, + "loss": 5.3878, + "step": 6615 + }, + { + "epoch": 0.6379942140790743, + "grad_norm": 1.3067797422409058, + "learning_rate": 1.4505142777529113e-05, + "loss": 5.3665, + "step": 6616 + }, + { + "epoch": 0.6380906460945034, + "grad_norm": 1.751410722732544, + "learning_rate": 1.449826917520084e-05, + "loss": 5.4012, + "step": 6617 + }, + { + "epoch": 0.6381870781099325, + "grad_norm": 1.5930885076522827, + "learning_rate": 1.4491396536708496e-05, + "loss": 5.443, + "step": 6618 + }, + { + "epoch": 0.6382835101253617, + "grad_norm": 1.6730897426605225, + "learning_rate": 1.4484524862682856e-05, + "loss": 5.5321, + "step": 6619 + }, + { + "epoch": 0.6383799421407907, + "grad_norm": 1.804298758506775, + "learning_rate": 1.4477654153754558e-05, + "loss": 5.4894, + "step": 6620 + }, + { + "epoch": 0.6384763741562198, + "grad_norm": 1.4624581336975098, + "learning_rate": 1.4470784410554234e-05, + "loss": 5.4552, + "step": 6621 + }, + { + "epoch": 0.638572806171649, + "grad_norm": 1.6699680089950562, + "learning_rate": 1.4463915633712345e-05, + "loss": 5.2864, + "step": 6622 + }, + { + "epoch": 0.6386692381870781, + "grad_norm": 1.8783069849014282, + "learning_rate": 1.4457047823859313e-05, + "loss": 5.4662, + "step": 6623 + }, + { + "epoch": 0.6387656702025072, + "grad_norm": 1.4265953302383423, + "learning_rate": 1.4450180981625456e-05, + "loss": 5.4224, + "step": 6624 + }, + { + "epoch": 0.6388621022179364, + "grad_norm": 1.3292791843414307, + "learning_rate": 1.4443315107641008e-05, + "loss": 5.4927, + "step": 6625 + }, + { + "epoch": 0.6389585342333655, + "grad_norm": 1.9278613328933716, + "learning_rate": 1.4436450202536094e-05, + "loss": 5.5226, + "step": 6626 + }, + { + "epoch": 0.6390549662487947, + "grad_norm": 1.5659582614898682, + "learning_rate": 1.4429586266940795e-05, + "loss": 5.4999, + "step": 6627 + }, + { + "epoch": 0.6391513982642237, + "grad_norm": 1.1197929382324219, + "learning_rate": 1.442272330148504e-05, + "loss": 5.5197, + "step": 6628 + }, + { + "epoch": 0.6392478302796528, + "grad_norm": 1.4192955493927002, + "learning_rate": 1.4415861306798739e-05, + "loss": 5.4009, + "step": 6629 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 1.2147136926651, + "learning_rate": 1.4409000283511654e-05, + "loss": 5.4452, + "step": 6630 + }, + { + "epoch": 0.6394406943105111, + "grad_norm": 1.412916660308838, + "learning_rate": 1.4402140232253486e-05, + "loss": 5.5274, + "step": 6631 + }, + { + "epoch": 0.6395371263259402, + "grad_norm": 1.7003049850463867, + "learning_rate": 1.4395281153653842e-05, + "loss": 5.3069, + "step": 6632 + }, + { + "epoch": 0.6396335583413694, + "grad_norm": 1.3785130977630615, + "learning_rate": 1.4388423048342253e-05, + "loss": 5.5031, + "step": 6633 + }, + { + "epoch": 0.6397299903567985, + "grad_norm": 1.1931997537612915, + "learning_rate": 1.4381565916948115e-05, + "loss": 5.3461, + "step": 6634 + }, + { + "epoch": 0.6398264223722275, + "grad_norm": 1.4184505939483643, + "learning_rate": 1.437470976010081e-05, + "loss": 5.4076, + "step": 6635 + }, + { + "epoch": 0.6399228543876567, + "grad_norm": 1.664989948272705, + "learning_rate": 1.4367854578429554e-05, + "loss": 5.2205, + "step": 6636 + }, + { + "epoch": 0.6400192864030858, + "grad_norm": 2.0000813007354736, + "learning_rate": 1.4361000372563519e-05, + "loss": 5.4653, + "step": 6637 + }, + { + "epoch": 0.640115718418515, + "grad_norm": 1.6029504537582397, + "learning_rate": 1.4354147143131774e-05, + "loss": 5.3587, + "step": 6638 + }, + { + "epoch": 0.6402121504339441, + "grad_norm": 1.334256887435913, + "learning_rate": 1.4347294890763303e-05, + "loss": 5.4209, + "step": 6639 + }, + { + "epoch": 0.6403085824493732, + "grad_norm": 1.4721624851226807, + "learning_rate": 1.4340443616086995e-05, + "loss": 5.3881, + "step": 6640 + }, + { + "epoch": 0.6404050144648024, + "grad_norm": 1.5077892541885376, + "learning_rate": 1.4333593319731653e-05, + "loss": 5.1705, + "step": 6641 + }, + { + "epoch": 0.6405014464802314, + "grad_norm": 1.305518388748169, + "learning_rate": 1.4326744002325986e-05, + "loss": 5.3747, + "step": 6642 + }, + { + "epoch": 0.6405978784956605, + "grad_norm": 1.4438257217407227, + "learning_rate": 1.4319895664498622e-05, + "loss": 5.3633, + "step": 6643 + }, + { + "epoch": 0.6406943105110897, + "grad_norm": 1.6183232069015503, + "learning_rate": 1.4313048306878097e-05, + "loss": 5.3669, + "step": 6644 + }, + { + "epoch": 0.6407907425265188, + "grad_norm": 1.2761632204055786, + "learning_rate": 1.4306201930092822e-05, + "loss": 5.4174, + "step": 6645 + }, + { + "epoch": 0.6408871745419479, + "grad_norm": 1.9330883026123047, + "learning_rate": 1.4299356534771194e-05, + "loss": 5.5645, + "step": 6646 + }, + { + "epoch": 0.6409836065573771, + "grad_norm": 1.6389813423156738, + "learning_rate": 1.429251212154144e-05, + "loss": 5.3035, + "step": 6647 + }, + { + "epoch": 0.6410800385728062, + "grad_norm": 1.1374655961990356, + "learning_rate": 1.428566869103175e-05, + "loss": 5.415, + "step": 6648 + }, + { + "epoch": 0.6411764705882353, + "grad_norm": 1.657650351524353, + "learning_rate": 1.4278826243870197e-05, + "loss": 5.4359, + "step": 6649 + }, + { + "epoch": 0.6412729026036644, + "grad_norm": 1.4911959171295166, + "learning_rate": 1.4271984780684778e-05, + "loss": 5.3413, + "step": 6650 + }, + { + "epoch": 0.6413693346190935, + "grad_norm": 1.3385456800460815, + "learning_rate": 1.426514430210339e-05, + "loss": 5.4478, + "step": 6651 + }, + { + "epoch": 0.6414657666345227, + "grad_norm": 1.348460078239441, + "learning_rate": 1.4258304808753853e-05, + "loss": 5.3406, + "step": 6652 + }, + { + "epoch": 0.6415621986499518, + "grad_norm": 1.8438873291015625, + "learning_rate": 1.4251466301263865e-05, + "loss": 5.4037, + "step": 6653 + }, + { + "epoch": 0.6416586306653809, + "grad_norm": 1.6144249439239502, + "learning_rate": 1.4244628780261088e-05, + "loss": 5.3494, + "step": 6654 + }, + { + "epoch": 0.6417550626808101, + "grad_norm": 1.8339627981185913, + "learning_rate": 1.4237792246373034e-05, + "loss": 5.5381, + "step": 6655 + }, + { + "epoch": 0.6418514946962391, + "grad_norm": 1.759019136428833, + "learning_rate": 1.423095670022716e-05, + "loss": 5.31, + "step": 6656 + }, + { + "epoch": 0.6419479267116682, + "grad_norm": 1.848312258720398, + "learning_rate": 1.4224122142450826e-05, + "loss": 5.1765, + "step": 6657 + }, + { + "epoch": 0.6420443587270974, + "grad_norm": 1.2782378196716309, + "learning_rate": 1.4217288573671301e-05, + "loss": 5.2106, + "step": 6658 + }, + { + "epoch": 0.6421407907425265, + "grad_norm": 1.5085561275482178, + "learning_rate": 1.4210455994515758e-05, + "loss": 5.2934, + "step": 6659 + }, + { + "epoch": 0.6422372227579557, + "grad_norm": 1.5969278812408447, + "learning_rate": 1.4203624405611284e-05, + "loss": 5.5019, + "step": 6660 + }, + { + "epoch": 0.6423336547733848, + "grad_norm": 1.5527185201644897, + "learning_rate": 1.4196793807584873e-05, + "loss": 5.4469, + "step": 6661 + }, + { + "epoch": 0.6424300867888139, + "grad_norm": 1.3767038583755493, + "learning_rate": 1.4189964201063432e-05, + "loss": 5.3372, + "step": 6662 + }, + { + "epoch": 0.642526518804243, + "grad_norm": 1.1810476779937744, + "learning_rate": 1.418313558667378e-05, + "loss": 5.2045, + "step": 6663 + }, + { + "epoch": 0.6426229508196721, + "grad_norm": 1.8571311235427856, + "learning_rate": 1.4176307965042612e-05, + "loss": 5.4119, + "step": 6664 + }, + { + "epoch": 0.6427193828351012, + "grad_norm": 2.0127768516540527, + "learning_rate": 1.4169481336796597e-05, + "loss": 5.4523, + "step": 6665 + }, + { + "epoch": 0.6428158148505304, + "grad_norm": 1.5710617303848267, + "learning_rate": 1.4162655702562245e-05, + "loss": 5.3312, + "step": 6666 + }, + { + "epoch": 0.6429122468659595, + "grad_norm": 1.3657495975494385, + "learning_rate": 1.4155831062966011e-05, + "loss": 5.3924, + "step": 6667 + }, + { + "epoch": 0.6430086788813886, + "grad_norm": 1.675489902496338, + "learning_rate": 1.4149007418634256e-05, + "loss": 5.3461, + "step": 6668 + }, + { + "epoch": 0.6431051108968178, + "grad_norm": 1.558320164680481, + "learning_rate": 1.4142184770193257e-05, + "loss": 5.3604, + "step": 6669 + }, + { + "epoch": 0.6432015429122468, + "grad_norm": 1.7386127710342407, + "learning_rate": 1.4135363118269151e-05, + "loss": 5.455, + "step": 6670 + }, + { + "epoch": 0.643297974927676, + "grad_norm": 1.2201170921325684, + "learning_rate": 1.412854246348807e-05, + "loss": 5.4392, + "step": 6671 + }, + { + "epoch": 0.6433944069431051, + "grad_norm": 1.4156211614608765, + "learning_rate": 1.4121722806475962e-05, + "loss": 5.4216, + "step": 6672 + }, + { + "epoch": 0.6434908389585342, + "grad_norm": 1.4438058137893677, + "learning_rate": 1.4114904147858764e-05, + "loss": 5.3731, + "step": 6673 + }, + { + "epoch": 0.6435872709739634, + "grad_norm": 1.4220514297485352, + "learning_rate": 1.4108086488262257e-05, + "loss": 5.3265, + "step": 6674 + }, + { + "epoch": 0.6436837029893925, + "grad_norm": 1.6410397291183472, + "learning_rate": 1.4101269828312164e-05, + "loss": 5.3507, + "step": 6675 + }, + { + "epoch": 0.6437801350048216, + "grad_norm": 2.0364856719970703, + "learning_rate": 1.4094454168634114e-05, + "loss": 5.3373, + "step": 6676 + }, + { + "epoch": 0.6438765670202508, + "grad_norm": 1.7365431785583496, + "learning_rate": 1.408763950985364e-05, + "loss": 5.487, + "step": 6677 + }, + { + "epoch": 0.6439729990356798, + "grad_norm": 1.387023687362671, + "learning_rate": 1.4080825852596174e-05, + "loss": 5.4171, + "step": 6678 + }, + { + "epoch": 0.6440694310511089, + "grad_norm": 1.9149247407913208, + "learning_rate": 1.407401319748709e-05, + "loss": 5.4227, + "step": 6679 + }, + { + "epoch": 0.6441658630665381, + "grad_norm": 2.1912403106689453, + "learning_rate": 1.4067201545151612e-05, + "loss": 5.1643, + "step": 6680 + }, + { + "epoch": 0.6442622950819672, + "grad_norm": 1.926045536994934, + "learning_rate": 1.406039089621492e-05, + "loss": 5.3327, + "step": 6681 + }, + { + "epoch": 0.6443587270973964, + "grad_norm": 1.802456259727478, + "learning_rate": 1.4053581251302089e-05, + "loss": 5.2714, + "step": 6682 + }, + { + "epoch": 0.6444551591128255, + "grad_norm": 2.019982099533081, + "learning_rate": 1.4046772611038097e-05, + "loss": 5.4721, + "step": 6683 + }, + { + "epoch": 0.6445515911282546, + "grad_norm": 1.2033699750900269, + "learning_rate": 1.4039964976047835e-05, + "loss": 5.372, + "step": 6684 + }, + { + "epoch": 0.6446480231436837, + "grad_norm": 1.3699731826782227, + "learning_rate": 1.4033158346956096e-05, + "loss": 5.3791, + "step": 6685 + }, + { + "epoch": 0.6447444551591128, + "grad_norm": 1.2656116485595703, + "learning_rate": 1.4026352724387586e-05, + "loss": 5.2852, + "step": 6686 + }, + { + "epoch": 0.6448408871745419, + "grad_norm": 1.4117543697357178, + "learning_rate": 1.4019548108966918e-05, + "loss": 5.4669, + "step": 6687 + }, + { + "epoch": 0.6449373191899711, + "grad_norm": 1.1999958753585815, + "learning_rate": 1.4012744501318619e-05, + "loss": 5.3859, + "step": 6688 + }, + { + "epoch": 0.6450337512054002, + "grad_norm": 1.136569619178772, + "learning_rate": 1.4005941902067088e-05, + "loss": 5.3944, + "step": 6689 + }, + { + "epoch": 0.6451301832208293, + "grad_norm": 1.2129696607589722, + "learning_rate": 1.3999140311836695e-05, + "loss": 5.2742, + "step": 6690 + }, + { + "epoch": 0.6452266152362585, + "grad_norm": 1.308272361755371, + "learning_rate": 1.3992339731251658e-05, + "loss": 5.4028, + "step": 6691 + }, + { + "epoch": 0.6453230472516875, + "grad_norm": 1.4360792636871338, + "learning_rate": 1.3985540160936127e-05, + "loss": 5.4543, + "step": 6692 + }, + { + "epoch": 0.6454194792671167, + "grad_norm": 1.2449771165847778, + "learning_rate": 1.3978741601514167e-05, + "loss": 5.4366, + "step": 6693 + }, + { + "epoch": 0.6455159112825458, + "grad_norm": 1.058129072189331, + "learning_rate": 1.3971944053609736e-05, + "loss": 5.383, + "step": 6694 + }, + { + "epoch": 0.6456123432979749, + "grad_norm": 1.1819603443145752, + "learning_rate": 1.3965147517846711e-05, + "loss": 5.273, + "step": 6695 + }, + { + "epoch": 0.6457087753134041, + "grad_norm": 1.1181306838989258, + "learning_rate": 1.395835199484887e-05, + "loss": 5.424, + "step": 6696 + }, + { + "epoch": 0.6458052073288332, + "grad_norm": 1.256018042564392, + "learning_rate": 1.3951557485239874e-05, + "loss": 5.4079, + "step": 6697 + }, + { + "epoch": 0.6459016393442623, + "grad_norm": 1.3549076318740845, + "learning_rate": 1.3944763989643356e-05, + "loss": 5.1885, + "step": 6698 + }, + { + "epoch": 0.6459980713596915, + "grad_norm": 1.5633344650268555, + "learning_rate": 1.3937971508682779e-05, + "loss": 5.4467, + "step": 6699 + }, + { + "epoch": 0.6460945033751205, + "grad_norm": 1.5601332187652588, + "learning_rate": 1.3931180042981562e-05, + "loss": 5.4336, + "step": 6700 + }, + { + "epoch": 0.6461909353905496, + "grad_norm": 1.5711610317230225, + "learning_rate": 1.392438959316302e-05, + "loss": 5.312, + "step": 6701 + }, + { + "epoch": 0.6462873674059788, + "grad_norm": 1.2944369316101074, + "learning_rate": 1.391760015985037e-05, + "loss": 5.2509, + "step": 6702 + }, + { + "epoch": 0.6463837994214079, + "grad_norm": 1.4722301959991455, + "learning_rate": 1.3910811743666732e-05, + "loss": 5.4593, + "step": 6703 + }, + { + "epoch": 0.6464802314368371, + "grad_norm": 1.7245351076126099, + "learning_rate": 1.3904024345235147e-05, + "loss": 5.3068, + "step": 6704 + }, + { + "epoch": 0.6465766634522662, + "grad_norm": 1.377807378768921, + "learning_rate": 1.389723796517855e-05, + "loss": 5.4504, + "step": 6705 + }, + { + "epoch": 0.6466730954676952, + "grad_norm": 1.2968748807907104, + "learning_rate": 1.3890452604119786e-05, + "loss": 5.3102, + "step": 6706 + }, + { + "epoch": 0.6467695274831244, + "grad_norm": 1.4869754314422607, + "learning_rate": 1.388366826268162e-05, + "loss": 5.2933, + "step": 6707 + }, + { + "epoch": 0.6468659594985535, + "grad_norm": 1.3837848901748657, + "learning_rate": 1.387688494148668e-05, + "loss": 5.3727, + "step": 6708 + }, + { + "epoch": 0.6469623915139826, + "grad_norm": 1.474521517753601, + "learning_rate": 1.3870102641157568e-05, + "loss": 5.414, + "step": 6709 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 1.613924503326416, + "learning_rate": 1.3863321362316728e-05, + "loss": 5.4286, + "step": 6710 + }, + { + "epoch": 0.6471552555448409, + "grad_norm": 1.9593651294708252, + "learning_rate": 1.3856541105586545e-05, + "loss": 5.4728, + "step": 6711 + }, + { + "epoch": 0.64725168756027, + "grad_norm": 1.5664699077606201, + "learning_rate": 1.3849761871589306e-05, + "loss": 5.4789, + "step": 6712 + }, + { + "epoch": 0.6473481195756992, + "grad_norm": 1.679974913597107, + "learning_rate": 1.3842983660947206e-05, + "loss": 5.5056, + "step": 6713 + }, + { + "epoch": 0.6474445515911282, + "grad_norm": 1.6783480644226074, + "learning_rate": 1.3836206474282315e-05, + "loss": 5.4657, + "step": 6714 + }, + { + "epoch": 0.6475409836065574, + "grad_norm": 1.507644534111023, + "learning_rate": 1.3829430312216674e-05, + "loss": 5.4004, + "step": 6715 + }, + { + "epoch": 0.6476374156219865, + "grad_norm": 1.278664469718933, + "learning_rate": 1.3822655175372149e-05, + "loss": 5.3439, + "step": 6716 + }, + { + "epoch": 0.6477338476374156, + "grad_norm": 1.4691814184188843, + "learning_rate": 1.3815881064370595e-05, + "loss": 5.4802, + "step": 6717 + }, + { + "epoch": 0.6478302796528448, + "grad_norm": 1.7720733880996704, + "learning_rate": 1.3809107979833702e-05, + "loss": 5.4382, + "step": 6718 + }, + { + "epoch": 0.6479267116682739, + "grad_norm": 1.3180731534957886, + "learning_rate": 1.3802335922383103e-05, + "loss": 5.3398, + "step": 6719 + }, + { + "epoch": 0.648023143683703, + "grad_norm": 1.1189320087432861, + "learning_rate": 1.3795564892640334e-05, + "loss": 5.3981, + "step": 6720 + }, + { + "epoch": 0.6481195756991321, + "grad_norm": 1.521683692932129, + "learning_rate": 1.3788794891226828e-05, + "loss": 5.437, + "step": 6721 + }, + { + "epoch": 0.6482160077145612, + "grad_norm": 1.2180265188217163, + "learning_rate": 1.3782025918763927e-05, + "loss": 5.4375, + "step": 6722 + }, + { + "epoch": 0.6483124397299903, + "grad_norm": 1.1064428091049194, + "learning_rate": 1.377525797587288e-05, + "loss": 5.3938, + "step": 6723 + }, + { + "epoch": 0.6484088717454195, + "grad_norm": 1.231873631477356, + "learning_rate": 1.3768491063174852e-05, + "loss": 5.4886, + "step": 6724 + }, + { + "epoch": 0.6485053037608486, + "grad_norm": 1.357276201248169, + "learning_rate": 1.3761725181290869e-05, + "loss": 5.2766, + "step": 6725 + }, + { + "epoch": 0.6486017357762778, + "grad_norm": 1.7488678693771362, + "learning_rate": 1.3754960330841937e-05, + "loss": 5.2787, + "step": 6726 + }, + { + "epoch": 0.6486981677917069, + "grad_norm": 1.419762134552002, + "learning_rate": 1.3748196512448886e-05, + "loss": 5.4194, + "step": 6727 + }, + { + "epoch": 0.6487945998071359, + "grad_norm": 1.3656928539276123, + "learning_rate": 1.3741433726732533e-05, + "loss": 5.4289, + "step": 6728 + }, + { + "epoch": 0.6488910318225651, + "grad_norm": 1.5644721984863281, + "learning_rate": 1.3734671974313523e-05, + "loss": 5.4041, + "step": 6729 + }, + { + "epoch": 0.6489874638379942, + "grad_norm": 1.3753662109375, + "learning_rate": 1.3727911255812454e-05, + "loss": 5.4395, + "step": 6730 + }, + { + "epoch": 0.6490838958534233, + "grad_norm": 1.5213744640350342, + "learning_rate": 1.3721151571849813e-05, + "loss": 5.2987, + "step": 6731 + }, + { + "epoch": 0.6491803278688525, + "grad_norm": 1.890395998954773, + "learning_rate": 1.3714392923046013e-05, + "loss": 5.1398, + "step": 6732 + }, + { + "epoch": 0.6492767598842816, + "grad_norm": 1.2337126731872559, + "learning_rate": 1.370763531002132e-05, + "loss": 5.2753, + "step": 6733 + }, + { + "epoch": 0.6493731918997107, + "grad_norm": 1.3752288818359375, + "learning_rate": 1.3700878733395977e-05, + "loss": 5.4443, + "step": 6734 + }, + { + "epoch": 0.6494696239151398, + "grad_norm": 1.398425579071045, + "learning_rate": 1.3694123193790067e-05, + "loss": 5.2622, + "step": 6735 + }, + { + "epoch": 0.6495660559305689, + "grad_norm": 1.498599886894226, + "learning_rate": 1.3687368691823616e-05, + "loss": 5.4124, + "step": 6736 + }, + { + "epoch": 0.6496624879459981, + "grad_norm": 1.4191057682037354, + "learning_rate": 1.3680615228116544e-05, + "loss": 5.3259, + "step": 6737 + }, + { + "epoch": 0.6497589199614272, + "grad_norm": 1.540998935699463, + "learning_rate": 1.3673862803288673e-05, + "loss": 5.5035, + "step": 6738 + }, + { + "epoch": 0.6498553519768563, + "grad_norm": 1.5397812128067017, + "learning_rate": 1.3667111417959736e-05, + "loss": 5.3768, + "step": 6739 + }, + { + "epoch": 0.6499517839922855, + "grad_norm": 1.3498661518096924, + "learning_rate": 1.3660361072749372e-05, + "loss": 5.4085, + "step": 6740 + }, + { + "epoch": 0.6500482160077146, + "grad_norm": 1.2036463022232056, + "learning_rate": 1.3653611768277094e-05, + "loss": 5.3068, + "step": 6741 + }, + { + "epoch": 0.6501446480231436, + "grad_norm": 1.4320555925369263, + "learning_rate": 1.3646863505162383e-05, + "loss": 5.4697, + "step": 6742 + }, + { + "epoch": 0.6502410800385728, + "grad_norm": 1.3958972692489624, + "learning_rate": 1.364011628402455e-05, + "loss": 5.4579, + "step": 6743 + }, + { + "epoch": 0.6503375120540019, + "grad_norm": 1.2740720510482788, + "learning_rate": 1.363337010548287e-05, + "loss": 5.359, + "step": 6744 + }, + { + "epoch": 0.650433944069431, + "grad_norm": 1.2365354299545288, + "learning_rate": 1.3626624970156487e-05, + "loss": 5.4875, + "step": 6745 + }, + { + "epoch": 0.6505303760848602, + "grad_norm": 1.2779988050460815, + "learning_rate": 1.3619880878664471e-05, + "loss": 5.3226, + "step": 6746 + }, + { + "epoch": 0.6506268081002893, + "grad_norm": 1.5686089992523193, + "learning_rate": 1.3613137831625777e-05, + "loss": 5.4107, + "step": 6747 + }, + { + "epoch": 0.6507232401157185, + "grad_norm": 1.3883455991744995, + "learning_rate": 1.360639582965928e-05, + "loss": 5.3068, + "step": 6748 + }, + { + "epoch": 0.6508196721311476, + "grad_norm": 1.1385372877120972, + "learning_rate": 1.359965487338375e-05, + "loss": 5.4462, + "step": 6749 + }, + { + "epoch": 0.6509161041465766, + "grad_norm": 1.540315866470337, + "learning_rate": 1.3592914963417863e-05, + "loss": 5.4064, + "step": 6750 + }, + { + "epoch": 0.6510125361620058, + "grad_norm": 1.5160640478134155, + "learning_rate": 1.358617610038021e-05, + "loss": 5.4328, + "step": 6751 + }, + { + "epoch": 0.6511089681774349, + "grad_norm": 1.1910964250564575, + "learning_rate": 1.357943828488925e-05, + "loss": 5.5117, + "step": 6752 + }, + { + "epoch": 0.651205400192864, + "grad_norm": 1.4333044290542603, + "learning_rate": 1.3572701517563408e-05, + "loss": 5.4369, + "step": 6753 + }, + { + "epoch": 0.6513018322082932, + "grad_norm": 1.40633225440979, + "learning_rate": 1.356596579902094e-05, + "loss": 5.2636, + "step": 6754 + }, + { + "epoch": 0.6513982642237223, + "grad_norm": 1.2094684839248657, + "learning_rate": 1.3559231129880062e-05, + "loss": 5.4582, + "step": 6755 + }, + { + "epoch": 0.6514946962391513, + "grad_norm": 1.3181471824645996, + "learning_rate": 1.3552497510758871e-05, + "loss": 5.3805, + "step": 6756 + }, + { + "epoch": 0.6515911282545805, + "grad_norm": 1.5354864597320557, + "learning_rate": 1.3545764942275376e-05, + "loss": 5.437, + "step": 6757 + }, + { + "epoch": 0.6516875602700096, + "grad_norm": 1.3944206237792969, + "learning_rate": 1.353903342504746e-05, + "loss": 5.5539, + "step": 6758 + }, + { + "epoch": 0.6517839922854388, + "grad_norm": 1.8671196699142456, + "learning_rate": 1.3532302959692966e-05, + "loss": 5.239, + "step": 6759 + }, + { + "epoch": 0.6518804243008679, + "grad_norm": 1.3325388431549072, + "learning_rate": 1.3525573546829576e-05, + "loss": 5.3668, + "step": 6760 + }, + { + "epoch": 0.651976856316297, + "grad_norm": 1.4417989253997803, + "learning_rate": 1.3518845187074941e-05, + "loss": 5.4493, + "step": 6761 + }, + { + "epoch": 0.6520732883317262, + "grad_norm": 1.9437013864517212, + "learning_rate": 1.3512117881046554e-05, + "loss": 5.49, + "step": 6762 + }, + { + "epoch": 0.6521697203471553, + "grad_norm": 1.5361177921295166, + "learning_rate": 1.3505391629361846e-05, + "loss": 5.2795, + "step": 6763 + }, + { + "epoch": 0.6522661523625843, + "grad_norm": 1.341273307800293, + "learning_rate": 1.3498666432638151e-05, + "loss": 5.2425, + "step": 6764 + }, + { + "epoch": 0.6523625843780135, + "grad_norm": 2.18037748336792, + "learning_rate": 1.3491942291492693e-05, + "loss": 5.3291, + "step": 6765 + }, + { + "epoch": 0.6524590163934426, + "grad_norm": 1.9703752994537354, + "learning_rate": 1.3485219206542607e-05, + "loss": 5.2071, + "step": 6766 + }, + { + "epoch": 0.6525554484088717, + "grad_norm": 1.6568797826766968, + "learning_rate": 1.347849717840493e-05, + "loss": 5.4345, + "step": 6767 + }, + { + "epoch": 0.6526518804243009, + "grad_norm": 1.7782942056655884, + "learning_rate": 1.3471776207696612e-05, + "loss": 5.2914, + "step": 6768 + }, + { + "epoch": 0.65274831243973, + "grad_norm": 2.068955659866333, + "learning_rate": 1.3465056295034465e-05, + "loss": 5.4327, + "step": 6769 + }, + { + "epoch": 0.6528447444551592, + "grad_norm": 1.7661983966827393, + "learning_rate": 1.3458337441035274e-05, + "loss": 5.3641, + "step": 6770 + }, + { + "epoch": 0.6529411764705882, + "grad_norm": 1.4880772829055786, + "learning_rate": 1.3451619646315645e-05, + "loss": 5.543, + "step": 6771 + }, + { + "epoch": 0.6530376084860173, + "grad_norm": 1.3058931827545166, + "learning_rate": 1.3444902911492174e-05, + "loss": 5.3707, + "step": 6772 + }, + { + "epoch": 0.6531340405014465, + "grad_norm": 1.7270874977111816, + "learning_rate": 1.3438187237181279e-05, + "loss": 5.2164, + "step": 6773 + }, + { + "epoch": 0.6532304725168756, + "grad_norm": 1.434352159500122, + "learning_rate": 1.343147262399933e-05, + "loss": 5.4367, + "step": 6774 + }, + { + "epoch": 0.6533269045323047, + "grad_norm": 1.906847596168518, + "learning_rate": 1.3424759072562587e-05, + "loss": 5.3501, + "step": 6775 + }, + { + "epoch": 0.6534233365477339, + "grad_norm": 1.8477908372879028, + "learning_rate": 1.3418046583487218e-05, + "loss": 5.4333, + "step": 6776 + }, + { + "epoch": 0.653519768563163, + "grad_norm": 1.526650071144104, + "learning_rate": 1.341133515738926e-05, + "loss": 5.3653, + "step": 6777 + }, + { + "epoch": 0.653616200578592, + "grad_norm": 1.3122957944869995, + "learning_rate": 1.3404624794884716e-05, + "loss": 5.4407, + "step": 6778 + }, + { + "epoch": 0.6537126325940212, + "grad_norm": 1.5030577182769775, + "learning_rate": 1.3397915496589425e-05, + "loss": 5.3479, + "step": 6779 + }, + { + "epoch": 0.6538090646094503, + "grad_norm": 1.6537399291992188, + "learning_rate": 1.3391207263119175e-05, + "loss": 5.4179, + "step": 6780 + }, + { + "epoch": 0.6539054966248795, + "grad_norm": 1.2684110403060913, + "learning_rate": 1.338450009508963e-05, + "loss": 5.3562, + "step": 6781 + }, + { + "epoch": 0.6540019286403086, + "grad_norm": 1.3733550310134888, + "learning_rate": 1.3377793993116373e-05, + "loss": 5.3892, + "step": 6782 + }, + { + "epoch": 0.6540983606557377, + "grad_norm": 1.564736008644104, + "learning_rate": 1.3371088957814876e-05, + "loss": 5.2966, + "step": 6783 + }, + { + "epoch": 0.6541947926711669, + "grad_norm": 1.5166658163070679, + "learning_rate": 1.3364384989800521e-05, + "loss": 5.4299, + "step": 6784 + }, + { + "epoch": 0.654291224686596, + "grad_norm": 1.6621859073638916, + "learning_rate": 1.3357682089688595e-05, + "loss": 5.4705, + "step": 6785 + }, + { + "epoch": 0.654387656702025, + "grad_norm": 1.7818795442581177, + "learning_rate": 1.3350980258094281e-05, + "loss": 5.3619, + "step": 6786 + }, + { + "epoch": 0.6544840887174542, + "grad_norm": 1.452756643295288, + "learning_rate": 1.3344279495632655e-05, + "loss": 5.4418, + "step": 6787 + }, + { + "epoch": 0.6545805207328833, + "grad_norm": 1.093998908996582, + "learning_rate": 1.3337579802918714e-05, + "loss": 5.4233, + "step": 6788 + }, + { + "epoch": 0.6546769527483124, + "grad_norm": 1.4592005014419556, + "learning_rate": 1.3330881180567339e-05, + "loss": 5.4242, + "step": 6789 + }, + { + "epoch": 0.6547733847637416, + "grad_norm": 1.5143420696258545, + "learning_rate": 1.3324183629193333e-05, + "loss": 5.3587, + "step": 6790 + }, + { + "epoch": 0.6548698167791707, + "grad_norm": 1.2044835090637207, + "learning_rate": 1.331748714941138e-05, + "loss": 5.408, + "step": 6791 + }, + { + "epoch": 0.6549662487945999, + "grad_norm": 1.0968626737594604, + "learning_rate": 1.3310791741836077e-05, + "loss": 5.2847, + "step": 6792 + }, + { + "epoch": 0.6550626808100289, + "grad_norm": 1.3806421756744385, + "learning_rate": 1.3304097407081917e-05, + "loss": 5.3493, + "step": 6793 + }, + { + "epoch": 0.655159112825458, + "grad_norm": 1.3903952836990356, + "learning_rate": 1.3297404145763306e-05, + "loss": 5.2974, + "step": 6794 + }, + { + "epoch": 0.6552555448408872, + "grad_norm": 1.373105764389038, + "learning_rate": 1.3290711958494551e-05, + "loss": 5.2855, + "step": 6795 + }, + { + "epoch": 0.6553519768563163, + "grad_norm": 1.2152148485183716, + "learning_rate": 1.3284020845889817e-05, + "loss": 5.3176, + "step": 6796 + }, + { + "epoch": 0.6554484088717454, + "grad_norm": 1.2947618961334229, + "learning_rate": 1.3277330808563248e-05, + "loss": 5.2795, + "step": 6797 + }, + { + "epoch": 0.6555448408871746, + "grad_norm": 1.3783438205718994, + "learning_rate": 1.3270641847128823e-05, + "loss": 5.492, + "step": 6798 + }, + { + "epoch": 0.6556412729026037, + "grad_norm": 1.3973699808120728, + "learning_rate": 1.3263953962200455e-05, + "loss": 5.3618, + "step": 6799 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 1.297666072845459, + "learning_rate": 1.3257267154391942e-05, + "loss": 5.3518, + "step": 6800 + }, + { + "epoch": 0.6558341369334619, + "grad_norm": 1.308956265449524, + "learning_rate": 1.325058142431701e-05, + "loss": 5.2207, + "step": 6801 + }, + { + "epoch": 0.655930568948891, + "grad_norm": 1.438524603843689, + "learning_rate": 1.3243896772589232e-05, + "loss": 5.273, + "step": 6802 + }, + { + "epoch": 0.6560270009643202, + "grad_norm": 1.3690788745880127, + "learning_rate": 1.3237213199822157e-05, + "loss": 5.4234, + "step": 6803 + }, + { + "epoch": 0.6561234329797493, + "grad_norm": 1.2177003622055054, + "learning_rate": 1.3230530706629163e-05, + "loss": 5.3944, + "step": 6804 + }, + { + "epoch": 0.6562198649951784, + "grad_norm": 1.134748935699463, + "learning_rate": 1.3223849293623591e-05, + "loss": 5.3573, + "step": 6805 + }, + { + "epoch": 0.6563162970106076, + "grad_norm": 1.1266499757766724, + "learning_rate": 1.3217168961418625e-05, + "loss": 5.4209, + "step": 6806 + }, + { + "epoch": 0.6564127290260366, + "grad_norm": 1.1908142566680908, + "learning_rate": 1.3210489710627393e-05, + "loss": 5.384, + "step": 6807 + }, + { + "epoch": 0.6565091610414657, + "grad_norm": 1.1881741285324097, + "learning_rate": 1.3203811541862905e-05, + "loss": 5.2349, + "step": 6808 + }, + { + "epoch": 0.6566055930568949, + "grad_norm": 1.1233402490615845, + "learning_rate": 1.319713445573807e-05, + "loss": 5.3608, + "step": 6809 + }, + { + "epoch": 0.656702025072324, + "grad_norm": 1.1458370685577393, + "learning_rate": 1.3190458452865712e-05, + "loss": 5.3159, + "step": 6810 + }, + { + "epoch": 0.6567984570877531, + "grad_norm": 1.186647653579712, + "learning_rate": 1.318378353385854e-05, + "loss": 5.3287, + "step": 6811 + }, + { + "epoch": 0.6568948891031823, + "grad_norm": 1.4451303482055664, + "learning_rate": 1.3177109699329183e-05, + "loss": 5.3631, + "step": 6812 + }, + { + "epoch": 0.6569913211186114, + "grad_norm": 1.307792067527771, + "learning_rate": 1.3170436949890125e-05, + "loss": 5.2159, + "step": 6813 + }, + { + "epoch": 0.6570877531340406, + "grad_norm": 1.3936506509780884, + "learning_rate": 1.3163765286153825e-05, + "loss": 5.369, + "step": 6814 + }, + { + "epoch": 0.6571841851494696, + "grad_norm": 1.2676091194152832, + "learning_rate": 1.3157094708732559e-05, + "loss": 5.4156, + "step": 6815 + }, + { + "epoch": 0.6572806171648987, + "grad_norm": 1.2169963121414185, + "learning_rate": 1.3150425218238585e-05, + "loss": 5.3816, + "step": 6816 + }, + { + "epoch": 0.6573770491803279, + "grad_norm": 1.132468581199646, + "learning_rate": 1.314375681528399e-05, + "loss": 5.3309, + "step": 6817 + }, + { + "epoch": 0.657473481195757, + "grad_norm": 1.0487695932388306, + "learning_rate": 1.3137089500480801e-05, + "loss": 5.4224, + "step": 6818 + }, + { + "epoch": 0.6575699132111861, + "grad_norm": 1.0487772226333618, + "learning_rate": 1.3130423274440937e-05, + "loss": 5.3622, + "step": 6819 + }, + { + "epoch": 0.6576663452266153, + "grad_norm": 1.0414893627166748, + "learning_rate": 1.3123758137776226e-05, + "loss": 5.4352, + "step": 6820 + }, + { + "epoch": 0.6577627772420443, + "grad_norm": 1.167650818824768, + "learning_rate": 1.3117094091098353e-05, + "loss": 5.3894, + "step": 6821 + }, + { + "epoch": 0.6578592092574734, + "grad_norm": 1.1084662675857544, + "learning_rate": 1.3110431135018984e-05, + "loss": 5.3397, + "step": 6822 + }, + { + "epoch": 0.6579556412729026, + "grad_norm": 1.1783701181411743, + "learning_rate": 1.3103769270149597e-05, + "loss": 5.4545, + "step": 6823 + }, + { + "epoch": 0.6580520732883317, + "grad_norm": 1.261796474456787, + "learning_rate": 1.309710849710163e-05, + "loss": 5.515, + "step": 6824 + }, + { + "epoch": 0.6581485053037609, + "grad_norm": 1.1340850591659546, + "learning_rate": 1.3090448816486388e-05, + "loss": 5.4054, + "step": 6825 + }, + { + "epoch": 0.65824493731919, + "grad_norm": 1.1226403713226318, + "learning_rate": 1.3083790228915099e-05, + "loss": 5.3834, + "step": 6826 + }, + { + "epoch": 0.6583413693346191, + "grad_norm": 1.2314703464508057, + "learning_rate": 1.3077132734998876e-05, + "loss": 5.4924, + "step": 6827 + }, + { + "epoch": 0.6584378013500483, + "grad_norm": 1.0820653438568115, + "learning_rate": 1.3070476335348739e-05, + "loss": 5.3983, + "step": 6828 + }, + { + "epoch": 0.6585342333654773, + "grad_norm": 1.1077066659927368, + "learning_rate": 1.3063821030575597e-05, + "loss": 5.3083, + "step": 6829 + }, + { + "epoch": 0.6586306653809064, + "grad_norm": 1.4002517461776733, + "learning_rate": 1.3057166821290274e-05, + "loss": 5.3215, + "step": 6830 + }, + { + "epoch": 0.6587270973963356, + "grad_norm": 1.2130311727523804, + "learning_rate": 1.3050513708103485e-05, + "loss": 5.4048, + "step": 6831 + }, + { + "epoch": 0.6588235294117647, + "grad_norm": 1.9625263214111328, + "learning_rate": 1.3043861691625824e-05, + "loss": 5.3511, + "step": 6832 + }, + { + "epoch": 0.6589199614271938, + "grad_norm": 1.2052178382873535, + "learning_rate": 1.303721077246784e-05, + "loss": 5.3396, + "step": 6833 + }, + { + "epoch": 0.659016393442623, + "grad_norm": 1.0535962581634521, + "learning_rate": 1.3030560951239917e-05, + "loss": 5.3436, + "step": 6834 + }, + { + "epoch": 0.659112825458052, + "grad_norm": 1.1249154806137085, + "learning_rate": 1.3023912228552381e-05, + "loss": 5.3848, + "step": 6835 + }, + { + "epoch": 0.6592092574734812, + "grad_norm": 1.0140150785446167, + "learning_rate": 1.3017264605015439e-05, + "loss": 5.3876, + "step": 6836 + }, + { + "epoch": 0.6593056894889103, + "grad_norm": 1.0333150625228882, + "learning_rate": 1.30106180812392e-05, + "loss": 5.2818, + "step": 6837 + }, + { + "epoch": 0.6594021215043394, + "grad_norm": 1.2550684213638306, + "learning_rate": 1.3003972657833683e-05, + "loss": 5.3842, + "step": 6838 + }, + { + "epoch": 0.6594985535197686, + "grad_norm": 1.0571240186691284, + "learning_rate": 1.2997328335408798e-05, + "loss": 5.3163, + "step": 6839 + }, + { + "epoch": 0.6595949855351977, + "grad_norm": 1.0432710647583008, + "learning_rate": 1.2990685114574325e-05, + "loss": 5.393, + "step": 6840 + }, + { + "epoch": 0.6596914175506268, + "grad_norm": 1.1686222553253174, + "learning_rate": 1.298404299594001e-05, + "loss": 5.3682, + "step": 6841 + }, + { + "epoch": 0.659787849566056, + "grad_norm": 1.134943962097168, + "learning_rate": 1.2977401980115433e-05, + "loss": 5.3271, + "step": 6842 + }, + { + "epoch": 0.659884281581485, + "grad_norm": 1.1408883333206177, + "learning_rate": 1.2970762067710102e-05, + "loss": 5.3966, + "step": 6843 + }, + { + "epoch": 0.6599807135969141, + "grad_norm": 1.069542646408081, + "learning_rate": 1.296412325933342e-05, + "loss": 5.3586, + "step": 6844 + }, + { + "epoch": 0.6600771456123433, + "grad_norm": 1.0906155109405518, + "learning_rate": 1.2957485555594705e-05, + "loss": 5.2658, + "step": 6845 + }, + { + "epoch": 0.6601735776277724, + "grad_norm": 1.1539701223373413, + "learning_rate": 1.2950848957103126e-05, + "loss": 5.3735, + "step": 6846 + }, + { + "epoch": 0.6602700096432016, + "grad_norm": 1.1463232040405273, + "learning_rate": 1.2944213464467819e-05, + "loss": 5.4257, + "step": 6847 + }, + { + "epoch": 0.6603664416586307, + "grad_norm": 1.1160216331481934, + "learning_rate": 1.293757907829774e-05, + "loss": 5.2903, + "step": 6848 + }, + { + "epoch": 0.6604628736740598, + "grad_norm": 1.2618153095245361, + "learning_rate": 1.2930945799201827e-05, + "loss": 5.3816, + "step": 6849 + }, + { + "epoch": 0.660559305689489, + "grad_norm": 1.3620413541793823, + "learning_rate": 1.2924313627788841e-05, + "loss": 5.34, + "step": 6850 + }, + { + "epoch": 0.660655737704918, + "grad_norm": 1.3618102073669434, + "learning_rate": 1.2917682564667493e-05, + "loss": 5.1938, + "step": 6851 + }, + { + "epoch": 0.6607521697203471, + "grad_norm": 1.768540859222412, + "learning_rate": 1.2911052610446367e-05, + "loss": 5.6013, + "step": 6852 + }, + { + "epoch": 0.6608486017357763, + "grad_norm": 1.7204029560089111, + "learning_rate": 1.2904423765733953e-05, + "loss": 5.4573, + "step": 6853 + }, + { + "epoch": 0.6609450337512054, + "grad_norm": 1.7434660196304321, + "learning_rate": 1.2897796031138637e-05, + "loss": 5.3984, + "step": 6854 + }, + { + "epoch": 0.6610414657666345, + "grad_norm": 1.3152616024017334, + "learning_rate": 1.2891169407268704e-05, + "loss": 5.2308, + "step": 6855 + }, + { + "epoch": 0.6611378977820637, + "grad_norm": 1.9057848453521729, + "learning_rate": 1.2884543894732353e-05, + "loss": 5.3961, + "step": 6856 + }, + { + "epoch": 0.6612343297974927, + "grad_norm": 2.248241424560547, + "learning_rate": 1.287791949413763e-05, + "loss": 5.5005, + "step": 6857 + }, + { + "epoch": 0.6613307618129219, + "grad_norm": 1.999792456626892, + "learning_rate": 1.287129620609256e-05, + "loss": 5.4127, + "step": 6858 + }, + { + "epoch": 0.661427193828351, + "grad_norm": 2.062347173690796, + "learning_rate": 1.2864674031204971e-05, + "loss": 5.304, + "step": 6859 + }, + { + "epoch": 0.6615236258437801, + "grad_norm": 1.8959921598434448, + "learning_rate": 1.2858052970082684e-05, + "loss": 5.5425, + "step": 6860 + }, + { + "epoch": 0.6616200578592093, + "grad_norm": 2.6010806560516357, + "learning_rate": 1.2851433023333343e-05, + "loss": 5.2032, + "step": 6861 + }, + { + "epoch": 0.6617164898746384, + "grad_norm": 2.066286087036133, + "learning_rate": 1.2844814191564525e-05, + "loss": 5.4442, + "step": 6862 + }, + { + "epoch": 0.6618129218900675, + "grad_norm": 2.2913355827331543, + "learning_rate": 1.2838196475383702e-05, + "loss": 5.3682, + "step": 6863 + }, + { + "epoch": 0.6619093539054967, + "grad_norm": 2.3345162868499756, + "learning_rate": 1.2831579875398248e-05, + "loss": 5.2528, + "step": 6864 + }, + { + "epoch": 0.6620057859209257, + "grad_norm": 1.879050612449646, + "learning_rate": 1.2824964392215394e-05, + "loss": 5.3863, + "step": 6865 + }, + { + "epoch": 0.6621022179363548, + "grad_norm": 1.9708545207977295, + "learning_rate": 1.2818350026442343e-05, + "loss": 5.2315, + "step": 6866 + }, + { + "epoch": 0.662198649951784, + "grad_norm": 2.385420799255371, + "learning_rate": 1.2811736778686123e-05, + "loss": 5.4342, + "step": 6867 + }, + { + "epoch": 0.6622950819672131, + "grad_norm": 2.211362361907959, + "learning_rate": 1.2805124649553702e-05, + "loss": 5.3096, + "step": 6868 + }, + { + "epoch": 0.6623915139826423, + "grad_norm": 2.8282268047332764, + "learning_rate": 1.279851363965193e-05, + "loss": 5.3351, + "step": 6869 + }, + { + "epoch": 0.6624879459980714, + "grad_norm": 2.0512466430664062, + "learning_rate": 1.2791903749587558e-05, + "loss": 5.4514, + "step": 6870 + }, + { + "epoch": 0.6625843780135005, + "grad_norm": 2.013944387435913, + "learning_rate": 1.2785294979967236e-05, + "loss": 5.492, + "step": 6871 + }, + { + "epoch": 0.6626808100289296, + "grad_norm": 2.75254225730896, + "learning_rate": 1.2778687331397505e-05, + "loss": 5.3118, + "step": 6872 + }, + { + "epoch": 0.6627772420443587, + "grad_norm": 2.6522843837738037, + "learning_rate": 1.277208080448481e-05, + "loss": 5.4156, + "step": 6873 + }, + { + "epoch": 0.6628736740597878, + "grad_norm": 2.0916056632995605, + "learning_rate": 1.2765475399835489e-05, + "loss": 5.1506, + "step": 6874 + }, + { + "epoch": 0.662970106075217, + "grad_norm": 2.524564266204834, + "learning_rate": 1.2758871118055788e-05, + "loss": 5.6358, + "step": 6875 + }, + { + "epoch": 0.6630665380906461, + "grad_norm": 2.7882161140441895, + "learning_rate": 1.2752267959751807e-05, + "loss": 5.6749, + "step": 6876 + }, + { + "epoch": 0.6631629701060752, + "grad_norm": 3.0832173824310303, + "learning_rate": 1.2745665925529621e-05, + "loss": 5.3711, + "step": 6877 + }, + { + "epoch": 0.6632594021215044, + "grad_norm": 3.1012320518493652, + "learning_rate": 1.2739065015995122e-05, + "loss": 5.2557, + "step": 6878 + }, + { + "epoch": 0.6633558341369334, + "grad_norm": 2.5421555042266846, + "learning_rate": 1.2732465231754143e-05, + "loss": 5.2098, + "step": 6879 + }, + { + "epoch": 0.6634522661523626, + "grad_norm": 2.922231912612915, + "learning_rate": 1.272586657341241e-05, + "loss": 5.319, + "step": 6880 + }, + { + "epoch": 0.6635486981677917, + "grad_norm": 2.400125026702881, + "learning_rate": 1.2719269041575532e-05, + "loss": 5.4795, + "step": 6881 + }, + { + "epoch": 0.6636451301832208, + "grad_norm": 2.6413636207580566, + "learning_rate": 1.2712672636849032e-05, + "loss": 5.6785, + "step": 6882 + }, + { + "epoch": 0.66374156219865, + "grad_norm": 2.0705738067626953, + "learning_rate": 1.270607735983832e-05, + "loss": 5.5989, + "step": 6883 + }, + { + "epoch": 0.6638379942140791, + "grad_norm": 2.2415473461151123, + "learning_rate": 1.2699483211148677e-05, + "loss": 5.4391, + "step": 6884 + }, + { + "epoch": 0.6639344262295082, + "grad_norm": 3.0460996627807617, + "learning_rate": 1.2692890191385342e-05, + "loss": 5.2132, + "step": 6885 + }, + { + "epoch": 0.6640308582449373, + "grad_norm": 3.0453598499298096, + "learning_rate": 1.2686298301153393e-05, + "loss": 5.6368, + "step": 6886 + }, + { + "epoch": 0.6641272902603664, + "grad_norm": 2.525240182876587, + "learning_rate": 1.2679707541057829e-05, + "loss": 5.5607, + "step": 6887 + }, + { + "epoch": 0.6642237222757955, + "grad_norm": 2.612426996231079, + "learning_rate": 1.2673117911703542e-05, + "loss": 5.3815, + "step": 6888 + }, + { + "epoch": 0.6643201542912247, + "grad_norm": 2.303694486618042, + "learning_rate": 1.2666529413695322e-05, + "loss": 5.2903, + "step": 6889 + }, + { + "epoch": 0.6644165863066538, + "grad_norm": 2.638686418533325, + "learning_rate": 1.2659942047637852e-05, + "loss": 5.3599, + "step": 6890 + }, + { + "epoch": 0.664513018322083, + "grad_norm": 2.4064266681671143, + "learning_rate": 1.2653355814135726e-05, + "loss": 5.4854, + "step": 6891 + }, + { + "epoch": 0.6646094503375121, + "grad_norm": 2.880077838897705, + "learning_rate": 1.2646770713793382e-05, + "loss": 5.2849, + "step": 6892 + }, + { + "epoch": 0.6647058823529411, + "grad_norm": 3.818741798400879, + "learning_rate": 1.2640186747215239e-05, + "loss": 5.3169, + "step": 6893 + }, + { + "epoch": 0.6648023143683703, + "grad_norm": 3.361238479614258, + "learning_rate": 1.2633603915005534e-05, + "loss": 5.3072, + "step": 6894 + }, + { + "epoch": 0.6648987463837994, + "grad_norm": 2.9524476528167725, + "learning_rate": 1.2627022217768442e-05, + "loss": 5.4803, + "step": 6895 + }, + { + "epoch": 0.6649951783992285, + "grad_norm": 2.1095149517059326, + "learning_rate": 1.2620441656108024e-05, + "loss": 5.2986, + "step": 6896 + }, + { + "epoch": 0.6650916104146577, + "grad_norm": 2.4606528282165527, + "learning_rate": 1.2613862230628229e-05, + "loss": 5.2232, + "step": 6897 + }, + { + "epoch": 0.6651880424300868, + "grad_norm": 3.375005006790161, + "learning_rate": 1.2607283941932918e-05, + "loss": 5.264, + "step": 6898 + }, + { + "epoch": 0.6652844744455159, + "grad_norm": 2.923541307449341, + "learning_rate": 1.2600706790625833e-05, + "loss": 5.6313, + "step": 6899 + }, + { + "epoch": 0.665380906460945, + "grad_norm": 2.6062653064727783, + "learning_rate": 1.2594130777310625e-05, + "loss": 5.5082, + "step": 6900 + }, + { + "epoch": 0.6654773384763741, + "grad_norm": 2.5786774158477783, + "learning_rate": 1.2587555902590809e-05, + "loss": 5.4527, + "step": 6901 + }, + { + "epoch": 0.6655737704918033, + "grad_norm": 4.413331508636475, + "learning_rate": 1.2580982167069849e-05, + "loss": 5.3253, + "step": 6902 + }, + { + "epoch": 0.6656702025072324, + "grad_norm": 4.198273658752441, + "learning_rate": 1.2574409571351047e-05, + "loss": 5.2108, + "step": 6903 + }, + { + "epoch": 0.6657666345226615, + "grad_norm": 3.32527756690979, + "learning_rate": 1.2567838116037658e-05, + "loss": 5.1377, + "step": 6904 + }, + { + "epoch": 0.6658630665380907, + "grad_norm": 2.654911994934082, + "learning_rate": 1.2561267801732776e-05, + "loss": 5.3414, + "step": 6905 + }, + { + "epoch": 0.6659594985535198, + "grad_norm": 3.045034885406494, + "learning_rate": 1.2554698629039425e-05, + "loss": 5.5121, + "step": 6906 + }, + { + "epoch": 0.6660559305689489, + "grad_norm": 2.3447227478027344, + "learning_rate": 1.2548130598560518e-05, + "loss": 5.4639, + "step": 6907 + }, + { + "epoch": 0.666152362584378, + "grad_norm": 4.147785663604736, + "learning_rate": 1.2541563710898865e-05, + "loss": 5.3447, + "step": 6908 + }, + { + "epoch": 0.6662487945998071, + "grad_norm": 3.1964681148529053, + "learning_rate": 1.2534997966657145e-05, + "loss": 5.3131, + "step": 6909 + }, + { + "epoch": 0.6663452266152362, + "grad_norm": 3.094109296798706, + "learning_rate": 1.2528433366437987e-05, + "loss": 5.3602, + "step": 6910 + }, + { + "epoch": 0.6664416586306654, + "grad_norm": 3.081285238265991, + "learning_rate": 1.2521869910843856e-05, + "loss": 5.3016, + "step": 6911 + }, + { + "epoch": 0.6665380906460945, + "grad_norm": 3.8226447105407715, + "learning_rate": 1.2515307600477145e-05, + "loss": 5.2213, + "step": 6912 + }, + { + "epoch": 0.6666345226615237, + "grad_norm": 2.860697031021118, + "learning_rate": 1.2508746435940138e-05, + "loss": 5.4769, + "step": 6913 + }, + { + "epoch": 0.6667309546769528, + "grad_norm": 2.763798952102661, + "learning_rate": 1.250218641783501e-05, + "loss": 5.4795, + "step": 6914 + }, + { + "epoch": 0.6668273866923818, + "grad_norm": 3.33294939994812, + "learning_rate": 1.2495627546763828e-05, + "loss": 5.4856, + "step": 6915 + }, + { + "epoch": 0.666923818707811, + "grad_norm": 3.000805377960205, + "learning_rate": 1.2489069823328558e-05, + "loss": 5.4422, + "step": 6916 + }, + { + "epoch": 0.6670202507232401, + "grad_norm": 4.22330904006958, + "learning_rate": 1.2482513248131065e-05, + "loss": 5.3807, + "step": 6917 + }, + { + "epoch": 0.6671166827386692, + "grad_norm": 2.8570632934570312, + "learning_rate": 1.2475957821773096e-05, + "loss": 5.4938, + "step": 6918 + }, + { + "epoch": 0.6672131147540984, + "grad_norm": 2.6621460914611816, + "learning_rate": 1.2469403544856314e-05, + "loss": 5.3807, + "step": 6919 + }, + { + "epoch": 0.6673095467695275, + "grad_norm": 3.3875319957733154, + "learning_rate": 1.2462850417982234e-05, + "loss": 5.563, + "step": 6920 + }, + { + "epoch": 0.6674059787849566, + "grad_norm": 1.9764052629470825, + "learning_rate": 1.2456298441752332e-05, + "loss": 5.7335, + "step": 6921 + }, + { + "epoch": 0.6675024108003857, + "grad_norm": 2.3664543628692627, + "learning_rate": 1.244974761676791e-05, + "loss": 5.2868, + "step": 6922 + }, + { + "epoch": 0.6675988428158148, + "grad_norm": 2.983640432357788, + "learning_rate": 1.2443197943630202e-05, + "loss": 5.466, + "step": 6923 + }, + { + "epoch": 0.667695274831244, + "grad_norm": 3.68776798248291, + "learning_rate": 1.2436649422940333e-05, + "loss": 5.7306, + "step": 6924 + }, + { + "epoch": 0.6677917068466731, + "grad_norm": 2.7878646850585938, + "learning_rate": 1.243010205529932e-05, + "loss": 5.6992, + "step": 6925 + }, + { + "epoch": 0.6678881388621022, + "grad_norm": 2.41921067237854, + "learning_rate": 1.2423555841308066e-05, + "loss": 5.4014, + "step": 6926 + }, + { + "epoch": 0.6679845708775314, + "grad_norm": 2.233776330947876, + "learning_rate": 1.241701078156739e-05, + "loss": 5.3987, + "step": 6927 + }, + { + "epoch": 0.6680810028929605, + "grad_norm": 2.93894624710083, + "learning_rate": 1.2410466876677956e-05, + "loss": 5.4334, + "step": 6928 + }, + { + "epoch": 0.6681774349083895, + "grad_norm": 3.1341137886047363, + "learning_rate": 1.2403924127240395e-05, + "loss": 5.318, + "step": 6929 + }, + { + "epoch": 0.6682738669238187, + "grad_norm": 2.624337911605835, + "learning_rate": 1.2397382533855168e-05, + "loss": 5.2927, + "step": 6930 + }, + { + "epoch": 0.6683702989392478, + "grad_norm": 1.9157921075820923, + "learning_rate": 1.239084209712266e-05, + "loss": 5.4549, + "step": 6931 + }, + { + "epoch": 0.6684667309546769, + "grad_norm": 2.087374210357666, + "learning_rate": 1.2384302817643145e-05, + "loss": 5.3657, + "step": 6932 + }, + { + "epoch": 0.6685631629701061, + "grad_norm": 1.783261775970459, + "learning_rate": 1.2377764696016788e-05, + "loss": 5.5323, + "step": 6933 + }, + { + "epoch": 0.6686595949855352, + "grad_norm": 2.2554433345794678, + "learning_rate": 1.2371227732843655e-05, + "loss": 5.3772, + "step": 6934 + }, + { + "epoch": 0.6687560270009644, + "grad_norm": 2.283552408218384, + "learning_rate": 1.2364691928723692e-05, + "loss": 5.544, + "step": 6935 + }, + { + "epoch": 0.6688524590163935, + "grad_norm": 2.1379446983337402, + "learning_rate": 1.2358157284256758e-05, + "loss": 5.3408, + "step": 6936 + }, + { + "epoch": 0.6689488910318225, + "grad_norm": 2.0039210319519043, + "learning_rate": 1.2351623800042587e-05, + "loss": 5.4861, + "step": 6937 + }, + { + "epoch": 0.6690453230472517, + "grad_norm": 2.315960168838501, + "learning_rate": 1.2345091476680824e-05, + "loss": 5.578, + "step": 6938 + }, + { + "epoch": 0.6691417550626808, + "grad_norm": 2.4705591201782227, + "learning_rate": 1.2338560314770972e-05, + "loss": 5.5073, + "step": 6939 + }, + { + "epoch": 0.6692381870781099, + "grad_norm": 2.5311896800994873, + "learning_rate": 1.2332030314912488e-05, + "loss": 5.395, + "step": 6940 + }, + { + "epoch": 0.6693346190935391, + "grad_norm": 2.4124701023101807, + "learning_rate": 1.2325501477704663e-05, + "loss": 5.3034, + "step": 6941 + }, + { + "epoch": 0.6694310511089682, + "grad_norm": 3.3760290145874023, + "learning_rate": 1.231897380374671e-05, + "loss": 5.3613, + "step": 6942 + }, + { + "epoch": 0.6695274831243972, + "grad_norm": 4.232667446136475, + "learning_rate": 1.2312447293637735e-05, + "loss": 5.4261, + "step": 6943 + }, + { + "epoch": 0.6696239151398264, + "grad_norm": 3.0329744815826416, + "learning_rate": 1.2305921947976743e-05, + "loss": 5.4128, + "step": 6944 + }, + { + "epoch": 0.6697203471552555, + "grad_norm": 2.2127954959869385, + "learning_rate": 1.2299397767362588e-05, + "loss": 5.42, + "step": 6945 + }, + { + "epoch": 0.6698167791706847, + "grad_norm": 2.4272100925445557, + "learning_rate": 1.2292874752394093e-05, + "loss": 5.429, + "step": 6946 + }, + { + "epoch": 0.6699132111861138, + "grad_norm": 2.341684103012085, + "learning_rate": 1.2286352903669899e-05, + "loss": 5.2943, + "step": 6947 + }, + { + "epoch": 0.6700096432015429, + "grad_norm": 2.7695634365081787, + "learning_rate": 1.22798322217886e-05, + "loss": 5.3852, + "step": 6948 + }, + { + "epoch": 0.6701060752169721, + "grad_norm": 2.0934510231018066, + "learning_rate": 1.2273312707348638e-05, + "loss": 5.3992, + "step": 6949 + }, + { + "epoch": 0.6702025072324012, + "grad_norm": 1.834904432296753, + "learning_rate": 1.2266794360948372e-05, + "loss": 5.547, + "step": 6950 + }, + { + "epoch": 0.6702989392478302, + "grad_norm": 1.9521993398666382, + "learning_rate": 1.2260277183186047e-05, + "loss": 5.2187, + "step": 6951 + }, + { + "epoch": 0.6703953712632594, + "grad_norm": 2.2559974193573, + "learning_rate": 1.2253761174659811e-05, + "loss": 5.4325, + "step": 6952 + }, + { + "epoch": 0.6704918032786885, + "grad_norm": 2.061861515045166, + "learning_rate": 1.224724633596767e-05, + "loss": 5.3953, + "step": 6953 + }, + { + "epoch": 0.6705882352941176, + "grad_norm": 2.8289198875427246, + "learning_rate": 1.224073266770758e-05, + "loss": 5.1533, + "step": 6954 + }, + { + "epoch": 0.6706846673095468, + "grad_norm": 2.2698421478271484, + "learning_rate": 1.223422017047733e-05, + "loss": 5.6463, + "step": 6955 + }, + { + "epoch": 0.6707810993249759, + "grad_norm": 2.2311439514160156, + "learning_rate": 1.2227708844874646e-05, + "loss": 5.5571, + "step": 6956 + }, + { + "epoch": 0.6708775313404051, + "grad_norm": 1.6558886766433716, + "learning_rate": 1.222119869149712e-05, + "loss": 5.5757, + "step": 6957 + }, + { + "epoch": 0.6709739633558341, + "grad_norm": 1.5988991260528564, + "learning_rate": 1.2214689710942254e-05, + "loss": 5.4657, + "step": 6958 + }, + { + "epoch": 0.6710703953712632, + "grad_norm": 2.191452741622925, + "learning_rate": 1.2208181903807429e-05, + "loss": 5.0383, + "step": 6959 + }, + { + "epoch": 0.6711668273866924, + "grad_norm": 2.242624282836914, + "learning_rate": 1.2201675270689923e-05, + "loss": 5.367, + "step": 6960 + }, + { + "epoch": 0.6712632594021215, + "grad_norm": 2.9657037258148193, + "learning_rate": 1.2195169812186908e-05, + "loss": 5.3435, + "step": 6961 + }, + { + "epoch": 0.6713596914175506, + "grad_norm": 2.965402603149414, + "learning_rate": 1.218866552889545e-05, + "loss": 5.2985, + "step": 6962 + }, + { + "epoch": 0.6714561234329798, + "grad_norm": 3.9134135246276855, + "learning_rate": 1.2182162421412508e-05, + "loss": 5.6034, + "step": 6963 + }, + { + "epoch": 0.6715525554484089, + "grad_norm": 2.547877311706543, + "learning_rate": 1.2175660490334906e-05, + "loss": 5.4444, + "step": 6964 + }, + { + "epoch": 0.6716489874638379, + "grad_norm": 2.368748426437378, + "learning_rate": 1.2169159736259416e-05, + "loss": 5.2204, + "step": 6965 + }, + { + "epoch": 0.6717454194792671, + "grad_norm": 2.416200637817383, + "learning_rate": 1.2162660159782649e-05, + "loss": 5.613, + "step": 6966 + }, + { + "epoch": 0.6718418514946962, + "grad_norm": 2.1115760803222656, + "learning_rate": 1.2156161761501128e-05, + "loss": 5.3023, + "step": 6967 + }, + { + "epoch": 0.6719382835101254, + "grad_norm": 2.1588737964630127, + "learning_rate": 1.2149664542011273e-05, + "loss": 5.1856, + "step": 6968 + }, + { + "epoch": 0.6720347155255545, + "grad_norm": 2.8517649173736572, + "learning_rate": 1.2143168501909392e-05, + "loss": 5.5963, + "step": 6969 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 2.332031488418579, + "learning_rate": 1.2136673641791682e-05, + "loss": 5.5432, + "step": 6970 + }, + { + "epoch": 0.6722275795564128, + "grad_norm": 2.3346025943756104, + "learning_rate": 1.213017996225424e-05, + "loss": 5.6226, + "step": 6971 + }, + { + "epoch": 0.6723240115718419, + "grad_norm": 2.9332592487335205, + "learning_rate": 1.2123687463893025e-05, + "loss": 5.7118, + "step": 6972 + }, + { + "epoch": 0.6724204435872709, + "grad_norm": 2.7732627391815186, + "learning_rate": 1.2117196147303941e-05, + "loss": 5.2044, + "step": 6973 + }, + { + "epoch": 0.6725168756027001, + "grad_norm": 2.8367273807525635, + "learning_rate": 1.2110706013082731e-05, + "loss": 5.3602, + "step": 6974 + }, + { + "epoch": 0.6726133076181292, + "grad_norm": 2.400714874267578, + "learning_rate": 1.210421706182506e-05, + "loss": 5.3053, + "step": 6975 + }, + { + "epoch": 0.6727097396335583, + "grad_norm": 2.6301560401916504, + "learning_rate": 1.2097729294126476e-05, + "loss": 5.1081, + "step": 6976 + }, + { + "epoch": 0.6728061716489875, + "grad_norm": 2.203556776046753, + "learning_rate": 1.2091242710582414e-05, + "loss": 5.2989, + "step": 6977 + }, + { + "epoch": 0.6729026036644166, + "grad_norm": 1.907788872718811, + "learning_rate": 1.2084757311788209e-05, + "loss": 5.3658, + "step": 6978 + }, + { + "epoch": 0.6729990356798458, + "grad_norm": 2.4872865676879883, + "learning_rate": 1.2078273098339082e-05, + "loss": 5.4108, + "step": 6979 + }, + { + "epoch": 0.6730954676952748, + "grad_norm": 2.7771193981170654, + "learning_rate": 1.2071790070830146e-05, + "loss": 5.3548, + "step": 6980 + }, + { + "epoch": 0.6731918997107039, + "grad_norm": 3.6575889587402344, + "learning_rate": 1.2065308229856407e-05, + "loss": 5.2616, + "step": 6981 + }, + { + "epoch": 0.6732883317261331, + "grad_norm": 2.7595505714416504, + "learning_rate": 1.2058827576012766e-05, + "loss": 5.451, + "step": 6982 + }, + { + "epoch": 0.6733847637415622, + "grad_norm": 2.395540237426758, + "learning_rate": 1.2052348109893985e-05, + "loss": 5.1711, + "step": 6983 + }, + { + "epoch": 0.6734811957569913, + "grad_norm": 3.4396772384643555, + "learning_rate": 1.2045869832094775e-05, + "loss": 5.2464, + "step": 6984 + }, + { + "epoch": 0.6735776277724205, + "grad_norm": 3.414994716644287, + "learning_rate": 1.203939274320968e-05, + "loss": 5.1558, + "step": 6985 + }, + { + "epoch": 0.6736740597878496, + "grad_norm": 3.4179162979125977, + "learning_rate": 1.2032916843833167e-05, + "loss": 5.5479, + "step": 6986 + }, + { + "epoch": 0.6737704918032786, + "grad_norm": 2.476612091064453, + "learning_rate": 1.2026442134559585e-05, + "loss": 5.4891, + "step": 6987 + }, + { + "epoch": 0.6738669238187078, + "grad_norm": 3.386622428894043, + "learning_rate": 1.2019968615983187e-05, + "loss": 5.5198, + "step": 6988 + }, + { + "epoch": 0.6739633558341369, + "grad_norm": 4.924520492553711, + "learning_rate": 1.2013496288698073e-05, + "loss": 5.2967, + "step": 6989 + }, + { + "epoch": 0.6740597878495661, + "grad_norm": 3.7747318744659424, + "learning_rate": 1.2007025153298307e-05, + "loss": 5.2574, + "step": 6990 + }, + { + "epoch": 0.6741562198649952, + "grad_norm": 3.3275136947631836, + "learning_rate": 1.2000555210377761e-05, + "loss": 5.6412, + "step": 6991 + }, + { + "epoch": 0.6742526518804243, + "grad_norm": 2.6285347938537598, + "learning_rate": 1.1994086460530276e-05, + "loss": 5.5576, + "step": 6992 + }, + { + "epoch": 0.6743490838958535, + "grad_norm": 2.3111915588378906, + "learning_rate": 1.1987618904349522e-05, + "loss": 5.6235, + "step": 6993 + }, + { + "epoch": 0.6744455159112825, + "grad_norm": 2.679417848587036, + "learning_rate": 1.1981152542429083e-05, + "loss": 5.4529, + "step": 6994 + }, + { + "epoch": 0.6745419479267116, + "grad_norm": 2.4389851093292236, + "learning_rate": 1.1974687375362446e-05, + "loss": 5.5311, + "step": 6995 + }, + { + "epoch": 0.6746383799421408, + "grad_norm": 2.2942705154418945, + "learning_rate": 1.1968223403742975e-05, + "loss": 5.1596, + "step": 6996 + }, + { + "epoch": 0.6747348119575699, + "grad_norm": 3.799131393432617, + "learning_rate": 1.1961760628163905e-05, + "loss": 5.1788, + "step": 6997 + }, + { + "epoch": 0.674831243972999, + "grad_norm": 2.4706170558929443, + "learning_rate": 1.1955299049218412e-05, + "loss": 5.2485, + "step": 6998 + }, + { + "epoch": 0.6749276759884282, + "grad_norm": 2.662166118621826, + "learning_rate": 1.1948838667499512e-05, + "loss": 5.3836, + "step": 6999 + }, + { + "epoch": 0.6750241080038573, + "grad_norm": 3.4241604804992676, + "learning_rate": 1.1942379483600133e-05, + "loss": 5.7859, + "step": 7000 + }, + { + "epoch": 0.6751205400192865, + "grad_norm": 3.436577320098877, + "learning_rate": 1.1935921498113094e-05, + "loss": 5.2209, + "step": 7001 + }, + { + "epoch": 0.6752169720347155, + "grad_norm": 2.7246155738830566, + "learning_rate": 1.1929464711631103e-05, + "loss": 5.4974, + "step": 7002 + }, + { + "epoch": 0.6753134040501446, + "grad_norm": 3.475714683532715, + "learning_rate": 1.192300912474675e-05, + "loss": 5.5533, + "step": 7003 + }, + { + "epoch": 0.6754098360655738, + "grad_norm": 2.9117653369903564, + "learning_rate": 1.1916554738052527e-05, + "loss": 5.4066, + "step": 7004 + }, + { + "epoch": 0.6755062680810029, + "grad_norm": 4.293581485748291, + "learning_rate": 1.1910101552140806e-05, + "loss": 5.4361, + "step": 7005 + }, + { + "epoch": 0.675602700096432, + "grad_norm": 4.343626976013184, + "learning_rate": 1.1903649567603855e-05, + "loss": 5.4528, + "step": 7006 + }, + { + "epoch": 0.6756991321118612, + "grad_norm": 3.1115429401397705, + "learning_rate": 1.1897198785033837e-05, + "loss": 5.4669, + "step": 7007 + }, + { + "epoch": 0.6757955641272902, + "grad_norm": 2.8887596130371094, + "learning_rate": 1.1890749205022767e-05, + "loss": 5.5823, + "step": 7008 + }, + { + "epoch": 0.6758919961427193, + "grad_norm": 3.212954044342041, + "learning_rate": 1.1884300828162618e-05, + "loss": 5.4309, + "step": 7009 + }, + { + "epoch": 0.6759884281581485, + "grad_norm": 6.168707847595215, + "learning_rate": 1.1877853655045188e-05, + "loss": 5.504, + "step": 7010 + }, + { + "epoch": 0.6760848601735776, + "grad_norm": 4.119593143463135, + "learning_rate": 1.1871407686262201e-05, + "loss": 5.3276, + "step": 7011 + }, + { + "epoch": 0.6761812921890068, + "grad_norm": 2.3660266399383545, + "learning_rate": 1.1864962922405254e-05, + "loss": 5.462, + "step": 7012 + }, + { + "epoch": 0.6762777242044359, + "grad_norm": 1.5978082418441772, + "learning_rate": 1.1858519364065845e-05, + "loss": 5.5376, + "step": 7013 + }, + { + "epoch": 0.676374156219865, + "grad_norm": 3.8503258228302, + "learning_rate": 1.1852077011835356e-05, + "loss": 5.2974, + "step": 7014 + }, + { + "epoch": 0.6764705882352942, + "grad_norm": 3.544651746749878, + "learning_rate": 1.1845635866305064e-05, + "loss": 5.4749, + "step": 7015 + }, + { + "epoch": 0.6765670202507232, + "grad_norm": 3.0233986377716064, + "learning_rate": 1.1839195928066102e-05, + "loss": 5.5197, + "step": 7016 + }, + { + "epoch": 0.6766634522661523, + "grad_norm": 2.646801710128784, + "learning_rate": 1.183275719770956e-05, + "loss": 5.469, + "step": 7017 + }, + { + "epoch": 0.6767598842815815, + "grad_norm": 3.0958926677703857, + "learning_rate": 1.1826319675826348e-05, + "loss": 5.662, + "step": 7018 + }, + { + "epoch": 0.6768563162970106, + "grad_norm": 3.709735155105591, + "learning_rate": 1.18198833630073e-05, + "loss": 5.7995, + "step": 7019 + }, + { + "epoch": 0.6769527483124397, + "grad_norm": 2.8245043754577637, + "learning_rate": 1.1813448259843136e-05, + "loss": 5.6686, + "step": 7020 + }, + { + "epoch": 0.6770491803278689, + "grad_norm": 2.3676187992095947, + "learning_rate": 1.1807014366924456e-05, + "loss": 5.3152, + "step": 7021 + }, + { + "epoch": 0.677145612343298, + "grad_norm": 2.5054714679718018, + "learning_rate": 1.1800581684841765e-05, + "loss": 5.4671, + "step": 7022 + }, + { + "epoch": 0.6772420443587271, + "grad_norm": 2.7652485370635986, + "learning_rate": 1.179415021418544e-05, + "loss": 5.482, + "step": 7023 + }, + { + "epoch": 0.6773384763741562, + "grad_norm": 3.353346347808838, + "learning_rate": 1.1787719955545753e-05, + "loss": 5.5313, + "step": 7024 + }, + { + "epoch": 0.6774349083895853, + "grad_norm": 2.9117844104766846, + "learning_rate": 1.1781290909512868e-05, + "loss": 5.5142, + "step": 7025 + }, + { + "epoch": 0.6775313404050145, + "grad_norm": 2.1124751567840576, + "learning_rate": 1.1774863076676843e-05, + "loss": 5.5284, + "step": 7026 + }, + { + "epoch": 0.6776277724204436, + "grad_norm": 2.9618940353393555, + "learning_rate": 1.1768436457627587e-05, + "loss": 5.3141, + "step": 7027 + }, + { + "epoch": 0.6777242044358727, + "grad_norm": 2.1299233436584473, + "learning_rate": 1.1762011052954969e-05, + "loss": 5.4345, + "step": 7028 + }, + { + "epoch": 0.6778206364513019, + "grad_norm": 2.5911777019500732, + "learning_rate": 1.1755586863248675e-05, + "loss": 5.5708, + "step": 7029 + }, + { + "epoch": 0.6779170684667309, + "grad_norm": 2.7959320545196533, + "learning_rate": 1.1749163889098314e-05, + "loss": 5.4441, + "step": 7030 + }, + { + "epoch": 0.67801350048216, + "grad_norm": 2.295901298522949, + "learning_rate": 1.1742742131093382e-05, + "loss": 5.4499, + "step": 7031 + }, + { + "epoch": 0.6781099324975892, + "grad_norm": 1.8151379823684692, + "learning_rate": 1.173632158982327e-05, + "loss": 5.4214, + "step": 7032 + }, + { + "epoch": 0.6782063645130183, + "grad_norm": 1.764235258102417, + "learning_rate": 1.1729902265877221e-05, + "loss": 5.3755, + "step": 7033 + }, + { + "epoch": 0.6783027965284475, + "grad_norm": 2.211996555328369, + "learning_rate": 1.1723484159844428e-05, + "loss": 5.5565, + "step": 7034 + }, + { + "epoch": 0.6783992285438766, + "grad_norm": 2.265023708343506, + "learning_rate": 1.1717067272313898e-05, + "loss": 5.2301, + "step": 7035 + }, + { + "epoch": 0.6784956605593057, + "grad_norm": 1.6000080108642578, + "learning_rate": 1.1710651603874603e-05, + "loss": 5.3782, + "step": 7036 + }, + { + "epoch": 0.6785920925747349, + "grad_norm": 2.4028642177581787, + "learning_rate": 1.1704237155115338e-05, + "loss": 5.1387, + "step": 7037 + }, + { + "epoch": 0.6786885245901639, + "grad_norm": 2.765951633453369, + "learning_rate": 1.1697823926624823e-05, + "loss": 5.2071, + "step": 7038 + }, + { + "epoch": 0.678784956605593, + "grad_norm": 2.7714626789093018, + "learning_rate": 1.1691411918991657e-05, + "loss": 5.1491, + "step": 7039 + }, + { + "epoch": 0.6788813886210222, + "grad_norm": 1.908364176750183, + "learning_rate": 1.1685001132804324e-05, + "loss": 5.2315, + "step": 7040 + }, + { + "epoch": 0.6789778206364513, + "grad_norm": 2.356355667114258, + "learning_rate": 1.1678591568651197e-05, + "loss": 5.3051, + "step": 7041 + }, + { + "epoch": 0.6790742526518804, + "grad_norm": 2.6870665550231934, + "learning_rate": 1.1672183227120543e-05, + "loss": 5.3284, + "step": 7042 + }, + { + "epoch": 0.6791706846673096, + "grad_norm": 2.323934555053711, + "learning_rate": 1.1665776108800517e-05, + "loss": 5.4137, + "step": 7043 + }, + { + "epoch": 0.6792671166827386, + "grad_norm": 3.2246406078338623, + "learning_rate": 1.1659370214279127e-05, + "loss": 5.3045, + "step": 7044 + }, + { + "epoch": 0.6793635486981678, + "grad_norm": 3.2784321308135986, + "learning_rate": 1.1652965544144339e-05, + "loss": 5.0288, + "step": 7045 + }, + { + "epoch": 0.6794599807135969, + "grad_norm": 2.7264089584350586, + "learning_rate": 1.1646562098983937e-05, + "loss": 5.1897, + "step": 7046 + }, + { + "epoch": 0.679556412729026, + "grad_norm": 3.460571765899658, + "learning_rate": 1.1640159879385632e-05, + "loss": 4.8846, + "step": 7047 + }, + { + "epoch": 0.6796528447444552, + "grad_norm": 2.3212995529174805, + "learning_rate": 1.1633758885937007e-05, + "loss": 5.47, + "step": 7048 + }, + { + "epoch": 0.6797492767598843, + "grad_norm": 2.4945034980773926, + "learning_rate": 1.1627359119225541e-05, + "loss": 5.6888, + "step": 7049 + }, + { + "epoch": 0.6798457087753134, + "grad_norm": 2.76926326751709, + "learning_rate": 1.1620960579838596e-05, + "loss": 5.5011, + "step": 7050 + }, + { + "epoch": 0.6799421407907426, + "grad_norm": 2.039647340774536, + "learning_rate": 1.1614563268363432e-05, + "loss": 5.396, + "step": 7051 + }, + { + "epoch": 0.6800385728061716, + "grad_norm": 2.186556339263916, + "learning_rate": 1.1608167185387158e-05, + "loss": 5.3263, + "step": 7052 + }, + { + "epoch": 0.6801350048216007, + "grad_norm": 2.978381395339966, + "learning_rate": 1.1601772331496835e-05, + "loss": 5.7436, + "step": 7053 + }, + { + "epoch": 0.6802314368370299, + "grad_norm": 2.6850454807281494, + "learning_rate": 1.1595378707279347e-05, + "loss": 5.6804, + "step": 7054 + }, + { + "epoch": 0.680327868852459, + "grad_norm": 1.9555221796035767, + "learning_rate": 1.1588986313321504e-05, + "loss": 5.6346, + "step": 7055 + }, + { + "epoch": 0.6804243008678882, + "grad_norm": 2.5156452655792236, + "learning_rate": 1.158259515020999e-05, + "loss": 5.4797, + "step": 7056 + }, + { + "epoch": 0.6805207328833173, + "grad_norm": 2.5600810050964355, + "learning_rate": 1.157620521853138e-05, + "loss": 5.3276, + "step": 7057 + }, + { + "epoch": 0.6806171648987464, + "grad_norm": 3.3832902908325195, + "learning_rate": 1.1569816518872132e-05, + "loss": 5.3928, + "step": 7058 + }, + { + "epoch": 0.6807135969141755, + "grad_norm": 2.6359550952911377, + "learning_rate": 1.1563429051818605e-05, + "loss": 5.2833, + "step": 7059 + }, + { + "epoch": 0.6808100289296046, + "grad_norm": 2.388033866882324, + "learning_rate": 1.1557042817957001e-05, + "loss": 5.612, + "step": 7060 + }, + { + "epoch": 0.6809064609450337, + "grad_norm": 2.029085636138916, + "learning_rate": 1.155065781787348e-05, + "loss": 5.3242, + "step": 7061 + }, + { + "epoch": 0.6810028929604629, + "grad_norm": 2.433626651763916, + "learning_rate": 1.1544274052154025e-05, + "loss": 5.633, + "step": 7062 + }, + { + "epoch": 0.681099324975892, + "grad_norm": 3.453988552093506, + "learning_rate": 1.1537891521384533e-05, + "loss": 5.3766, + "step": 7063 + }, + { + "epoch": 0.6811957569913211, + "grad_norm": 2.2488515377044678, + "learning_rate": 1.153151022615079e-05, + "loss": 5.4934, + "step": 7064 + }, + { + "epoch": 0.6812921890067503, + "grad_norm": 2.4859695434570312, + "learning_rate": 1.1525130167038461e-05, + "loss": 5.4925, + "step": 7065 + }, + { + "epoch": 0.6813886210221793, + "grad_norm": 3.1206471920013428, + "learning_rate": 1.15187513446331e-05, + "loss": 5.2351, + "step": 7066 + }, + { + "epoch": 0.6814850530376085, + "grad_norm": 3.6514699459075928, + "learning_rate": 1.1512373759520149e-05, + "loss": 5.4355, + "step": 7067 + }, + { + "epoch": 0.6815814850530376, + "grad_norm": 2.389129161834717, + "learning_rate": 1.1505997412284933e-05, + "loss": 5.6208, + "step": 7068 + }, + { + "epoch": 0.6816779170684667, + "grad_norm": 3.3064095973968506, + "learning_rate": 1.1499622303512663e-05, + "loss": 5.1438, + "step": 7069 + }, + { + "epoch": 0.6817743490838959, + "grad_norm": 2.8970706462860107, + "learning_rate": 1.1493248433788456e-05, + "loss": 5.4515, + "step": 7070 + }, + { + "epoch": 0.681870781099325, + "grad_norm": 4.010075092315674, + "learning_rate": 1.1486875803697261e-05, + "loss": 5.3423, + "step": 7071 + }, + { + "epoch": 0.6819672131147541, + "grad_norm": 2.7854998111724854, + "learning_rate": 1.1480504413823995e-05, + "loss": 5.334, + "step": 7072 + }, + { + "epoch": 0.6820636451301832, + "grad_norm": 2.347609281539917, + "learning_rate": 1.1474134264753384e-05, + "loss": 5.3829, + "step": 7073 + }, + { + "epoch": 0.6821600771456123, + "grad_norm": 2.789285898208618, + "learning_rate": 1.146776535707008e-05, + "loss": 5.4034, + "step": 7074 + }, + { + "epoch": 0.6822565091610414, + "grad_norm": 2.4542946815490723, + "learning_rate": 1.1461397691358619e-05, + "loss": 5.5371, + "step": 7075 + }, + { + "epoch": 0.6823529411764706, + "grad_norm": 3.0182063579559326, + "learning_rate": 1.145503126820342e-05, + "loss": 5.3196, + "step": 7076 + }, + { + "epoch": 0.6824493731918997, + "grad_norm": 3.0601065158843994, + "learning_rate": 1.1448666088188764e-05, + "loss": 5.502, + "step": 7077 + }, + { + "epoch": 0.6825458052073289, + "grad_norm": 2.45050311088562, + "learning_rate": 1.1442302151898875e-05, + "loss": 5.3271, + "step": 7078 + }, + { + "epoch": 0.682642237222758, + "grad_norm": 1.8472073078155518, + "learning_rate": 1.1435939459917787e-05, + "loss": 5.5239, + "step": 7079 + }, + { + "epoch": 0.682738669238187, + "grad_norm": 1.9766457080841064, + "learning_rate": 1.1429578012829503e-05, + "loss": 5.433, + "step": 7080 + }, + { + "epoch": 0.6828351012536162, + "grad_norm": 3.0862200260162354, + "learning_rate": 1.1423217811217835e-05, + "loss": 5.3545, + "step": 7081 + }, + { + "epoch": 0.6829315332690453, + "grad_norm": 4.30026912689209, + "learning_rate": 1.1416858855666523e-05, + "loss": 5.3425, + "step": 7082 + }, + { + "epoch": 0.6830279652844744, + "grad_norm": 4.04742956161499, + "learning_rate": 1.1410501146759191e-05, + "loss": 5.5211, + "step": 7083 + }, + { + "epoch": 0.6831243972999036, + "grad_norm": 3.079000949859619, + "learning_rate": 1.1404144685079338e-05, + "loss": 5.4895, + "step": 7084 + }, + { + "epoch": 0.6832208293153327, + "grad_norm": 2.657527446746826, + "learning_rate": 1.1397789471210349e-05, + "loss": 5.3409, + "step": 7085 + }, + { + "epoch": 0.6833172613307618, + "grad_norm": 3.4195172786712646, + "learning_rate": 1.1391435505735504e-05, + "loss": 5.3461, + "step": 7086 + }, + { + "epoch": 0.683413693346191, + "grad_norm": 3.7518844604492188, + "learning_rate": 1.1385082789237966e-05, + "loss": 5.3335, + "step": 7087 + }, + { + "epoch": 0.68351012536162, + "grad_norm": 3.262810230255127, + "learning_rate": 1.1378731322300754e-05, + "loss": 5.5094, + "step": 7088 + }, + { + "epoch": 0.6836065573770492, + "grad_norm": 2.4700028896331787, + "learning_rate": 1.1372381105506835e-05, + "loss": 5.5606, + "step": 7089 + }, + { + "epoch": 0.6837029893924783, + "grad_norm": 2.5179691314697266, + "learning_rate": 1.1366032139438987e-05, + "loss": 5.541, + "step": 7090 + }, + { + "epoch": 0.6837994214079074, + "grad_norm": 2.453160285949707, + "learning_rate": 1.1359684424679945e-05, + "loss": 5.5197, + "step": 7091 + }, + { + "epoch": 0.6838958534233366, + "grad_norm": 2.8388781547546387, + "learning_rate": 1.1353337961812271e-05, + "loss": 5.7042, + "step": 7092 + }, + { + "epoch": 0.6839922854387657, + "grad_norm": 2.6537041664123535, + "learning_rate": 1.1346992751418441e-05, + "loss": 5.5811, + "step": 7093 + }, + { + "epoch": 0.6840887174541948, + "grad_norm": 2.361844062805176, + "learning_rate": 1.1340648794080815e-05, + "loss": 5.456, + "step": 7094 + }, + { + "epoch": 0.6841851494696239, + "grad_norm": 2.8745713233947754, + "learning_rate": 1.1334306090381638e-05, + "loss": 5.5271, + "step": 7095 + }, + { + "epoch": 0.684281581485053, + "grad_norm": 2.1161341667175293, + "learning_rate": 1.1327964640903011e-05, + "loss": 5.5417, + "step": 7096 + }, + { + "epoch": 0.6843780135004821, + "grad_norm": 2.5407516956329346, + "learning_rate": 1.1321624446226978e-05, + "loss": 5.3923, + "step": 7097 + }, + { + "epoch": 0.6844744455159113, + "grad_norm": 2.412060260772705, + "learning_rate": 1.1315285506935411e-05, + "loss": 5.5306, + "step": 7098 + }, + { + "epoch": 0.6845708775313404, + "grad_norm": 3.721747636795044, + "learning_rate": 1.1308947823610094e-05, + "loss": 5.454, + "step": 7099 + }, + { + "epoch": 0.6846673095467696, + "grad_norm": 2.950775623321533, + "learning_rate": 1.1302611396832699e-05, + "loss": 5.3348, + "step": 7100 + }, + { + "epoch": 0.6847637415621987, + "grad_norm": 2.4727022647857666, + "learning_rate": 1.1296276227184768e-05, + "loss": 5.4146, + "step": 7101 + }, + { + "epoch": 0.6848601735776277, + "grad_norm": 2.8914990425109863, + "learning_rate": 1.128994231524774e-05, + "loss": 5.2633, + "step": 7102 + }, + { + "epoch": 0.6849566055930569, + "grad_norm": 4.048638820648193, + "learning_rate": 1.128360966160294e-05, + "loss": 5.2873, + "step": 7103 + }, + { + "epoch": 0.685053037608486, + "grad_norm": 4.125576496124268, + "learning_rate": 1.1277278266831546e-05, + "loss": 5.2936, + "step": 7104 + }, + { + "epoch": 0.6851494696239151, + "grad_norm": 4.571710109710693, + "learning_rate": 1.1270948131514681e-05, + "loss": 5.4553, + "step": 7105 + }, + { + "epoch": 0.6852459016393443, + "grad_norm": 2.9348912239074707, + "learning_rate": 1.1264619256233288e-05, + "loss": 5.3461, + "step": 7106 + }, + { + "epoch": 0.6853423336547734, + "grad_norm": 2.5210227966308594, + "learning_rate": 1.1258291641568235e-05, + "loss": 5.4004, + "step": 7107 + }, + { + "epoch": 0.6854387656702025, + "grad_norm": 2.9410245418548584, + "learning_rate": 1.1251965288100264e-05, + "loss": 5.4595, + "step": 7108 + }, + { + "epoch": 0.6855351976856316, + "grad_norm": 2.827204465866089, + "learning_rate": 1.1245640196409995e-05, + "loss": 5.4455, + "step": 7109 + }, + { + "epoch": 0.6856316297010607, + "grad_norm": 2.219146490097046, + "learning_rate": 1.123931636707794e-05, + "loss": 5.3608, + "step": 7110 + }, + { + "epoch": 0.6857280617164899, + "grad_norm": 2.9051196575164795, + "learning_rate": 1.1232993800684493e-05, + "loss": 5.5675, + "step": 7111 + }, + { + "epoch": 0.685824493731919, + "grad_norm": 2.908271551132202, + "learning_rate": 1.1226672497809932e-05, + "loss": 5.5952, + "step": 7112 + }, + { + "epoch": 0.6859209257473481, + "grad_norm": 2.597921133041382, + "learning_rate": 1.1220352459034414e-05, + "loss": 5.3117, + "step": 7113 + }, + { + "epoch": 0.6860173577627773, + "grad_norm": 3.6279399394989014, + "learning_rate": 1.1214033684938e-05, + "loss": 5.4088, + "step": 7114 + }, + { + "epoch": 0.6861137897782064, + "grad_norm": 2.2406601905822754, + "learning_rate": 1.1207716176100586e-05, + "loss": 5.547, + "step": 7115 + }, + { + "epoch": 0.6862102217936354, + "grad_norm": 3.0992929935455322, + "learning_rate": 1.1201399933102021e-05, + "loss": 5.4775, + "step": 7116 + }, + { + "epoch": 0.6863066538090646, + "grad_norm": 3.5180516242980957, + "learning_rate": 1.1195084956521985e-05, + "loss": 5.5324, + "step": 7117 + }, + { + "epoch": 0.6864030858244937, + "grad_norm": 2.9650139808654785, + "learning_rate": 1.1188771246940055e-05, + "loss": 5.4545, + "step": 7118 + }, + { + "epoch": 0.6864995178399228, + "grad_norm": 2.1172971725463867, + "learning_rate": 1.1182458804935703e-05, + "loss": 5.5551, + "step": 7119 + }, + { + "epoch": 0.686595949855352, + "grad_norm": 1.7096775770187378, + "learning_rate": 1.1176147631088282e-05, + "loss": 5.474, + "step": 7120 + }, + { + "epoch": 0.6866923818707811, + "grad_norm": 1.9937522411346436, + "learning_rate": 1.1169837725977003e-05, + "loss": 5.4281, + "step": 7121 + }, + { + "epoch": 0.6867888138862103, + "grad_norm": 2.036029100418091, + "learning_rate": 1.116352909018101e-05, + "loss": 5.5074, + "step": 7122 + }, + { + "epoch": 0.6868852459016394, + "grad_norm": 1.7333104610443115, + "learning_rate": 1.115722172427927e-05, + "loss": 5.3902, + "step": 7123 + }, + { + "epoch": 0.6869816779170684, + "grad_norm": 2.5861527919769287, + "learning_rate": 1.1150915628850702e-05, + "loss": 5.343, + "step": 7124 + }, + { + "epoch": 0.6870781099324976, + "grad_norm": 1.9037357568740845, + "learning_rate": 1.1144610804474043e-05, + "loss": 5.5785, + "step": 7125 + }, + { + "epoch": 0.6871745419479267, + "grad_norm": 2.0857045650482178, + "learning_rate": 1.113830725172795e-05, + "loss": 5.6343, + "step": 7126 + }, + { + "epoch": 0.6872709739633558, + "grad_norm": 1.9279065132141113, + "learning_rate": 1.1132004971190957e-05, + "loss": 5.3977, + "step": 7127 + }, + { + "epoch": 0.687367405978785, + "grad_norm": 1.9865424633026123, + "learning_rate": 1.1125703963441477e-05, + "loss": 5.3217, + "step": 7128 + }, + { + "epoch": 0.6874638379942141, + "grad_norm": 2.22934627532959, + "learning_rate": 1.1119404229057814e-05, + "loss": 5.5937, + "step": 7129 + }, + { + "epoch": 0.6875602700096431, + "grad_norm": 2.146176815032959, + "learning_rate": 1.1113105768618146e-05, + "loss": 5.283, + "step": 7130 + }, + { + "epoch": 0.6876567020250723, + "grad_norm": 1.9677523374557495, + "learning_rate": 1.1106808582700545e-05, + "loss": 5.4851, + "step": 7131 + }, + { + "epoch": 0.6877531340405014, + "grad_norm": 2.4981863498687744, + "learning_rate": 1.1100512671882937e-05, + "loss": 5.3756, + "step": 7132 + }, + { + "epoch": 0.6878495660559306, + "grad_norm": 2.2695813179016113, + "learning_rate": 1.1094218036743187e-05, + "loss": 5.4703, + "step": 7133 + }, + { + "epoch": 0.6879459980713597, + "grad_norm": 2.890390396118164, + "learning_rate": 1.1087924677858968e-05, + "loss": 5.4885, + "step": 7134 + }, + { + "epoch": 0.6880424300867888, + "grad_norm": 1.7045470476150513, + "learning_rate": 1.1081632595807922e-05, + "loss": 5.6359, + "step": 7135 + }, + { + "epoch": 0.688138862102218, + "grad_norm": 2.3144586086273193, + "learning_rate": 1.1075341791167495e-05, + "loss": 5.5961, + "step": 7136 + }, + { + "epoch": 0.6882352941176471, + "grad_norm": 3.716557025909424, + "learning_rate": 1.106905226451506e-05, + "loss": 5.5087, + "step": 7137 + }, + { + "epoch": 0.6883317261330761, + "grad_norm": 2.181767463684082, + "learning_rate": 1.1062764016427862e-05, + "loss": 5.542, + "step": 7138 + }, + { + "epoch": 0.6884281581485053, + "grad_norm": 2.7513723373413086, + "learning_rate": 1.1056477047483038e-05, + "loss": 5.436, + "step": 7139 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 2.5812203884124756, + "learning_rate": 1.1050191358257569e-05, + "loss": 5.4201, + "step": 7140 + }, + { + "epoch": 0.6886210221793635, + "grad_norm": 2.3955771923065186, + "learning_rate": 1.1043906949328387e-05, + "loss": 5.464, + "step": 7141 + }, + { + "epoch": 0.6887174541947927, + "grad_norm": 2.711357593536377, + "learning_rate": 1.1037623821272238e-05, + "loss": 5.5062, + "step": 7142 + }, + { + "epoch": 0.6888138862102218, + "grad_norm": 2.2324063777923584, + "learning_rate": 1.1031341974665794e-05, + "loss": 5.3938, + "step": 7143 + }, + { + "epoch": 0.688910318225651, + "grad_norm": 2.890059232711792, + "learning_rate": 1.1025061410085586e-05, + "loss": 5.5413, + "step": 7144 + }, + { + "epoch": 0.68900675024108, + "grad_norm": 2.1998019218444824, + "learning_rate": 1.1018782128108044e-05, + "loss": 5.5156, + "step": 7145 + }, + { + "epoch": 0.6891031822565091, + "grad_norm": 2.04249906539917, + "learning_rate": 1.1012504129309467e-05, + "loss": 5.5248, + "step": 7146 + }, + { + "epoch": 0.6891996142719383, + "grad_norm": 1.6148933172225952, + "learning_rate": 1.1006227414266049e-05, + "loss": 5.5175, + "step": 7147 + }, + { + "epoch": 0.6892960462873674, + "grad_norm": 1.880799651145935, + "learning_rate": 1.0999951983553854e-05, + "loss": 5.4277, + "step": 7148 + }, + { + "epoch": 0.6893924783027965, + "grad_norm": 1.5335559844970703, + "learning_rate": 1.0993677837748834e-05, + "loss": 5.3458, + "step": 7149 + }, + { + "epoch": 0.6894889103182257, + "grad_norm": 1.889176607131958, + "learning_rate": 1.0987404977426832e-05, + "loss": 5.3546, + "step": 7150 + }, + { + "epoch": 0.6895853423336548, + "grad_norm": 1.9052412509918213, + "learning_rate": 1.0981133403163548e-05, + "loss": 5.3962, + "step": 7151 + }, + { + "epoch": 0.6896817743490838, + "grad_norm": 1.8241454362869263, + "learning_rate": 1.0974863115534584e-05, + "loss": 5.5016, + "step": 7152 + }, + { + "epoch": 0.689778206364513, + "grad_norm": 2.1197712421417236, + "learning_rate": 1.0968594115115422e-05, + "loss": 5.3958, + "step": 7153 + }, + { + "epoch": 0.6898746383799421, + "grad_norm": 1.7306097745895386, + "learning_rate": 1.0962326402481421e-05, + "loss": 5.4061, + "step": 7154 + }, + { + "epoch": 0.6899710703953713, + "grad_norm": 2.431471586227417, + "learning_rate": 1.0956059978207825e-05, + "loss": 5.3283, + "step": 7155 + }, + { + "epoch": 0.6900675024108004, + "grad_norm": 2.717780113220215, + "learning_rate": 1.094979484286976e-05, + "loss": 5.2663, + "step": 7156 + }, + { + "epoch": 0.6901639344262295, + "grad_norm": 2.313694477081299, + "learning_rate": 1.0943530997042231e-05, + "loss": 5.3312, + "step": 7157 + }, + { + "epoch": 0.6902603664416587, + "grad_norm": 1.747979760169983, + "learning_rate": 1.0937268441300136e-05, + "loss": 5.3949, + "step": 7158 + }, + { + "epoch": 0.6903567984570878, + "grad_norm": 1.767856478691101, + "learning_rate": 1.0931007176218213e-05, + "loss": 5.4865, + "step": 7159 + }, + { + "epoch": 0.6904532304725168, + "grad_norm": 3.2949459552764893, + "learning_rate": 1.0924747202371158e-05, + "loss": 5.5374, + "step": 7160 + }, + { + "epoch": 0.690549662487946, + "grad_norm": 2.4169275760650635, + "learning_rate": 1.091848852033347e-05, + "loss": 5.1659, + "step": 7161 + }, + { + "epoch": 0.6906460945033751, + "grad_norm": 1.7880315780639648, + "learning_rate": 1.0912231130679576e-05, + "loss": 5.6305, + "step": 7162 + }, + { + "epoch": 0.6907425265188042, + "grad_norm": 2.6155221462249756, + "learning_rate": 1.0905975033983767e-05, + "loss": 5.4345, + "step": 7163 + }, + { + "epoch": 0.6908389585342334, + "grad_norm": 2.988689422607422, + "learning_rate": 1.0899720230820235e-05, + "loss": 5.4111, + "step": 7164 + }, + { + "epoch": 0.6909353905496625, + "grad_norm": 2.7952897548675537, + "learning_rate": 1.0893466721763004e-05, + "loss": 5.4627, + "step": 7165 + }, + { + "epoch": 0.6910318225650917, + "grad_norm": 2.3607676029205322, + "learning_rate": 1.0887214507386054e-05, + "loss": 5.2644, + "step": 7166 + }, + { + "epoch": 0.6911282545805207, + "grad_norm": 1.7029039859771729, + "learning_rate": 1.0880963588263169e-05, + "loss": 5.4554, + "step": 7167 + }, + { + "epoch": 0.6912246865959498, + "grad_norm": 2.8888535499572754, + "learning_rate": 1.0874713964968085e-05, + "loss": 5.6362, + "step": 7168 + }, + { + "epoch": 0.691321118611379, + "grad_norm": 2.3094913959503174, + "learning_rate": 1.086846563807436e-05, + "loss": 5.561, + "step": 7169 + }, + { + "epoch": 0.6914175506268081, + "grad_norm": 2.650629758834839, + "learning_rate": 1.0862218608155467e-05, + "loss": 5.573, + "step": 7170 + }, + { + "epoch": 0.6915139826422372, + "grad_norm": 2.14646053314209, + "learning_rate": 1.0855972875784748e-05, + "loss": 5.5215, + "step": 7171 + }, + { + "epoch": 0.6916104146576664, + "grad_norm": 2.2267186641693115, + "learning_rate": 1.0849728441535428e-05, + "loss": 5.3602, + "step": 7172 + }, + { + "epoch": 0.6917068466730955, + "grad_norm": 2.2100679874420166, + "learning_rate": 1.084348530598062e-05, + "loss": 5.5478, + "step": 7173 + }, + { + "epoch": 0.6918032786885245, + "grad_norm": 2.6592109203338623, + "learning_rate": 1.0837243469693301e-05, + "loss": 5.2882, + "step": 7174 + }, + { + "epoch": 0.6918997107039537, + "grad_norm": 2.7409870624542236, + "learning_rate": 1.0831002933246358e-05, + "loss": 5.5187, + "step": 7175 + }, + { + "epoch": 0.6919961427193828, + "grad_norm": 1.8286174535751343, + "learning_rate": 1.0824763697212506e-05, + "loss": 5.4648, + "step": 7176 + }, + { + "epoch": 0.692092574734812, + "grad_norm": 1.915269136428833, + "learning_rate": 1.0818525762164417e-05, + "loss": 5.479, + "step": 7177 + }, + { + "epoch": 0.6921890067502411, + "grad_norm": 2.6292576789855957, + "learning_rate": 1.0812289128674557e-05, + "loss": 5.3304, + "step": 7178 + }, + { + "epoch": 0.6922854387656702, + "grad_norm": 2.367438793182373, + "learning_rate": 1.0806053797315357e-05, + "loss": 5.514, + "step": 7179 + }, + { + "epoch": 0.6923818707810994, + "grad_norm": 3.2169430255889893, + "learning_rate": 1.0799819768659064e-05, + "loss": 5.0521, + "step": 7180 + }, + { + "epoch": 0.6924783027965284, + "grad_norm": 2.782304286956787, + "learning_rate": 1.0793587043277831e-05, + "loss": 5.5573, + "step": 7181 + }, + { + "epoch": 0.6925747348119575, + "grad_norm": 2.108048439025879, + "learning_rate": 1.0787355621743698e-05, + "loss": 5.4892, + "step": 7182 + }, + { + "epoch": 0.6926711668273867, + "grad_norm": 2.4774787425994873, + "learning_rate": 1.0781125504628578e-05, + "loss": 5.5241, + "step": 7183 + }, + { + "epoch": 0.6927675988428158, + "grad_norm": 2.741900682449341, + "learning_rate": 1.077489669250424e-05, + "loss": 5.4945, + "step": 7184 + }, + { + "epoch": 0.6928640308582449, + "grad_norm": 2.586005210876465, + "learning_rate": 1.0768669185942398e-05, + "loss": 5.4829, + "step": 7185 + }, + { + "epoch": 0.6929604628736741, + "grad_norm": 2.1218252182006836, + "learning_rate": 1.0762442985514571e-05, + "loss": 5.4125, + "step": 7186 + }, + { + "epoch": 0.6930568948891032, + "grad_norm": 1.9800522327423096, + "learning_rate": 1.0756218091792203e-05, + "loss": 5.3091, + "step": 7187 + }, + { + "epoch": 0.6931533269045324, + "grad_norm": 2.14336895942688, + "learning_rate": 1.0749994505346606e-05, + "loss": 5.458, + "step": 7188 + }, + { + "epoch": 0.6932497589199614, + "grad_norm": 1.8427777290344238, + "learning_rate": 1.0743772226748977e-05, + "loss": 5.452, + "step": 7189 + }, + { + "epoch": 0.6933461909353905, + "grad_norm": 2.5916073322296143, + "learning_rate": 1.0737551256570386e-05, + "loss": 5.4417, + "step": 7190 + }, + { + "epoch": 0.6934426229508197, + "grad_norm": 2.761638879776001, + "learning_rate": 1.0731331595381783e-05, + "loss": 5.4298, + "step": 7191 + }, + { + "epoch": 0.6935390549662488, + "grad_norm": 2.1043412685394287, + "learning_rate": 1.0725113243754009e-05, + "loss": 5.4904, + "step": 7192 + }, + { + "epoch": 0.6936354869816779, + "grad_norm": 2.4516735076904297, + "learning_rate": 1.0718896202257769e-05, + "loss": 5.4605, + "step": 7193 + }, + { + "epoch": 0.6937319189971071, + "grad_norm": 2.16814923286438, + "learning_rate": 1.0712680471463668e-05, + "loss": 5.1716, + "step": 7194 + }, + { + "epoch": 0.6938283510125361, + "grad_norm": 3.11206316947937, + "learning_rate": 1.0706466051942151e-05, + "loss": 5.3787, + "step": 7195 + }, + { + "epoch": 0.6939247830279652, + "grad_norm": 3.2182655334472656, + "learning_rate": 1.0700252944263605e-05, + "loss": 5.3196, + "step": 7196 + }, + { + "epoch": 0.6940212150433944, + "grad_norm": 2.531215190887451, + "learning_rate": 1.0694041148998235e-05, + "loss": 5.2716, + "step": 7197 + }, + { + "epoch": 0.6941176470588235, + "grad_norm": 2.959336519241333, + "learning_rate": 1.0687830666716158e-05, + "loss": 5.4314, + "step": 7198 + }, + { + "epoch": 0.6942140790742527, + "grad_norm": 2.5293660163879395, + "learning_rate": 1.068162149798737e-05, + "loss": 5.5073, + "step": 7199 + }, + { + "epoch": 0.6943105110896818, + "grad_norm": 2.7847416400909424, + "learning_rate": 1.0675413643381737e-05, + "loss": 5.3268, + "step": 7200 + }, + { + "epoch": 0.6944069431051109, + "grad_norm": 3.4784765243530273, + "learning_rate": 1.0669207103469008e-05, + "loss": 5.4128, + "step": 7201 + }, + { + "epoch": 0.6945033751205401, + "grad_norm": 2.409752130508423, + "learning_rate": 1.0663001878818826e-05, + "loss": 5.4828, + "step": 7202 + }, + { + "epoch": 0.6945998071359691, + "grad_norm": 2.079927682876587, + "learning_rate": 1.0656797970000662e-05, + "loss": 5.429, + "step": 7203 + }, + { + "epoch": 0.6946962391513982, + "grad_norm": 2.8917627334594727, + "learning_rate": 1.065059537758395e-05, + "loss": 5.4419, + "step": 7204 + }, + { + "epoch": 0.6947926711668274, + "grad_norm": 2.4359869956970215, + "learning_rate": 1.0644394102137919e-05, + "loss": 5.2549, + "step": 7205 + }, + { + "epoch": 0.6948891031822565, + "grad_norm": 2.210800886154175, + "learning_rate": 1.0638194144231731e-05, + "loss": 5.3705, + "step": 7206 + }, + { + "epoch": 0.6949855351976856, + "grad_norm": 2.623317241668701, + "learning_rate": 1.0631995504434409e-05, + "loss": 5.6026, + "step": 7207 + }, + { + "epoch": 0.6950819672131148, + "grad_norm": 2.1886978149414062, + "learning_rate": 1.062579818331486e-05, + "loss": 5.6246, + "step": 7208 + }, + { + "epoch": 0.6951783992285439, + "grad_norm": 2.481513500213623, + "learning_rate": 1.0619602181441848e-05, + "loss": 5.5052, + "step": 7209 + }, + { + "epoch": 0.695274831243973, + "grad_norm": 2.5048069953918457, + "learning_rate": 1.0613407499384068e-05, + "loss": 5.5236, + "step": 7210 + }, + { + "epoch": 0.6953712632594021, + "grad_norm": 2.001150369644165, + "learning_rate": 1.0607214137710018e-05, + "loss": 5.4756, + "step": 7211 + }, + { + "epoch": 0.6954676952748312, + "grad_norm": 2.4531798362731934, + "learning_rate": 1.0601022096988162e-05, + "loss": 5.4452, + "step": 7212 + }, + { + "epoch": 0.6955641272902604, + "grad_norm": 3.0965576171875, + "learning_rate": 1.0594831377786763e-05, + "loss": 5.2088, + "step": 7213 + }, + { + "epoch": 0.6956605593056895, + "grad_norm": 2.401555061340332, + "learning_rate": 1.058864198067401e-05, + "loss": 5.5822, + "step": 7214 + }, + { + "epoch": 0.6957569913211186, + "grad_norm": 1.9763091802597046, + "learning_rate": 1.0582453906217962e-05, + "loss": 5.6836, + "step": 7215 + }, + { + "epoch": 0.6958534233365478, + "grad_norm": 1.8665785789489746, + "learning_rate": 1.0576267154986547e-05, + "loss": 5.3265, + "step": 7216 + }, + { + "epoch": 0.6959498553519768, + "grad_norm": 2.2992753982543945, + "learning_rate": 1.0570081727547582e-05, + "loss": 5.4694, + "step": 7217 + }, + { + "epoch": 0.6960462873674059, + "grad_norm": 2.625422954559326, + "learning_rate": 1.0563897624468753e-05, + "loss": 5.1826, + "step": 7218 + }, + { + "epoch": 0.6961427193828351, + "grad_norm": 2.3925986289978027, + "learning_rate": 1.0557714846317642e-05, + "loss": 5.4876, + "step": 7219 + }, + { + "epoch": 0.6962391513982642, + "grad_norm": 2.8241207599639893, + "learning_rate": 1.055153339366167e-05, + "loss": 5.4055, + "step": 7220 + }, + { + "epoch": 0.6963355834136934, + "grad_norm": 2.5304908752441406, + "learning_rate": 1.0545353267068198e-05, + "loss": 5.1857, + "step": 7221 + }, + { + "epoch": 0.6964320154291225, + "grad_norm": 2.386482000350952, + "learning_rate": 1.0539174467104392e-05, + "loss": 5.4218, + "step": 7222 + }, + { + "epoch": 0.6965284474445516, + "grad_norm": 2.4079275131225586, + "learning_rate": 1.0532996994337374e-05, + "loss": 5.4363, + "step": 7223 + }, + { + "epoch": 0.6966248794599808, + "grad_norm": 2.7053475379943848, + "learning_rate": 1.0526820849334079e-05, + "loss": 5.3625, + "step": 7224 + }, + { + "epoch": 0.6967213114754098, + "grad_norm": 2.861638307571411, + "learning_rate": 1.052064603266135e-05, + "loss": 5.404, + "step": 7225 + }, + { + "epoch": 0.6968177434908389, + "grad_norm": 2.0307257175445557, + "learning_rate": 1.051447254488591e-05, + "loss": 5.2604, + "step": 7226 + }, + { + "epoch": 0.6969141755062681, + "grad_norm": 2.1671135425567627, + "learning_rate": 1.0508300386574357e-05, + "loss": 5.2597, + "step": 7227 + }, + { + "epoch": 0.6970106075216972, + "grad_norm": 1.8616589307785034, + "learning_rate": 1.0502129558293143e-05, + "loss": 5.4419, + "step": 7228 + }, + { + "epoch": 0.6971070395371263, + "grad_norm": 2.401090145111084, + "learning_rate": 1.0495960060608648e-05, + "loss": 5.2423, + "step": 7229 + }, + { + "epoch": 0.6972034715525555, + "grad_norm": 2.3948254585266113, + "learning_rate": 1.048979189408708e-05, + "loss": 5.6036, + "step": 7230 + }, + { + "epoch": 0.6972999035679845, + "grad_norm": 2.654576063156128, + "learning_rate": 1.0483625059294555e-05, + "loss": 5.3003, + "step": 7231 + }, + { + "epoch": 0.6973963355834137, + "grad_norm": 2.0858495235443115, + "learning_rate": 1.0477459556797054e-05, + "loss": 5.1031, + "step": 7232 + }, + { + "epoch": 0.6974927675988428, + "grad_norm": 2.3007872104644775, + "learning_rate": 1.0471295387160441e-05, + "loss": 5.6037, + "step": 7233 + }, + { + "epoch": 0.6975891996142719, + "grad_norm": 2.0825750827789307, + "learning_rate": 1.0465132550950454e-05, + "loss": 5.6249, + "step": 7234 + }, + { + "epoch": 0.6976856316297011, + "grad_norm": 2.5918056964874268, + "learning_rate": 1.0458971048732712e-05, + "loss": 5.4391, + "step": 7235 + }, + { + "epoch": 0.6977820636451302, + "grad_norm": 3.1196961402893066, + "learning_rate": 1.0452810881072711e-05, + "loss": 5.5042, + "step": 7236 + }, + { + "epoch": 0.6978784956605593, + "grad_norm": 2.170549154281616, + "learning_rate": 1.0446652048535821e-05, + "loss": 5.5859, + "step": 7237 + }, + { + "epoch": 0.6979749276759885, + "grad_norm": 2.3051235675811768, + "learning_rate": 1.0440494551687305e-05, + "loss": 5.2517, + "step": 7238 + }, + { + "epoch": 0.6980713596914175, + "grad_norm": 2.3966565132141113, + "learning_rate": 1.043433839109226e-05, + "loss": 5.2817, + "step": 7239 + }, + { + "epoch": 0.6981677917068466, + "grad_norm": 1.97283935546875, + "learning_rate": 1.0428183567315727e-05, + "loss": 5.2954, + "step": 7240 + }, + { + "epoch": 0.6982642237222758, + "grad_norm": 2.5815510749816895, + "learning_rate": 1.0422030080922565e-05, + "loss": 5.4269, + "step": 7241 + }, + { + "epoch": 0.6983606557377049, + "grad_norm": 2.678691864013672, + "learning_rate": 1.0415877932477538e-05, + "loss": 5.1908, + "step": 7242 + }, + { + "epoch": 0.6984570877531341, + "grad_norm": 2.1137611865997314, + "learning_rate": 1.0409727122545285e-05, + "loss": 5.1984, + "step": 7243 + }, + { + "epoch": 0.6985535197685632, + "grad_norm": 2.460947275161743, + "learning_rate": 1.040357765169032e-05, + "loss": 5.5757, + "step": 7244 + }, + { + "epoch": 0.6986499517839923, + "grad_norm": 2.3967347145080566, + "learning_rate": 1.0397429520477029e-05, + "loss": 5.1419, + "step": 7245 + }, + { + "epoch": 0.6987463837994214, + "grad_norm": 3.00652813911438, + "learning_rate": 1.0391282729469694e-05, + "loss": 5.0921, + "step": 7246 + }, + { + "epoch": 0.6988428158148505, + "grad_norm": 3.465573310852051, + "learning_rate": 1.0385137279232435e-05, + "loss": 5.3652, + "step": 7247 + }, + { + "epoch": 0.6989392478302796, + "grad_norm": 2.318419933319092, + "learning_rate": 1.0378993170329304e-05, + "loss": 5.4567, + "step": 7248 + }, + { + "epoch": 0.6990356798457088, + "grad_norm": 2.673224687576294, + "learning_rate": 1.0372850403324174e-05, + "loss": 5.1617, + "step": 7249 + }, + { + "epoch": 0.6991321118611379, + "grad_norm": 1.8164201974868774, + "learning_rate": 1.0366708978780832e-05, + "loss": 5.1644, + "step": 7250 + }, + { + "epoch": 0.699228543876567, + "grad_norm": 2.027472496032715, + "learning_rate": 1.0360568897262932e-05, + "loss": 5.2329, + "step": 7251 + }, + { + "epoch": 0.6993249758919962, + "grad_norm": 2.4820854663848877, + "learning_rate": 1.0354430159333996e-05, + "loss": 5.455, + "step": 7252 + }, + { + "epoch": 0.6994214079074252, + "grad_norm": 1.8915185928344727, + "learning_rate": 1.0348292765557438e-05, + "loss": 5.5264, + "step": 7253 + }, + { + "epoch": 0.6995178399228544, + "grad_norm": 1.8416839838027954, + "learning_rate": 1.0342156716496534e-05, + "loss": 5.2351, + "step": 7254 + }, + { + "epoch": 0.6996142719382835, + "grad_norm": 2.2517435550689697, + "learning_rate": 1.0336022012714446e-05, + "loss": 5.4498, + "step": 7255 + }, + { + "epoch": 0.6997107039537126, + "grad_norm": 1.8023228645324707, + "learning_rate": 1.032988865477422e-05, + "loss": 5.5421, + "step": 7256 + }, + { + "epoch": 0.6998071359691418, + "grad_norm": 1.6993486881256104, + "learning_rate": 1.0323756643238747e-05, + "loss": 5.378, + "step": 7257 + }, + { + "epoch": 0.6999035679845709, + "grad_norm": 2.1920979022979736, + "learning_rate": 1.0317625978670825e-05, + "loss": 5.4245, + "step": 7258 + }, + { + "epoch": 0.7, + "grad_norm": 1.6711472272872925, + "learning_rate": 1.0311496661633122e-05, + "loss": 5.3746, + "step": 7259 + }, + { + "epoch": 0.7000964320154291, + "grad_norm": 1.8640137910842896, + "learning_rate": 1.0305368692688174e-05, + "loss": 5.4486, + "step": 7260 + }, + { + "epoch": 0.7001928640308582, + "grad_norm": 1.8834675550460815, + "learning_rate": 1.0299242072398403e-05, + "loss": 5.2413, + "step": 7261 + }, + { + "epoch": 0.7002892960462873, + "grad_norm": 1.7796051502227783, + "learning_rate": 1.02931168013261e-05, + "loss": 5.3523, + "step": 7262 + }, + { + "epoch": 0.7003857280617165, + "grad_norm": 2.7522025108337402, + "learning_rate": 1.0286992880033444e-05, + "loss": 5.1637, + "step": 7263 + }, + { + "epoch": 0.7004821600771456, + "grad_norm": 2.1753997802734375, + "learning_rate": 1.0280870309082458e-05, + "loss": 5.4988, + "step": 7264 + }, + { + "epoch": 0.7005785920925748, + "grad_norm": 2.3990437984466553, + "learning_rate": 1.0274749089035093e-05, + "loss": 5.5904, + "step": 7265 + }, + { + "epoch": 0.7006750241080039, + "grad_norm": 2.207723379135132, + "learning_rate": 1.0268629220453116e-05, + "loss": 5.4848, + "step": 7266 + }, + { + "epoch": 0.700771456123433, + "grad_norm": 2.6226563453674316, + "learning_rate": 1.0262510703898238e-05, + "loss": 5.7128, + "step": 7267 + }, + { + "epoch": 0.7008678881388621, + "grad_norm": 2.3496897220611572, + "learning_rate": 1.0256393539931977e-05, + "loss": 5.5011, + "step": 7268 + }, + { + "epoch": 0.7009643201542912, + "grad_norm": 1.8899041414260864, + "learning_rate": 1.025027772911577e-05, + "loss": 5.235, + "step": 7269 + }, + { + "epoch": 0.7010607521697203, + "grad_norm": 1.7782341241836548, + "learning_rate": 1.0244163272010922e-05, + "loss": 5.2651, + "step": 7270 + }, + { + "epoch": 0.7011571841851495, + "grad_norm": 2.4491732120513916, + "learning_rate": 1.0238050169178615e-05, + "loss": 5.3428, + "step": 7271 + }, + { + "epoch": 0.7012536162005786, + "grad_norm": 3.652489423751831, + "learning_rate": 1.0231938421179876e-05, + "loss": 5.5215, + "step": 7272 + }, + { + "epoch": 0.7013500482160077, + "grad_norm": 2.97568678855896, + "learning_rate": 1.022582802857567e-05, + "loss": 5.5604, + "step": 7273 + }, + { + "epoch": 0.7014464802314369, + "grad_norm": 2.7486917972564697, + "learning_rate": 1.0219718991926775e-05, + "loss": 5.4755, + "step": 7274 + }, + { + "epoch": 0.7015429122468659, + "grad_norm": 2.161804676055908, + "learning_rate": 1.021361131179388e-05, + "loss": 5.4095, + "step": 7275 + }, + { + "epoch": 0.7016393442622951, + "grad_norm": 3.430352210998535, + "learning_rate": 1.020750498873754e-05, + "loss": 5.4991, + "step": 7276 + }, + { + "epoch": 0.7017357762777242, + "grad_norm": 3.079254150390625, + "learning_rate": 1.0201400023318184e-05, + "loss": 5.078, + "step": 7277 + }, + { + "epoch": 0.7018322082931533, + "grad_norm": 2.9068398475646973, + "learning_rate": 1.019529641609612e-05, + "loss": 5.2312, + "step": 7278 + }, + { + "epoch": 0.7019286403085825, + "grad_norm": 3.0357918739318848, + "learning_rate": 1.018919416763153e-05, + "loss": 5.352, + "step": 7279 + }, + { + "epoch": 0.7020250723240116, + "grad_norm": 2.201219081878662, + "learning_rate": 1.018309327848447e-05, + "loss": 5.2893, + "step": 7280 + }, + { + "epoch": 0.7021215043394406, + "grad_norm": 2.444399118423462, + "learning_rate": 1.0176993749214872e-05, + "loss": 5.5271, + "step": 7281 + }, + { + "epoch": 0.7022179363548698, + "grad_norm": 2.493926763534546, + "learning_rate": 1.0170895580382553e-05, + "loss": 5.3254, + "step": 7282 + }, + { + "epoch": 0.7023143683702989, + "grad_norm": 2.400524616241455, + "learning_rate": 1.0164798772547166e-05, + "loss": 5.4684, + "step": 7283 + }, + { + "epoch": 0.702410800385728, + "grad_norm": 2.0770928859710693, + "learning_rate": 1.0158703326268309e-05, + "loss": 5.4333, + "step": 7284 + }, + { + "epoch": 0.7025072324011572, + "grad_norm": 3.533130168914795, + "learning_rate": 1.0152609242105386e-05, + "loss": 5.3692, + "step": 7285 + }, + { + "epoch": 0.7026036644165863, + "grad_norm": 3.0215954780578613, + "learning_rate": 1.0146516520617707e-05, + "loss": 5.3152, + "step": 7286 + }, + { + "epoch": 0.7027000964320155, + "grad_norm": 1.9394004344940186, + "learning_rate": 1.0140425162364465e-05, + "loss": 5.3968, + "step": 7287 + }, + { + "epoch": 0.7027965284474446, + "grad_norm": 2.592292308807373, + "learning_rate": 1.0134335167904713e-05, + "loss": 5.4542, + "step": 7288 + }, + { + "epoch": 0.7028929604628736, + "grad_norm": 2.895995855331421, + "learning_rate": 1.0128246537797378e-05, + "loss": 5.4219, + "step": 7289 + }, + { + "epoch": 0.7029893924783028, + "grad_norm": 2.7626211643218994, + "learning_rate": 1.0122159272601283e-05, + "loss": 5.3985, + "step": 7290 + }, + { + "epoch": 0.7030858244937319, + "grad_norm": 2.3805181980133057, + "learning_rate": 1.0116073372875082e-05, + "loss": 5.3709, + "step": 7291 + }, + { + "epoch": 0.703182256509161, + "grad_norm": 2.2161669731140137, + "learning_rate": 1.0109988839177364e-05, + "loss": 5.3601, + "step": 7292 + }, + { + "epoch": 0.7032786885245902, + "grad_norm": 1.6460460424423218, + "learning_rate": 1.0103905672066535e-05, + "loss": 5.3376, + "step": 7293 + }, + { + "epoch": 0.7033751205400193, + "grad_norm": 2.1718590259552, + "learning_rate": 1.0097823872100912e-05, + "loss": 5.1441, + "step": 7294 + }, + { + "epoch": 0.7034715525554484, + "grad_norm": 2.4649641513824463, + "learning_rate": 1.009174343983867e-05, + "loss": 5.4312, + "step": 7295 + }, + { + "epoch": 0.7035679845708775, + "grad_norm": 2.105863332748413, + "learning_rate": 1.0085664375837864e-05, + "loss": 5.155, + "step": 7296 + }, + { + "epoch": 0.7036644165863066, + "grad_norm": 1.6994694471359253, + "learning_rate": 1.0079586680656428e-05, + "loss": 5.3655, + "step": 7297 + }, + { + "epoch": 0.7037608486017358, + "grad_norm": 2.0261662006378174, + "learning_rate": 1.0073510354852161e-05, + "loss": 5.5863, + "step": 7298 + }, + { + "epoch": 0.7038572806171649, + "grad_norm": 2.6587069034576416, + "learning_rate": 1.006743539898274e-05, + "loss": 5.3029, + "step": 7299 + }, + { + "epoch": 0.703953712632594, + "grad_norm": 2.5604796409606934, + "learning_rate": 1.0061361813605724e-05, + "loss": 5.5402, + "step": 7300 + }, + { + "epoch": 0.7040501446480232, + "grad_norm": 2.3851990699768066, + "learning_rate": 1.0055289599278541e-05, + "loss": 5.5464, + "step": 7301 + }, + { + "epoch": 0.7041465766634523, + "grad_norm": 2.284323215484619, + "learning_rate": 1.0049218756558468e-05, + "loss": 5.4713, + "step": 7302 + }, + { + "epoch": 0.7042430086788813, + "grad_norm": 2.2322404384613037, + "learning_rate": 1.0043149286002712e-05, + "loss": 5.29, + "step": 7303 + }, + { + "epoch": 0.7043394406943105, + "grad_norm": 2.304605484008789, + "learning_rate": 1.0037081188168295e-05, + "loss": 5.5901, + "step": 7304 + }, + { + "epoch": 0.7044358727097396, + "grad_norm": 2.4205005168914795, + "learning_rate": 1.0031014463612153e-05, + "loss": 5.3425, + "step": 7305 + }, + { + "epoch": 0.7045323047251687, + "grad_norm": 2.2714548110961914, + "learning_rate": 1.002494911289108e-05, + "loss": 5.273, + "step": 7306 + }, + { + "epoch": 0.7046287367405979, + "grad_norm": 3.1129868030548096, + "learning_rate": 1.0018885136561754e-05, + "loss": 5.3031, + "step": 7307 + }, + { + "epoch": 0.704725168756027, + "grad_norm": 1.8308347463607788, + "learning_rate": 1.0012822535180694e-05, + "loss": 5.3442, + "step": 7308 + }, + { + "epoch": 0.7048216007714562, + "grad_norm": 2.2337820529937744, + "learning_rate": 1.0006761309304352e-05, + "loss": 5.561, + "step": 7309 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 2.5467910766601562, + "learning_rate": 1.0000701459488987e-05, + "loss": 5.6379, + "step": 7310 + }, + { + "epoch": 0.7050144648023143, + "grad_norm": 2.6721935272216797, + "learning_rate": 9.994642986290797e-06, + "loss": 5.4811, + "step": 7311 + }, + { + "epoch": 0.7051108968177435, + "grad_norm": 2.62302827835083, + "learning_rate": 9.9885858902658e-06, + "loss": 5.4833, + "step": 7312 + }, + { + "epoch": 0.7052073288331726, + "grad_norm": 1.9862935543060303, + "learning_rate": 9.982530171969911e-06, + "loss": 5.4565, + "step": 7313 + }, + { + "epoch": 0.7053037608486017, + "grad_norm": 2.6690006256103516, + "learning_rate": 9.97647583195892e-06, + "loss": 5.2354, + "step": 7314 + }, + { + "epoch": 0.7054001928640309, + "grad_norm": 3.103397846221924, + "learning_rate": 9.970422870788495e-06, + "loss": 5.3923, + "step": 7315 + }, + { + "epoch": 0.70549662487946, + "grad_norm": 2.019646406173706, + "learning_rate": 9.964371289014143e-06, + "loss": 5.3811, + "step": 7316 + }, + { + "epoch": 0.705593056894889, + "grad_norm": 2.2299964427948, + "learning_rate": 9.958321087191305e-06, + "loss": 5.3352, + "step": 7317 + }, + { + "epoch": 0.7056894889103182, + "grad_norm": 1.879805564880371, + "learning_rate": 9.952272265875238e-06, + "loss": 5.5132, + "step": 7318 + }, + { + "epoch": 0.7057859209257473, + "grad_norm": 1.8908694982528687, + "learning_rate": 9.946224825621098e-06, + "loss": 5.6194, + "step": 7319 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 1.9550857543945312, + "learning_rate": 9.940178766983917e-06, + "loss": 5.3902, + "step": 7320 + }, + { + "epoch": 0.7059787849566056, + "grad_norm": 2.0441384315490723, + "learning_rate": 9.934134090518593e-06, + "loss": 5.6433, + "step": 7321 + }, + { + "epoch": 0.7060752169720347, + "grad_norm": 1.7855547666549683, + "learning_rate": 9.928090796779899e-06, + "loss": 5.6116, + "step": 7322 + }, + { + "epoch": 0.7061716489874639, + "grad_norm": 1.4243576526641846, + "learning_rate": 9.922048886322483e-06, + "loss": 5.5774, + "step": 7323 + }, + { + "epoch": 0.706268081002893, + "grad_norm": 2.0529441833496094, + "learning_rate": 9.91600835970086e-06, + "loss": 5.4829, + "step": 7324 + }, + { + "epoch": 0.706364513018322, + "grad_norm": 2.0731089115142822, + "learning_rate": 9.909969217469426e-06, + "loss": 5.6359, + "step": 7325 + }, + { + "epoch": 0.7064609450337512, + "grad_norm": 2.190110683441162, + "learning_rate": 9.903931460182453e-06, + "loss": 5.5131, + "step": 7326 + }, + { + "epoch": 0.7065573770491803, + "grad_norm": 1.8591300249099731, + "learning_rate": 9.897895088394051e-06, + "loss": 5.4068, + "step": 7327 + }, + { + "epoch": 0.7066538090646094, + "grad_norm": 2.2127151489257812, + "learning_rate": 9.891860102658269e-06, + "loss": 5.4306, + "step": 7328 + }, + { + "epoch": 0.7067502410800386, + "grad_norm": 3.230700969696045, + "learning_rate": 9.885826503528963e-06, + "loss": 5.4495, + "step": 7329 + }, + { + "epoch": 0.7068466730954677, + "grad_norm": 1.922414779663086, + "learning_rate": 9.879794291559896e-06, + "loss": 5.3015, + "step": 7330 + }, + { + "epoch": 0.7069431051108969, + "grad_norm": 1.8587387800216675, + "learning_rate": 9.873763467304703e-06, + "loss": 5.4458, + "step": 7331 + }, + { + "epoch": 0.707039537126326, + "grad_norm": 1.8130382299423218, + "learning_rate": 9.867734031316878e-06, + "loss": 5.5219, + "step": 7332 + }, + { + "epoch": 0.707135969141755, + "grad_norm": 2.365654706954956, + "learning_rate": 9.8617059841498e-06, + "loss": 5.251, + "step": 7333 + }, + { + "epoch": 0.7072324011571842, + "grad_norm": 1.9002695083618164, + "learning_rate": 9.855679326356723e-06, + "loss": 5.5955, + "step": 7334 + }, + { + "epoch": 0.7073288331726133, + "grad_norm": 2.0721547603607178, + "learning_rate": 9.849654058490743e-06, + "loss": 5.4727, + "step": 7335 + }, + { + "epoch": 0.7074252651880424, + "grad_norm": 2.1720759868621826, + "learning_rate": 9.843630181104882e-06, + "loss": 5.4215, + "step": 7336 + }, + { + "epoch": 0.7075216972034716, + "grad_norm": 2.6082680225372314, + "learning_rate": 9.83760769475198e-06, + "loss": 5.5422, + "step": 7337 + }, + { + "epoch": 0.7076181292189007, + "grad_norm": 2.4240403175354004, + "learning_rate": 9.83158659998478e-06, + "loss": 5.5552, + "step": 7338 + }, + { + "epoch": 0.7077145612343297, + "grad_norm": 3.2583537101745605, + "learning_rate": 9.825566897355893e-06, + "loss": 4.9615, + "step": 7339 + }, + { + "epoch": 0.7078109932497589, + "grad_norm": 2.1323742866516113, + "learning_rate": 9.8195485874178e-06, + "loss": 5.3975, + "step": 7340 + }, + { + "epoch": 0.707907425265188, + "grad_norm": 1.9464495182037354, + "learning_rate": 9.813531670722856e-06, + "loss": 5.4707, + "step": 7341 + }, + { + "epoch": 0.7080038572806172, + "grad_norm": 2.7387101650238037, + "learning_rate": 9.807516147823281e-06, + "loss": 5.1799, + "step": 7342 + }, + { + "epoch": 0.7081002892960463, + "grad_norm": 1.8269180059432983, + "learning_rate": 9.801502019271175e-06, + "loss": 5.3518, + "step": 7343 + }, + { + "epoch": 0.7081967213114754, + "grad_norm": 2.4971818923950195, + "learning_rate": 9.795489285618508e-06, + "loss": 5.2798, + "step": 7344 + }, + { + "epoch": 0.7082931533269046, + "grad_norm": 3.073460102081299, + "learning_rate": 9.789477947417131e-06, + "loss": 5.0791, + "step": 7345 + }, + { + "epoch": 0.7083895853423336, + "grad_norm": 1.8323277235031128, + "learning_rate": 9.783468005218727e-06, + "loss": 5.1806, + "step": 7346 + }, + { + "epoch": 0.7084860173577627, + "grad_norm": 1.9643971920013428, + "learning_rate": 9.777459459574919e-06, + "loss": 5.3863, + "step": 7347 + }, + { + "epoch": 0.7085824493731919, + "grad_norm": 2.668426513671875, + "learning_rate": 9.771452311037139e-06, + "loss": 5.258, + "step": 7348 + }, + { + "epoch": 0.708678881388621, + "grad_norm": 2.0092921257019043, + "learning_rate": 9.76544656015672e-06, + "loss": 5.2832, + "step": 7349 + }, + { + "epoch": 0.7087753134040502, + "grad_norm": 2.5455942153930664, + "learning_rate": 9.759442207484867e-06, + "loss": 5.1585, + "step": 7350 + }, + { + "epoch": 0.7088717454194793, + "grad_norm": 2.188441753387451, + "learning_rate": 9.753439253572649e-06, + "loss": 5.4524, + "step": 7351 + }, + { + "epoch": 0.7089681774349084, + "grad_norm": 1.819677710533142, + "learning_rate": 9.747437698971013e-06, + "loss": 5.3756, + "step": 7352 + }, + { + "epoch": 0.7090646094503376, + "grad_norm": 2.6690425872802734, + "learning_rate": 9.741437544230777e-06, + "loss": 5.1751, + "step": 7353 + }, + { + "epoch": 0.7091610414657666, + "grad_norm": 1.9820197820663452, + "learning_rate": 9.735438789902607e-06, + "loss": 5.2701, + "step": 7354 + }, + { + "epoch": 0.7092574734811957, + "grad_norm": 2.2640857696533203, + "learning_rate": 9.729441436537095e-06, + "loss": 5.4849, + "step": 7355 + }, + { + "epoch": 0.7093539054966249, + "grad_norm": 2.7687392234802246, + "learning_rate": 9.723445484684645e-06, + "loss": 5.436, + "step": 7356 + }, + { + "epoch": 0.709450337512054, + "grad_norm": 1.7167726755142212, + "learning_rate": 9.717450934895566e-06, + "loss": 5.4389, + "step": 7357 + }, + { + "epoch": 0.7095467695274831, + "grad_norm": 2.7658214569091797, + "learning_rate": 9.71145778772003e-06, + "loss": 5.3614, + "step": 7358 + }, + { + "epoch": 0.7096432015429123, + "grad_norm": 1.7452969551086426, + "learning_rate": 9.70546604370808e-06, + "loss": 5.3958, + "step": 7359 + }, + { + "epoch": 0.7097396335583414, + "grad_norm": 2.1741762161254883, + "learning_rate": 9.699475703409635e-06, + "loss": 5.2604, + "step": 7360 + }, + { + "epoch": 0.7098360655737705, + "grad_norm": 2.6351563930511475, + "learning_rate": 9.693486767374486e-06, + "loss": 5.2611, + "step": 7361 + }, + { + "epoch": 0.7099324975891996, + "grad_norm": 2.4808285236358643, + "learning_rate": 9.687499236152261e-06, + "loss": 5.2238, + "step": 7362 + }, + { + "epoch": 0.7100289296046287, + "grad_norm": 2.395514488220215, + "learning_rate": 9.68151311029253e-06, + "loss": 5.4884, + "step": 7363 + }, + { + "epoch": 0.7101253616200579, + "grad_norm": 2.551529884338379, + "learning_rate": 9.675528390344662e-06, + "loss": 5.2994, + "step": 7364 + }, + { + "epoch": 0.710221793635487, + "grad_norm": 2.2431745529174805, + "learning_rate": 9.66954507685794e-06, + "loss": 5.6536, + "step": 7365 + }, + { + "epoch": 0.7103182256509161, + "grad_norm": 2.011096239089966, + "learning_rate": 9.663563170381498e-06, + "loss": 5.6566, + "step": 7366 + }, + { + "epoch": 0.7104146576663453, + "grad_norm": 2.2066993713378906, + "learning_rate": 9.657582671464355e-06, + "loss": 5.5224, + "step": 7367 + }, + { + "epoch": 0.7105110896817743, + "grad_norm": 2.371962785720825, + "learning_rate": 9.651603580655389e-06, + "loss": 5.368, + "step": 7368 + }, + { + "epoch": 0.7106075216972034, + "grad_norm": 4.001951694488525, + "learning_rate": 9.645625898503355e-06, + "loss": 5.2999, + "step": 7369 + }, + { + "epoch": 0.7107039537126326, + "grad_norm": 3.1722075939178467, + "learning_rate": 9.63964962555689e-06, + "loss": 5.1208, + "step": 7370 + }, + { + "epoch": 0.7108003857280617, + "grad_norm": 2.6407835483551025, + "learning_rate": 9.633674762364457e-06, + "loss": 5.3133, + "step": 7371 + }, + { + "epoch": 0.7108968177434909, + "grad_norm": 2.812896728515625, + "learning_rate": 9.62770130947446e-06, + "loss": 5.5467, + "step": 7372 + }, + { + "epoch": 0.71099324975892, + "grad_norm": 2.1320881843566895, + "learning_rate": 9.621729267435097e-06, + "loss": 5.2677, + "step": 7373 + }, + { + "epoch": 0.7110896817743491, + "grad_norm": 2.2053210735321045, + "learning_rate": 9.615758636794513e-06, + "loss": 5.5911, + "step": 7374 + }, + { + "epoch": 0.7111861137897783, + "grad_norm": 2.4251112937927246, + "learning_rate": 9.609789418100659e-06, + "loss": 5.1643, + "step": 7375 + }, + { + "epoch": 0.7112825458052073, + "grad_norm": 1.9129482507705688, + "learning_rate": 9.603821611901387e-06, + "loss": 5.4282, + "step": 7376 + }, + { + "epoch": 0.7113789778206364, + "grad_norm": 1.701407551765442, + "learning_rate": 9.59785521874442e-06, + "loss": 5.1502, + "step": 7377 + }, + { + "epoch": 0.7114754098360656, + "grad_norm": 2.067500352859497, + "learning_rate": 9.591890239177354e-06, + "loss": 5.2578, + "step": 7378 + }, + { + "epoch": 0.7115718418514947, + "grad_norm": 2.2672102451324463, + "learning_rate": 9.585926673747616e-06, + "loss": 5.1816, + "step": 7379 + }, + { + "epoch": 0.7116682738669238, + "grad_norm": 1.8942023515701294, + "learning_rate": 9.579964523002577e-06, + "loss": 5.3653, + "step": 7380 + }, + { + "epoch": 0.711764705882353, + "grad_norm": 2.4087421894073486, + "learning_rate": 9.574003787489406e-06, + "loss": 5.5863, + "step": 7381 + }, + { + "epoch": 0.711861137897782, + "grad_norm": 1.7213144302368164, + "learning_rate": 9.568044467755181e-06, + "loss": 5.5366, + "step": 7382 + }, + { + "epoch": 0.7119575699132112, + "grad_norm": 2.6184499263763428, + "learning_rate": 9.562086564346839e-06, + "loss": 5.3014, + "step": 7383 + }, + { + "epoch": 0.7120540019286403, + "grad_norm": 3.033024311065674, + "learning_rate": 9.556130077811193e-06, + "loss": 5.2073, + "step": 7384 + }, + { + "epoch": 0.7121504339440694, + "grad_norm": 1.889106273651123, + "learning_rate": 9.55017500869492e-06, + "loss": 5.4538, + "step": 7385 + }, + { + "epoch": 0.7122468659594986, + "grad_norm": 2.4485692977905273, + "learning_rate": 9.544221357544569e-06, + "loss": 5.6256, + "step": 7386 + }, + { + "epoch": 0.7123432979749277, + "grad_norm": 2.423769950866699, + "learning_rate": 9.53826912490656e-06, + "loss": 5.4968, + "step": 7387 + }, + { + "epoch": 0.7124397299903568, + "grad_norm": 2.0756313800811768, + "learning_rate": 9.53231831132718e-06, + "loss": 5.4522, + "step": 7388 + }, + { + "epoch": 0.712536162005786, + "grad_norm": 2.92958927154541, + "learning_rate": 9.526368917352596e-06, + "loss": 5.1637, + "step": 7389 + }, + { + "epoch": 0.712632594021215, + "grad_norm": 2.702436685562134, + "learning_rate": 9.52042094352881e-06, + "loss": 5.0316, + "step": 7390 + }, + { + "epoch": 0.7127290260366441, + "grad_norm": 2.1441330909729004, + "learning_rate": 9.514474390401753e-06, + "loss": 5.1656, + "step": 7391 + }, + { + "epoch": 0.7128254580520733, + "grad_norm": 2.9065144062042236, + "learning_rate": 9.508529258517171e-06, + "loss": 5.4876, + "step": 7392 + }, + { + "epoch": 0.7129218900675024, + "grad_norm": 3.440603017807007, + "learning_rate": 9.502585548420706e-06, + "loss": 4.6662, + "step": 7393 + }, + { + "epoch": 0.7130183220829316, + "grad_norm": 2.3371057510375977, + "learning_rate": 9.496643260657865e-06, + "loss": 4.7733, + "step": 7394 + }, + { + "epoch": 0.7131147540983607, + "grad_norm": 2.4118611812591553, + "learning_rate": 9.490702395774026e-06, + "loss": 4.7798, + "step": 7395 + }, + { + "epoch": 0.7132111861137898, + "grad_norm": 2.9544851779937744, + "learning_rate": 9.48476295431443e-06, + "loss": 5.1404, + "step": 7396 + }, + { + "epoch": 0.713307618129219, + "grad_norm": 2.8122475147247314, + "learning_rate": 9.478824936824202e-06, + "loss": 4.7292, + "step": 7397 + }, + { + "epoch": 0.713404050144648, + "grad_norm": 1.6454155445098877, + "learning_rate": 9.472888343848301e-06, + "loss": 5.1568, + "step": 7398 + }, + { + "epoch": 0.7135004821600771, + "grad_norm": 2.670067310333252, + "learning_rate": 9.466953175931617e-06, + "loss": 5.0197, + "step": 7399 + }, + { + "epoch": 0.7135969141755063, + "grad_norm": 1.7879873514175415, + "learning_rate": 9.461019433618839e-06, + "loss": 5.1417, + "step": 7400 + }, + { + "epoch": 0.7136933461909354, + "grad_norm": 1.644927978515625, + "learning_rate": 9.455087117454576e-06, + "loss": 5.3647, + "step": 7401 + }, + { + "epoch": 0.7137897782063645, + "grad_norm": 2.0682897567749023, + "learning_rate": 9.449156227983282e-06, + "loss": 4.9866, + "step": 7402 + }, + { + "epoch": 0.7138862102217937, + "grad_norm": 2.3200042247772217, + "learning_rate": 9.443226765749291e-06, + "loss": 4.8608, + "step": 7403 + }, + { + "epoch": 0.7139826422372227, + "grad_norm": 2.9452779293060303, + "learning_rate": 9.437298731296801e-06, + "loss": 5.1307, + "step": 7404 + }, + { + "epoch": 0.7140790742526519, + "grad_norm": 1.8180875778198242, + "learning_rate": 9.43137212516988e-06, + "loss": 5.0424, + "step": 7405 + }, + { + "epoch": 0.714175506268081, + "grad_norm": 2.12835955619812, + "learning_rate": 9.42544694791246e-06, + "loss": 4.6797, + "step": 7406 + }, + { + "epoch": 0.7142719382835101, + "grad_norm": 2.491110324859619, + "learning_rate": 9.41952320006835e-06, + "loss": 4.7706, + "step": 7407 + }, + { + "epoch": 0.7143683702989393, + "grad_norm": 2.254239797592163, + "learning_rate": 9.413600882181236e-06, + "loss": 5.1463, + "step": 7408 + }, + { + "epoch": 0.7144648023143684, + "grad_norm": 2.317077159881592, + "learning_rate": 9.40767999479463e-06, + "loss": 4.574, + "step": 7409 + }, + { + "epoch": 0.7145612343297975, + "grad_norm": 3.3206052780151367, + "learning_rate": 9.401760538451981e-06, + "loss": 4.6278, + "step": 7410 + }, + { + "epoch": 0.7146576663452266, + "grad_norm": 2.0764403343200684, + "learning_rate": 9.39584251369654e-06, + "loss": 4.8808, + "step": 7411 + }, + { + "epoch": 0.7147540983606557, + "grad_norm": 1.752809762954712, + "learning_rate": 9.389925921071472e-06, + "loss": 5.0377, + "step": 7412 + }, + { + "epoch": 0.7148505303760848, + "grad_norm": 1.7217862606048584, + "learning_rate": 9.384010761119787e-06, + "loss": 4.9861, + "step": 7413 + }, + { + "epoch": 0.714946962391514, + "grad_norm": 2.516204357147217, + "learning_rate": 9.378097034384384e-06, + "loss": 4.7846, + "step": 7414 + }, + { + "epoch": 0.7150433944069431, + "grad_norm": 1.8825711011886597, + "learning_rate": 9.37218474140799e-06, + "loss": 5.0822, + "step": 7415 + }, + { + "epoch": 0.7151398264223723, + "grad_norm": 1.8846685886383057, + "learning_rate": 9.366273882733265e-06, + "loss": 5.1618, + "step": 7416 + }, + { + "epoch": 0.7152362584378014, + "grad_norm": 2.0672240257263184, + "learning_rate": 9.36036445890266e-06, + "loss": 5.1306, + "step": 7417 + }, + { + "epoch": 0.7153326904532304, + "grad_norm": 1.475516438484192, + "learning_rate": 9.354456470458575e-06, + "loss": 5.0684, + "step": 7418 + }, + { + "epoch": 0.7154291224686596, + "grad_norm": 2.081826686859131, + "learning_rate": 9.348549917943211e-06, + "loss": 5.2054, + "step": 7419 + }, + { + "epoch": 0.7155255544840887, + "grad_norm": 1.814993977546692, + "learning_rate": 9.34264480189867e-06, + "loss": 5.0401, + "step": 7420 + }, + { + "epoch": 0.7156219864995178, + "grad_norm": 2.270860433578491, + "learning_rate": 9.336741122866919e-06, + "loss": 5.2248, + "step": 7421 + }, + { + "epoch": 0.715718418514947, + "grad_norm": 1.747395396232605, + "learning_rate": 9.330838881389797e-06, + "loss": 4.9417, + "step": 7422 + }, + { + "epoch": 0.7158148505303761, + "grad_norm": 1.8942537307739258, + "learning_rate": 9.324938078008982e-06, + "loss": 5.2113, + "step": 7423 + }, + { + "epoch": 0.7159112825458052, + "grad_norm": 1.9150844812393188, + "learning_rate": 9.319038713266073e-06, + "loss": 4.7251, + "step": 7424 + }, + { + "epoch": 0.7160077145612344, + "grad_norm": 2.3875181674957275, + "learning_rate": 9.313140787702482e-06, + "loss": 5.1915, + "step": 7425 + }, + { + "epoch": 0.7161041465766634, + "grad_norm": 2.407473564147949, + "learning_rate": 9.307244301859522e-06, + "loss": 5.0629, + "step": 7426 + }, + { + "epoch": 0.7162005785920926, + "grad_norm": 2.636685848236084, + "learning_rate": 9.301349256278366e-06, + "loss": 5.1164, + "step": 7427 + }, + { + "epoch": 0.7162970106075217, + "grad_norm": 2.1507205963134766, + "learning_rate": 9.295455651500051e-06, + "loss": 5.1893, + "step": 7428 + }, + { + "epoch": 0.7163934426229508, + "grad_norm": 2.527776002883911, + "learning_rate": 9.289563488065489e-06, + "loss": 4.7918, + "step": 7429 + }, + { + "epoch": 0.71648987463838, + "grad_norm": 1.9906398057937622, + "learning_rate": 9.283672766515456e-06, + "loss": 4.9979, + "step": 7430 + }, + { + "epoch": 0.7165863066538091, + "grad_norm": 2.8699252605438232, + "learning_rate": 9.27778348739059e-06, + "loss": 5.2045, + "step": 7431 + }, + { + "epoch": 0.7166827386692382, + "grad_norm": 3.550607442855835, + "learning_rate": 9.271895651231405e-06, + "loss": 4.6585, + "step": 7432 + }, + { + "epoch": 0.7167791706846673, + "grad_norm": 2.706434726715088, + "learning_rate": 9.266009258578287e-06, + "loss": 4.7941, + "step": 7433 + }, + { + "epoch": 0.7168756027000964, + "grad_norm": 3.1110002994537354, + "learning_rate": 9.260124309971457e-06, + "loss": 4.8458, + "step": 7434 + }, + { + "epoch": 0.7169720347155255, + "grad_norm": 2.4022786617279053, + "learning_rate": 9.254240805951065e-06, + "loss": 4.8662, + "step": 7435 + }, + { + "epoch": 0.7170684667309547, + "grad_norm": 2.235574722290039, + "learning_rate": 9.248358747057058e-06, + "loss": 5.096, + "step": 7436 + }, + { + "epoch": 0.7171648987463838, + "grad_norm": 3.186196804046631, + "learning_rate": 9.242478133829301e-06, + "loss": 5.2653, + "step": 7437 + }, + { + "epoch": 0.717261330761813, + "grad_norm": 4.004420757293701, + "learning_rate": 9.236598966807507e-06, + "loss": 5.1046, + "step": 7438 + }, + { + "epoch": 0.7173577627772421, + "grad_norm": 3.965198040008545, + "learning_rate": 9.23072124653126e-06, + "loss": 5.2353, + "step": 7439 + }, + { + "epoch": 0.7174541947926711, + "grad_norm": 4.523067951202393, + "learning_rate": 9.224844973540005e-06, + "loss": 4.9912, + "step": 7440 + }, + { + "epoch": 0.7175506268081003, + "grad_norm": 2.937563180923462, + "learning_rate": 9.218970148373075e-06, + "loss": 5.3453, + "step": 7441 + }, + { + "epoch": 0.7176470588235294, + "grad_norm": 2.5201425552368164, + "learning_rate": 9.21309677156962e-06, + "loss": 5.3667, + "step": 7442 + }, + { + "epoch": 0.7177434908389585, + "grad_norm": 3.123002052307129, + "learning_rate": 9.207224843668732e-06, + "loss": 5.4458, + "step": 7443 + }, + { + "epoch": 0.7178399228543877, + "grad_norm": 3.4381399154663086, + "learning_rate": 9.2013543652093e-06, + "loss": 5.2101, + "step": 7444 + }, + { + "epoch": 0.7179363548698168, + "grad_norm": 2.9711222648620605, + "learning_rate": 9.19548533673012e-06, + "loss": 5.0475, + "step": 7445 + }, + { + "epoch": 0.7180327868852459, + "grad_norm": 2.6551578044891357, + "learning_rate": 9.189617758769845e-06, + "loss": 5.1067, + "step": 7446 + }, + { + "epoch": 0.718129218900675, + "grad_norm": 2.1866793632507324, + "learning_rate": 9.183751631866991e-06, + "loss": 4.7228, + "step": 7447 + }, + { + "epoch": 0.7182256509161041, + "grad_norm": 1.8173871040344238, + "learning_rate": 9.177886956559944e-06, + "loss": 5.1025, + "step": 7448 + }, + { + "epoch": 0.7183220829315333, + "grad_norm": 3.214390993118286, + "learning_rate": 9.172023733386961e-06, + "loss": 5.2347, + "step": 7449 + }, + { + "epoch": 0.7184185149469624, + "grad_norm": 3.1523244380950928, + "learning_rate": 9.16616196288616e-06, + "loss": 5.1911, + "step": 7450 + }, + { + "epoch": 0.7185149469623915, + "grad_norm": 2.7081758975982666, + "learning_rate": 9.160301645595523e-06, + "loss": 5.0041, + "step": 7451 + }, + { + "epoch": 0.7186113789778207, + "grad_norm": 2.764387369155884, + "learning_rate": 9.154442782052916e-06, + "loss": 5.0929, + "step": 7452 + }, + { + "epoch": 0.7187078109932498, + "grad_norm": 2.8260319232940674, + "learning_rate": 9.148585372796029e-06, + "loss": 5.1145, + "step": 7453 + }, + { + "epoch": 0.7188042430086788, + "grad_norm": 2.4179465770721436, + "learning_rate": 9.142729418362484e-06, + "loss": 5.1456, + "step": 7454 + }, + { + "epoch": 0.718900675024108, + "grad_norm": 3.397249937057495, + "learning_rate": 9.136874919289705e-06, + "loss": 4.7823, + "step": 7455 + }, + { + "epoch": 0.7189971070395371, + "grad_norm": 3.3660898208618164, + "learning_rate": 9.131021876115026e-06, + "loss": 4.8787, + "step": 7456 + }, + { + "epoch": 0.7190935390549662, + "grad_norm": 1.8328990936279297, + "learning_rate": 9.125170289375627e-06, + "loss": 4.8986, + "step": 7457 + }, + { + "epoch": 0.7191899710703954, + "grad_norm": 1.6056958436965942, + "learning_rate": 9.119320159608565e-06, + "loss": 4.8039, + "step": 7458 + }, + { + "epoch": 0.7192864030858245, + "grad_norm": 2.4193170070648193, + "learning_rate": 9.113471487350739e-06, + "loss": 4.5932, + "step": 7459 + }, + { + "epoch": 0.7193828351012537, + "grad_norm": 2.073323965072632, + "learning_rate": 9.107624273138965e-06, + "loss": 5.1137, + "step": 7460 + }, + { + "epoch": 0.7194792671166828, + "grad_norm": 3.128304958343506, + "learning_rate": 9.101778517509854e-06, + "loss": 5.1138, + "step": 7461 + }, + { + "epoch": 0.7195756991321118, + "grad_norm": 2.207923412322998, + "learning_rate": 9.095934220999963e-06, + "loss": 4.9665, + "step": 7462 + }, + { + "epoch": 0.719672131147541, + "grad_norm": 2.469637632369995, + "learning_rate": 9.090091384145644e-06, + "loss": 4.8936, + "step": 7463 + }, + { + "epoch": 0.7197685631629701, + "grad_norm": 3.0815258026123047, + "learning_rate": 9.084250007483159e-06, + "loss": 5.0576, + "step": 7464 + }, + { + "epoch": 0.7198649951783992, + "grad_norm": 2.0134124755859375, + "learning_rate": 9.078410091548614e-06, + "loss": 5.4259, + "step": 7465 + }, + { + "epoch": 0.7199614271938284, + "grad_norm": 2.623227119445801, + "learning_rate": 9.072571636878007e-06, + "loss": 4.8699, + "step": 7466 + }, + { + "epoch": 0.7200578592092575, + "grad_norm": 2.8791794776916504, + "learning_rate": 9.066734644007155e-06, + "loss": 4.5422, + "step": 7467 + }, + { + "epoch": 0.7201542912246865, + "grad_norm": 2.218956470489502, + "learning_rate": 9.060899113471802e-06, + "loss": 5.2755, + "step": 7468 + }, + { + "epoch": 0.7202507232401157, + "grad_norm": 3.152374505996704, + "learning_rate": 9.055065045807501e-06, + "loss": 4.9232, + "step": 7469 + }, + { + "epoch": 0.7203471552555448, + "grad_norm": 1.6211988925933838, + "learning_rate": 9.049232441549707e-06, + "loss": 5.0298, + "step": 7470 + }, + { + "epoch": 0.720443587270974, + "grad_norm": 1.916223406791687, + "learning_rate": 9.043401301233729e-06, + "loss": 5.0638, + "step": 7471 + }, + { + "epoch": 0.7205400192864031, + "grad_norm": 1.8166784048080444, + "learning_rate": 9.037571625394736e-06, + "loss": 4.8833, + "step": 7472 + }, + { + "epoch": 0.7206364513018322, + "grad_norm": 2.502894401550293, + "learning_rate": 9.031743414567775e-06, + "loss": 5.002, + "step": 7473 + }, + { + "epoch": 0.7207328833172614, + "grad_norm": 2.1711771488189697, + "learning_rate": 9.025916669287749e-06, + "loss": 4.9095, + "step": 7474 + }, + { + "epoch": 0.7208293153326905, + "grad_norm": 2.0714457035064697, + "learning_rate": 9.02009139008943e-06, + "loss": 4.8197, + "step": 7475 + }, + { + "epoch": 0.7209257473481195, + "grad_norm": 3.1506705284118652, + "learning_rate": 9.014267577507457e-06, + "loss": 4.3247, + "step": 7476 + }, + { + "epoch": 0.7210221793635487, + "grad_norm": 2.2926464080810547, + "learning_rate": 9.008445232076338e-06, + "loss": 4.8798, + "step": 7477 + }, + { + "epoch": 0.7211186113789778, + "grad_norm": 2.71913743019104, + "learning_rate": 9.002624354330414e-06, + "loss": 4.8654, + "step": 7478 + }, + { + "epoch": 0.7212150433944069, + "grad_norm": 3.4990084171295166, + "learning_rate": 8.996804944803957e-06, + "loss": 5.0268, + "step": 7479 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 3.012446403503418, + "learning_rate": 8.990987004031035e-06, + "loss": 5.0707, + "step": 7480 + }, + { + "epoch": 0.7214079074252652, + "grad_norm": 2.584465265274048, + "learning_rate": 8.985170532545622e-06, + "loss": 4.9449, + "step": 7481 + }, + { + "epoch": 0.7215043394406944, + "grad_norm": 2.1503548622131348, + "learning_rate": 8.97935553088155e-06, + "loss": 5.3569, + "step": 7482 + }, + { + "epoch": 0.7216007714561234, + "grad_norm": 2.0133450031280518, + "learning_rate": 8.973541999572504e-06, + "loss": 5.6273, + "step": 7483 + }, + { + "epoch": 0.7216972034715525, + "grad_norm": 2.685823440551758, + "learning_rate": 8.967729939152053e-06, + "loss": 5.361, + "step": 7484 + }, + { + "epoch": 0.7217936354869817, + "grad_norm": 2.023613929748535, + "learning_rate": 8.961919350153622e-06, + "loss": 5.3861, + "step": 7485 + }, + { + "epoch": 0.7218900675024108, + "grad_norm": 2.5010979175567627, + "learning_rate": 8.956110233110473e-06, + "loss": 5.3101, + "step": 7486 + }, + { + "epoch": 0.7219864995178399, + "grad_norm": 3.074709415435791, + "learning_rate": 8.9503025885558e-06, + "loss": 5.4336, + "step": 7487 + }, + { + "epoch": 0.7220829315332691, + "grad_norm": 3.3042991161346436, + "learning_rate": 8.944496417022593e-06, + "loss": 5.2484, + "step": 7488 + }, + { + "epoch": 0.7221793635486982, + "grad_norm": 2.5981454849243164, + "learning_rate": 8.938691719043746e-06, + "loss": 5.2052, + "step": 7489 + }, + { + "epoch": 0.7222757955641272, + "grad_norm": 2.718235731124878, + "learning_rate": 8.932888495152003e-06, + "loss": 5.5198, + "step": 7490 + }, + { + "epoch": 0.7223722275795564, + "grad_norm": 2.0277042388916016, + "learning_rate": 8.927086745879976e-06, + "loss": 5.3606, + "step": 7491 + }, + { + "epoch": 0.7224686595949855, + "grad_norm": 2.109123945236206, + "learning_rate": 8.921286471760149e-06, + "loss": 5.5067, + "step": 7492 + }, + { + "epoch": 0.7225650916104147, + "grad_norm": 2.0733625888824463, + "learning_rate": 8.915487673324858e-06, + "loss": 5.5086, + "step": 7493 + }, + { + "epoch": 0.7226615236258438, + "grad_norm": 2.50342059135437, + "learning_rate": 8.909690351106308e-06, + "loss": 5.6492, + "step": 7494 + }, + { + "epoch": 0.7227579556412729, + "grad_norm": 1.4768201112747192, + "learning_rate": 8.903894505636579e-06, + "loss": 5.4677, + "step": 7495 + }, + { + "epoch": 0.7228543876567021, + "grad_norm": 1.7879952192306519, + "learning_rate": 8.898100137447606e-06, + "loss": 5.4087, + "step": 7496 + }, + { + "epoch": 0.7229508196721312, + "grad_norm": 2.3779096603393555, + "learning_rate": 8.892307247071171e-06, + "loss": 5.3266, + "step": 7497 + }, + { + "epoch": 0.7230472516875602, + "grad_norm": 2.0507822036743164, + "learning_rate": 8.886515835038967e-06, + "loss": 4.9723, + "step": 7498 + }, + { + "epoch": 0.7231436837029894, + "grad_norm": 1.9517236948013306, + "learning_rate": 8.880725901882498e-06, + "loss": 5.402, + "step": 7499 + }, + { + "epoch": 0.7232401157184185, + "grad_norm": 2.752655029296875, + "learning_rate": 8.874937448133167e-06, + "loss": 5.3072, + "step": 7500 + }, + { + "epoch": 0.7233365477338476, + "grad_norm": 2.084502935409546, + "learning_rate": 8.869150474322228e-06, + "loss": 5.2945, + "step": 7501 + }, + { + "epoch": 0.7234329797492768, + "grad_norm": 1.4593274593353271, + "learning_rate": 8.863364980980815e-06, + "loss": 5.3804, + "step": 7502 + }, + { + "epoch": 0.7235294117647059, + "grad_norm": 2.0838425159454346, + "learning_rate": 8.857580968639884e-06, + "loss": 5.1621, + "step": 7503 + }, + { + "epoch": 0.7236258437801351, + "grad_norm": 1.8047136068344116, + "learning_rate": 8.851798437830323e-06, + "loss": 5.356, + "step": 7504 + }, + { + "epoch": 0.7237222757955641, + "grad_norm": 2.229607105255127, + "learning_rate": 8.846017389082808e-06, + "loss": 5.2699, + "step": 7505 + }, + { + "epoch": 0.7238187078109932, + "grad_norm": 2.836918592453003, + "learning_rate": 8.840237822927952e-06, + "loss": 5.2461, + "step": 7506 + }, + { + "epoch": 0.7239151398264224, + "grad_norm": 1.9482240676879883, + "learning_rate": 8.834459739896172e-06, + "loss": 5.2503, + "step": 7507 + }, + { + "epoch": 0.7240115718418515, + "grad_norm": 2.522796630859375, + "learning_rate": 8.82868314051778e-06, + "loss": 5.4541, + "step": 7508 + }, + { + "epoch": 0.7241080038572806, + "grad_norm": 2.1267123222351074, + "learning_rate": 8.822908025322943e-06, + "loss": 5.5351, + "step": 7509 + }, + { + "epoch": 0.7242044358727098, + "grad_norm": 2.9501829147338867, + "learning_rate": 8.8171343948417e-06, + "loss": 5.2648, + "step": 7510 + }, + { + "epoch": 0.7243008678881389, + "grad_norm": 2.4837417602539062, + "learning_rate": 8.811362249603944e-06, + "loss": 5.4022, + "step": 7511 + }, + { + "epoch": 0.7243972999035679, + "grad_norm": 1.9389115571975708, + "learning_rate": 8.805591590139433e-06, + "loss": 5.3607, + "step": 7512 + }, + { + "epoch": 0.7244937319189971, + "grad_norm": 1.9350577592849731, + "learning_rate": 8.799822416977805e-06, + "loss": 5.2572, + "step": 7513 + }, + { + "epoch": 0.7245901639344262, + "grad_norm": 2.2925784587860107, + "learning_rate": 8.794054730648518e-06, + "loss": 5.3359, + "step": 7514 + }, + { + "epoch": 0.7246865959498554, + "grad_norm": 2.104020595550537, + "learning_rate": 8.78828853168096e-06, + "loss": 5.4036, + "step": 7515 + }, + { + "epoch": 0.7247830279652845, + "grad_norm": 2.174877882003784, + "learning_rate": 8.782523820604319e-06, + "loss": 5.348, + "step": 7516 + }, + { + "epoch": 0.7248794599807136, + "grad_norm": 2.9324638843536377, + "learning_rate": 8.776760597947679e-06, + "loss": 5.572, + "step": 7517 + }, + { + "epoch": 0.7249758919961428, + "grad_norm": 2.4921202659606934, + "learning_rate": 8.770998864239985e-06, + "loss": 5.0361, + "step": 7518 + }, + { + "epoch": 0.7250723240115718, + "grad_norm": 1.970252513885498, + "learning_rate": 8.765238620010041e-06, + "loss": 5.3599, + "step": 7519 + }, + { + "epoch": 0.7251687560270009, + "grad_norm": 2.038198471069336, + "learning_rate": 8.759479865786513e-06, + "loss": 5.2843, + "step": 7520 + }, + { + "epoch": 0.7252651880424301, + "grad_norm": 1.6838088035583496, + "learning_rate": 8.753722602097943e-06, + "loss": 5.4935, + "step": 7521 + }, + { + "epoch": 0.7253616200578592, + "grad_norm": 2.1542303562164307, + "learning_rate": 8.747966829472701e-06, + "loss": 5.3903, + "step": 7522 + }, + { + "epoch": 0.7254580520732883, + "grad_norm": 2.9566490650177, + "learning_rate": 8.742212548439077e-06, + "loss": 5.5693, + "step": 7523 + }, + { + "epoch": 0.7255544840887175, + "grad_norm": 2.067155361175537, + "learning_rate": 8.736459759525167e-06, + "loss": 5.3227, + "step": 7524 + }, + { + "epoch": 0.7256509161041466, + "grad_norm": 1.8047168254852295, + "learning_rate": 8.730708463258963e-06, + "loss": 5.3284, + "step": 7525 + }, + { + "epoch": 0.7257473481195758, + "grad_norm": 1.7562432289123535, + "learning_rate": 8.724958660168311e-06, + "loss": 5.5287, + "step": 7526 + }, + { + "epoch": 0.7258437801350048, + "grad_norm": 3.312002182006836, + "learning_rate": 8.719210350780923e-06, + "loss": 5.4226, + "step": 7527 + }, + { + "epoch": 0.7259402121504339, + "grad_norm": 2.1922109127044678, + "learning_rate": 8.71346353562437e-06, + "loss": 5.3077, + "step": 7528 + }, + { + "epoch": 0.7260366441658631, + "grad_norm": 3.5165092945098877, + "learning_rate": 8.707718215226095e-06, + "loss": 5.4322, + "step": 7529 + }, + { + "epoch": 0.7261330761812922, + "grad_norm": 1.9449336528778076, + "learning_rate": 8.701974390113377e-06, + "loss": 5.561, + "step": 7530 + }, + { + "epoch": 0.7262295081967213, + "grad_norm": 2.5306293964385986, + "learning_rate": 8.696232060813404e-06, + "loss": 5.5385, + "step": 7531 + }, + { + "epoch": 0.7263259402121505, + "grad_norm": 2.0798611640930176, + "learning_rate": 8.690491227853175e-06, + "loss": 5.6574, + "step": 7532 + }, + { + "epoch": 0.7264223722275795, + "grad_norm": 1.7300124168395996, + "learning_rate": 8.68475189175959e-06, + "loss": 5.4288, + "step": 7533 + }, + { + "epoch": 0.7265188042430086, + "grad_norm": 2.5211758613586426, + "learning_rate": 8.679014053059398e-06, + "loss": 5.3084, + "step": 7534 + }, + { + "epoch": 0.7266152362584378, + "grad_norm": 2.4997057914733887, + "learning_rate": 8.673277712279203e-06, + "loss": 5.1886, + "step": 7535 + }, + { + "epoch": 0.7267116682738669, + "grad_norm": 2.6945900917053223, + "learning_rate": 8.667542869945491e-06, + "loss": 5.3251, + "step": 7536 + }, + { + "epoch": 0.7268081002892961, + "grad_norm": 2.031590700149536, + "learning_rate": 8.661809526584588e-06, + "loss": 5.457, + "step": 7537 + }, + { + "epoch": 0.7269045323047252, + "grad_norm": 2.2944798469543457, + "learning_rate": 8.6560776827227e-06, + "loss": 5.6189, + "step": 7538 + }, + { + "epoch": 0.7270009643201543, + "grad_norm": 1.9945729970932007, + "learning_rate": 8.650347338885884e-06, + "loss": 5.4883, + "step": 7539 + }, + { + "epoch": 0.7270973963355835, + "grad_norm": 2.009422779083252, + "learning_rate": 8.644618495600073e-06, + "loss": 5.4999, + "step": 7540 + }, + { + "epoch": 0.7271938283510125, + "grad_norm": 2.183722972869873, + "learning_rate": 8.638891153391033e-06, + "loss": 5.4772, + "step": 7541 + }, + { + "epoch": 0.7272902603664416, + "grad_norm": 2.289339303970337, + "learning_rate": 8.63316531278444e-06, + "loss": 5.3229, + "step": 7542 + }, + { + "epoch": 0.7273866923818708, + "grad_norm": 2.729503870010376, + "learning_rate": 8.627440974305784e-06, + "loss": 5.3784, + "step": 7543 + }, + { + "epoch": 0.7274831243972999, + "grad_norm": 2.7973814010620117, + "learning_rate": 8.621718138480444e-06, + "loss": 5.3725, + "step": 7544 + }, + { + "epoch": 0.727579556412729, + "grad_norm": 2.280886173248291, + "learning_rate": 8.615996805833654e-06, + "loss": 5.5157, + "step": 7545 + }, + { + "epoch": 0.7276759884281582, + "grad_norm": 2.6469669342041016, + "learning_rate": 8.610276976890519e-06, + "loss": 5.488, + "step": 7546 + }, + { + "epoch": 0.7277724204435873, + "grad_norm": 3.198763370513916, + "learning_rate": 8.604558652175976e-06, + "loss": 5.5045, + "step": 7547 + }, + { + "epoch": 0.7278688524590164, + "grad_norm": 2.361492395401001, + "learning_rate": 8.598841832214877e-06, + "loss": 5.3844, + "step": 7548 + }, + { + "epoch": 0.7279652844744455, + "grad_norm": 2.350432872772217, + "learning_rate": 8.59312651753187e-06, + "loss": 5.361, + "step": 7549 + }, + { + "epoch": 0.7280617164898746, + "grad_norm": 2.7239749431610107, + "learning_rate": 8.587412708651534e-06, + "loss": 5.55, + "step": 7550 + }, + { + "epoch": 0.7281581485053038, + "grad_norm": 2.95035719871521, + "learning_rate": 8.581700406098254e-06, + "loss": 5.4056, + "step": 7551 + }, + { + "epoch": 0.7282545805207329, + "grad_norm": 2.456012725830078, + "learning_rate": 8.575989610396298e-06, + "loss": 5.5698, + "step": 7552 + }, + { + "epoch": 0.728351012536162, + "grad_norm": 1.6607871055603027, + "learning_rate": 8.570280322069804e-06, + "loss": 5.4114, + "step": 7553 + }, + { + "epoch": 0.7284474445515912, + "grad_norm": 1.536286473274231, + "learning_rate": 8.56457254164276e-06, + "loss": 5.5354, + "step": 7554 + }, + { + "epoch": 0.7285438765670202, + "grad_norm": 2.4994056224823, + "learning_rate": 8.558866269639018e-06, + "loss": 5.2914, + "step": 7555 + }, + { + "epoch": 0.7286403085824493, + "grad_norm": 2.2307183742523193, + "learning_rate": 8.553161506582297e-06, + "loss": 5.4252, + "step": 7556 + }, + { + "epoch": 0.7287367405978785, + "grad_norm": 1.6453722715377808, + "learning_rate": 8.547458252996177e-06, + "loss": 5.449, + "step": 7557 + }, + { + "epoch": 0.7288331726133076, + "grad_norm": 2.281780242919922, + "learning_rate": 8.54175650940407e-06, + "loss": 5.616, + "step": 7558 + }, + { + "epoch": 0.7289296046287368, + "grad_norm": 1.5591320991516113, + "learning_rate": 8.536056276329313e-06, + "loss": 5.5833, + "step": 7559 + }, + { + "epoch": 0.7290260366441659, + "grad_norm": 2.420886516571045, + "learning_rate": 8.53035755429503e-06, + "loss": 5.4408, + "step": 7560 + }, + { + "epoch": 0.729122468659595, + "grad_norm": 2.506751775741577, + "learning_rate": 8.524660343824275e-06, + "loss": 5.6209, + "step": 7561 + }, + { + "epoch": 0.7292189006750242, + "grad_norm": 2.806959390640259, + "learning_rate": 8.518964645439911e-06, + "loss": 5.5529, + "step": 7562 + }, + { + "epoch": 0.7293153326904532, + "grad_norm": 2.0601255893707275, + "learning_rate": 8.513270459664685e-06, + "loss": 5.5678, + "step": 7563 + }, + { + "epoch": 0.7294117647058823, + "grad_norm": 1.8575561046600342, + "learning_rate": 8.507577787021203e-06, + "loss": 5.2778, + "step": 7564 + }, + { + "epoch": 0.7295081967213115, + "grad_norm": 2.0169856548309326, + "learning_rate": 8.50188662803194e-06, + "loss": 5.4478, + "step": 7565 + }, + { + "epoch": 0.7296046287367406, + "grad_norm": 2.3239450454711914, + "learning_rate": 8.496196983219203e-06, + "loss": 5.5816, + "step": 7566 + }, + { + "epoch": 0.7297010607521697, + "grad_norm": 1.8082365989685059, + "learning_rate": 8.490508853105211e-06, + "loss": 5.5032, + "step": 7567 + }, + { + "epoch": 0.7297974927675989, + "grad_norm": 1.985234022140503, + "learning_rate": 8.484822238211986e-06, + "loss": 5.5114, + "step": 7568 + }, + { + "epoch": 0.729893924783028, + "grad_norm": 1.7122128009796143, + "learning_rate": 8.479137139061449e-06, + "loss": 5.5085, + "step": 7569 + }, + { + "epoch": 0.7299903567984571, + "grad_norm": 1.8565922975540161, + "learning_rate": 8.473453556175373e-06, + "loss": 5.5231, + "step": 7570 + }, + { + "epoch": 0.7300867888138862, + "grad_norm": 2.6791491508483887, + "learning_rate": 8.467771490075389e-06, + "loss": 5.5605, + "step": 7571 + }, + { + "epoch": 0.7301832208293153, + "grad_norm": 1.6871980428695679, + "learning_rate": 8.462090941282989e-06, + "loss": 5.5403, + "step": 7572 + }, + { + "epoch": 0.7302796528447445, + "grad_norm": 1.8847947120666504, + "learning_rate": 8.456411910319537e-06, + "loss": 5.4213, + "step": 7573 + }, + { + "epoch": 0.7303760848601736, + "grad_norm": 1.6828891038894653, + "learning_rate": 8.45073439770622e-06, + "loss": 5.4152, + "step": 7574 + }, + { + "epoch": 0.7304725168756027, + "grad_norm": 2.2606518268585205, + "learning_rate": 8.445058403964148e-06, + "loss": 5.1005, + "step": 7575 + }, + { + "epoch": 0.7305689488910319, + "grad_norm": 2.0273590087890625, + "learning_rate": 8.43938392961423e-06, + "loss": 5.3345, + "step": 7576 + }, + { + "epoch": 0.7306653809064609, + "grad_norm": 2.148806571960449, + "learning_rate": 8.433710975177275e-06, + "loss": 5.1844, + "step": 7577 + }, + { + "epoch": 0.73076181292189, + "grad_norm": 1.8858139514923096, + "learning_rate": 8.428039541173935e-06, + "loss": 5.4153, + "step": 7578 + }, + { + "epoch": 0.7308582449373192, + "grad_norm": 2.3707847595214844, + "learning_rate": 8.42236962812473e-06, + "loss": 5.3494, + "step": 7579 + }, + { + "epoch": 0.7309546769527483, + "grad_norm": 1.6840932369232178, + "learning_rate": 8.416701236550037e-06, + "loss": 5.5072, + "step": 7580 + }, + { + "epoch": 0.7310511089681775, + "grad_norm": 2.498990297317505, + "learning_rate": 8.411034366970095e-06, + "loss": 5.3795, + "step": 7581 + }, + { + "epoch": 0.7311475409836066, + "grad_norm": 2.399998903274536, + "learning_rate": 8.405369019904996e-06, + "loss": 5.2607, + "step": 7582 + }, + { + "epoch": 0.7312439729990357, + "grad_norm": 2.8568646907806396, + "learning_rate": 8.399705195874708e-06, + "loss": 5.5328, + "step": 7583 + }, + { + "epoch": 0.7313404050144648, + "grad_norm": 1.5381368398666382, + "learning_rate": 8.394042895399055e-06, + "loss": 5.5017, + "step": 7584 + }, + { + "epoch": 0.7314368370298939, + "grad_norm": 2.0858891010284424, + "learning_rate": 8.388382118997687e-06, + "loss": 5.3324, + "step": 7585 + }, + { + "epoch": 0.731533269045323, + "grad_norm": 2.619211435317993, + "learning_rate": 8.382722867190179e-06, + "loss": 5.2557, + "step": 7586 + }, + { + "epoch": 0.7316297010607522, + "grad_norm": 1.946131706237793, + "learning_rate": 8.377065140495907e-06, + "loss": 5.3556, + "step": 7587 + }, + { + "epoch": 0.7317261330761813, + "grad_norm": 2.771963357925415, + "learning_rate": 8.371408939434136e-06, + "loss": 5.4723, + "step": 7588 + }, + { + "epoch": 0.7318225650916104, + "grad_norm": 2.390249490737915, + "learning_rate": 8.365754264523986e-06, + "loss": 5.3659, + "step": 7589 + }, + { + "epoch": 0.7319189971070396, + "grad_norm": 2.1810433864593506, + "learning_rate": 8.360101116284445e-06, + "loss": 5.423, + "step": 7590 + }, + { + "epoch": 0.7320154291224686, + "grad_norm": 2.34794545173645, + "learning_rate": 8.354449495234326e-06, + "loss": 5.2458, + "step": 7591 + }, + { + "epoch": 0.7321118611378978, + "grad_norm": 1.728811264038086, + "learning_rate": 8.348799401892362e-06, + "loss": 5.4447, + "step": 7592 + }, + { + "epoch": 0.7322082931533269, + "grad_norm": 1.8753111362457275, + "learning_rate": 8.343150836777077e-06, + "loss": 5.5653, + "step": 7593 + }, + { + "epoch": 0.732304725168756, + "grad_norm": 2.160787582397461, + "learning_rate": 8.337503800406927e-06, + "loss": 5.3043, + "step": 7594 + }, + { + "epoch": 0.7324011571841852, + "grad_norm": 1.905264973640442, + "learning_rate": 8.331858293300156e-06, + "loss": 5.5466, + "step": 7595 + }, + { + "epoch": 0.7324975891996143, + "grad_norm": 1.9425468444824219, + "learning_rate": 8.326214315974921e-06, + "loss": 5.4653, + "step": 7596 + }, + { + "epoch": 0.7325940212150434, + "grad_norm": 2.709285259246826, + "learning_rate": 8.320571868949212e-06, + "loss": 5.4167, + "step": 7597 + }, + { + "epoch": 0.7326904532304725, + "grad_norm": 1.9706722497940063, + "learning_rate": 8.314930952740888e-06, + "loss": 5.3625, + "step": 7598 + }, + { + "epoch": 0.7327868852459016, + "grad_norm": 1.8951958417892456, + "learning_rate": 8.309291567867663e-06, + "loss": 5.5226, + "step": 7599 + }, + { + "epoch": 0.7328833172613307, + "grad_norm": 1.6630029678344727, + "learning_rate": 8.303653714847118e-06, + "loss": 5.4727, + "step": 7600 + }, + { + "epoch": 0.7329797492767599, + "grad_norm": 2.4855940341949463, + "learning_rate": 8.298017394196691e-06, + "loss": 5.4797, + "step": 7601 + }, + { + "epoch": 0.733076181292189, + "grad_norm": 2.5611560344696045, + "learning_rate": 8.292382606433658e-06, + "loss": 5.065, + "step": 7602 + }, + { + "epoch": 0.7331726133076182, + "grad_norm": 2.3081583976745605, + "learning_rate": 8.286749352075198e-06, + "loss": 5.2781, + "step": 7603 + }, + { + "epoch": 0.7332690453230473, + "grad_norm": 1.6334149837493896, + "learning_rate": 8.281117631638297e-06, + "loss": 5.3554, + "step": 7604 + }, + { + "epoch": 0.7333654773384763, + "grad_norm": 2.1458547115325928, + "learning_rate": 8.27548744563986e-06, + "loss": 5.3587, + "step": 7605 + }, + { + "epoch": 0.7334619093539055, + "grad_norm": 2.44480037689209, + "learning_rate": 8.26985879459659e-06, + "loss": 5.3706, + "step": 7606 + }, + { + "epoch": 0.7335583413693346, + "grad_norm": 2.141848564147949, + "learning_rate": 8.26423167902509e-06, + "loss": 5.4163, + "step": 7607 + }, + { + "epoch": 0.7336547733847637, + "grad_norm": 3.0900652408599854, + "learning_rate": 8.258606099441808e-06, + "loss": 5.2562, + "step": 7608 + }, + { + "epoch": 0.7337512054001929, + "grad_norm": 2.656010627746582, + "learning_rate": 8.25298205636306e-06, + "loss": 5.2985, + "step": 7609 + }, + { + "epoch": 0.733847637415622, + "grad_norm": 1.6900137662887573, + "learning_rate": 8.247359550304993e-06, + "loss": 5.3496, + "step": 7610 + }, + { + "epoch": 0.7339440694310511, + "grad_norm": 2.7110636234283447, + "learning_rate": 8.241738581783658e-06, + "loss": 5.5805, + "step": 7611 + }, + { + "epoch": 0.7340405014464803, + "grad_norm": 2.5465803146362305, + "learning_rate": 8.236119151314927e-06, + "loss": 5.5621, + "step": 7612 + }, + { + "epoch": 0.7341369334619093, + "grad_norm": 1.8476300239562988, + "learning_rate": 8.230501259414546e-06, + "loss": 5.51, + "step": 7613 + }, + { + "epoch": 0.7342333654773385, + "grad_norm": 3.2952849864959717, + "learning_rate": 8.224884906598118e-06, + "loss": 5.496, + "step": 7614 + }, + { + "epoch": 0.7343297974927676, + "grad_norm": 2.3214359283447266, + "learning_rate": 8.219270093381109e-06, + "loss": 5.4182, + "step": 7615 + }, + { + "epoch": 0.7344262295081967, + "grad_norm": 4.385842323303223, + "learning_rate": 8.213656820278836e-06, + "loss": 5.3741, + "step": 7616 + }, + { + "epoch": 0.7345226615236259, + "grad_norm": 3.340182304382324, + "learning_rate": 8.20804508780648e-06, + "loss": 5.3791, + "step": 7617 + }, + { + "epoch": 0.734619093539055, + "grad_norm": 2.320188283920288, + "learning_rate": 8.202434896479078e-06, + "loss": 5.4409, + "step": 7618 + }, + { + "epoch": 0.734715525554484, + "grad_norm": 2.0247297286987305, + "learning_rate": 8.196826246811525e-06, + "loss": 5.3094, + "step": 7619 + }, + { + "epoch": 0.7348119575699132, + "grad_norm": 2.3404695987701416, + "learning_rate": 8.191219139318587e-06, + "loss": 5.0499, + "step": 7620 + }, + { + "epoch": 0.7349083895853423, + "grad_norm": 2.5355424880981445, + "learning_rate": 8.185613574514861e-06, + "loss": 4.9629, + "step": 7621 + }, + { + "epoch": 0.7350048216007714, + "grad_norm": 3.2169482707977295, + "learning_rate": 8.180009552914825e-06, + "loss": 5.4117, + "step": 7622 + }, + { + "epoch": 0.7351012536162006, + "grad_norm": 2.732889175415039, + "learning_rate": 8.174407075032808e-06, + "loss": 5.3819, + "step": 7623 + }, + { + "epoch": 0.7351976856316297, + "grad_norm": 2.221447706222534, + "learning_rate": 8.168806141383004e-06, + "loss": 5.6226, + "step": 7624 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 2.1449692249298096, + "learning_rate": 8.163206752479457e-06, + "loss": 5.3856, + "step": 7625 + }, + { + "epoch": 0.735390549662488, + "grad_norm": 2.9257774353027344, + "learning_rate": 8.15760890883607e-06, + "loss": 5.4554, + "step": 7626 + }, + { + "epoch": 0.735486981677917, + "grad_norm": 2.2953426837921143, + "learning_rate": 8.152012610966608e-06, + "loss": 5.3263, + "step": 7627 + }, + { + "epoch": 0.7355834136933462, + "grad_norm": 3.300687313079834, + "learning_rate": 8.1464178593847e-06, + "loss": 5.0354, + "step": 7628 + }, + { + "epoch": 0.7356798457087753, + "grad_norm": 3.6393015384674072, + "learning_rate": 8.140824654603801e-06, + "loss": 5.4415, + "step": 7629 + }, + { + "epoch": 0.7357762777242044, + "grad_norm": 3.0332839488983154, + "learning_rate": 8.135232997137283e-06, + "loss": 5.5582, + "step": 7630 + }, + { + "epoch": 0.7358727097396336, + "grad_norm": 2.9728972911834717, + "learning_rate": 8.129642887498312e-06, + "loss": 5.4628, + "step": 7631 + }, + { + "epoch": 0.7359691417550627, + "grad_norm": 2.865016460418701, + "learning_rate": 8.124054326199954e-06, + "loss": 5.4445, + "step": 7632 + }, + { + "epoch": 0.7360655737704918, + "grad_norm": 1.9201101064682007, + "learning_rate": 8.11846731375512e-06, + "loss": 5.3776, + "step": 7633 + }, + { + "epoch": 0.736162005785921, + "grad_norm": 2.7686612606048584, + "learning_rate": 8.112881850676584e-06, + "loss": 5.384, + "step": 7634 + }, + { + "epoch": 0.73625843780135, + "grad_norm": 3.515749454498291, + "learning_rate": 8.107297937476955e-06, + "loss": 5.2638, + "step": 7635 + }, + { + "epoch": 0.7363548698167792, + "grad_norm": 1.7804886102676392, + "learning_rate": 8.101715574668742e-06, + "loss": 5.3723, + "step": 7636 + }, + { + "epoch": 0.7364513018322083, + "grad_norm": 3.203589916229248, + "learning_rate": 8.09613476276426e-06, + "loss": 5.1416, + "step": 7637 + }, + { + "epoch": 0.7365477338476374, + "grad_norm": 3.380957841873169, + "learning_rate": 8.090555502275742e-06, + "loss": 4.7476, + "step": 7638 + }, + { + "epoch": 0.7366441658630666, + "grad_norm": 3.171032190322876, + "learning_rate": 8.084977793715218e-06, + "loss": 4.7029, + "step": 7639 + }, + { + "epoch": 0.7367405978784957, + "grad_norm": 2.0607879161834717, + "learning_rate": 8.079401637594614e-06, + "loss": 4.9053, + "step": 7640 + }, + { + "epoch": 0.7368370298939247, + "grad_norm": 1.9478511810302734, + "learning_rate": 8.073827034425702e-06, + "loss": 4.8905, + "step": 7641 + }, + { + "epoch": 0.7369334619093539, + "grad_norm": 2.439476251602173, + "learning_rate": 8.068253984720111e-06, + "loss": 4.5677, + "step": 7642 + }, + { + "epoch": 0.737029893924783, + "grad_norm": 2.3983914852142334, + "learning_rate": 8.062682488989331e-06, + "loss": 5.3902, + "step": 7643 + }, + { + "epoch": 0.7371263259402121, + "grad_norm": 2.080995798110962, + "learning_rate": 8.057112547744705e-06, + "loss": 4.9778, + "step": 7644 + }, + { + "epoch": 0.7372227579556413, + "grad_norm": 2.2646868228912354, + "learning_rate": 8.051544161497446e-06, + "loss": 4.901, + "step": 7645 + }, + { + "epoch": 0.7373191899710704, + "grad_norm": 3.1046762466430664, + "learning_rate": 8.045977330758583e-06, + "loss": 5.1283, + "step": 7646 + }, + { + "epoch": 0.7374156219864996, + "grad_norm": 1.914910912513733, + "learning_rate": 8.040412056039073e-06, + "loss": 5.2622, + "step": 7647 + }, + { + "epoch": 0.7375120540019287, + "grad_norm": 2.182661771774292, + "learning_rate": 8.034848337849655e-06, + "loss": 4.8294, + "step": 7648 + }, + { + "epoch": 0.7376084860173577, + "grad_norm": 1.8324979543685913, + "learning_rate": 8.02928617670099e-06, + "loss": 4.9065, + "step": 7649 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 1.6372358798980713, + "learning_rate": 8.023725573103547e-06, + "loss": 5.1427, + "step": 7650 + }, + { + "epoch": 0.737801350048216, + "grad_norm": 1.9851529598236084, + "learning_rate": 8.018166527567672e-06, + "loss": 5.1731, + "step": 7651 + }, + { + "epoch": 0.7378977820636451, + "grad_norm": 2.101217031478882, + "learning_rate": 8.012609040603574e-06, + "loss": 5.0022, + "step": 7652 + }, + { + "epoch": 0.7379942140790743, + "grad_norm": 3.420980215072632, + "learning_rate": 8.007053112721322e-06, + "loss": 4.5594, + "step": 7653 + }, + { + "epoch": 0.7380906460945034, + "grad_norm": 3.2128841876983643, + "learning_rate": 8.001498744430798e-06, + "loss": 4.7859, + "step": 7654 + }, + { + "epoch": 0.7381870781099324, + "grad_norm": 2.5791594982147217, + "learning_rate": 7.995945936241816e-06, + "loss": 4.6689, + "step": 7655 + }, + { + "epoch": 0.7382835101253616, + "grad_norm": 1.941231608390808, + "learning_rate": 7.990394688663982e-06, + "loss": 4.847, + "step": 7656 + }, + { + "epoch": 0.7383799421407907, + "grad_norm": 1.9324750900268555, + "learning_rate": 7.984845002206787e-06, + "loss": 5.0146, + "step": 7657 + }, + { + "epoch": 0.7384763741562199, + "grad_norm": 1.9590609073638916, + "learning_rate": 7.979296877379572e-06, + "loss": 4.8165, + "step": 7658 + }, + { + "epoch": 0.738572806171649, + "grad_norm": 3.111102819442749, + "learning_rate": 7.973750314691544e-06, + "loss": 5.034, + "step": 7659 + }, + { + "epoch": 0.7386692381870781, + "grad_norm": 3.891233444213867, + "learning_rate": 7.968205314651755e-06, + "loss": 4.8389, + "step": 7660 + }, + { + "epoch": 0.7387656702025073, + "grad_norm": 2.8053269386291504, + "learning_rate": 7.96266187776912e-06, + "loss": 5.035, + "step": 7661 + }, + { + "epoch": 0.7388621022179364, + "grad_norm": 1.7118840217590332, + "learning_rate": 7.957120004552408e-06, + "loss": 5.1528, + "step": 7662 + }, + { + "epoch": 0.7389585342333654, + "grad_norm": 2.1550207138061523, + "learning_rate": 7.951579695510245e-06, + "loss": 5.042, + "step": 7663 + }, + { + "epoch": 0.7390549662487946, + "grad_norm": 2.3646433353424072, + "learning_rate": 7.946040951151124e-06, + "loss": 5.0015, + "step": 7664 + }, + { + "epoch": 0.7391513982642237, + "grad_norm": 2.25998854637146, + "learning_rate": 7.940503771983358e-06, + "loss": 4.9245, + "step": 7665 + }, + { + "epoch": 0.7392478302796528, + "grad_norm": 3.9461405277252197, + "learning_rate": 7.93496815851517e-06, + "loss": 4.7706, + "step": 7666 + }, + { + "epoch": 0.739344262295082, + "grad_norm": 1.9258360862731934, + "learning_rate": 7.9294341112546e-06, + "loss": 4.9926, + "step": 7667 + }, + { + "epoch": 0.7394406943105111, + "grad_norm": 1.9467250108718872, + "learning_rate": 7.923901630709555e-06, + "loss": 5.0293, + "step": 7668 + }, + { + "epoch": 0.7395371263259403, + "grad_norm": 2.164325714111328, + "learning_rate": 7.918370717387799e-06, + "loss": 5.2254, + "step": 7669 + }, + { + "epoch": 0.7396335583413693, + "grad_norm": 1.8628484010696411, + "learning_rate": 7.912841371796956e-06, + "loss": 4.9674, + "step": 7670 + }, + { + "epoch": 0.7397299903567984, + "grad_norm": 2.6611316204071045, + "learning_rate": 7.907313594444499e-06, + "loss": 4.9074, + "step": 7671 + }, + { + "epoch": 0.7398264223722276, + "grad_norm": 2.0636847019195557, + "learning_rate": 7.901787385837772e-06, + "loss": 4.9776, + "step": 7672 + }, + { + "epoch": 0.7399228543876567, + "grad_norm": 1.6959586143493652, + "learning_rate": 7.89626274648394e-06, + "loss": 5.2333, + "step": 7673 + }, + { + "epoch": 0.7400192864030858, + "grad_norm": 1.9871950149536133, + "learning_rate": 7.890739676890075e-06, + "loss": 4.9505, + "step": 7674 + }, + { + "epoch": 0.740115718418515, + "grad_norm": 3.5528788566589355, + "learning_rate": 7.885218177563059e-06, + "loss": 4.9294, + "step": 7675 + }, + { + "epoch": 0.7402121504339441, + "grad_norm": 3.0550076961517334, + "learning_rate": 7.879698249009651e-06, + "loss": 4.8342, + "step": 7676 + }, + { + "epoch": 0.7403085824493731, + "grad_norm": 2.262901782989502, + "learning_rate": 7.874179891736469e-06, + "loss": 5.1797, + "step": 7677 + }, + { + "epoch": 0.7404050144648023, + "grad_norm": 2.449430227279663, + "learning_rate": 7.868663106249985e-06, + "loss": 5.2302, + "step": 7678 + }, + { + "epoch": 0.7405014464802314, + "grad_norm": 1.6366047859191895, + "learning_rate": 7.863147893056497e-06, + "loss": 5.0444, + "step": 7679 + }, + { + "epoch": 0.7405978784956606, + "grad_norm": 2.7109577655792236, + "learning_rate": 7.857634252662222e-06, + "loss": 4.7832, + "step": 7680 + }, + { + "epoch": 0.7406943105110897, + "grad_norm": 2.602586507797241, + "learning_rate": 7.852122185573161e-06, + "loss": 4.704, + "step": 7681 + }, + { + "epoch": 0.7407907425265188, + "grad_norm": 1.9068827629089355, + "learning_rate": 7.846611692295236e-06, + "loss": 5.2263, + "step": 7682 + }, + { + "epoch": 0.740887174541948, + "grad_norm": 1.9530092477798462, + "learning_rate": 7.84110277333417e-06, + "loss": 4.97, + "step": 7683 + }, + { + "epoch": 0.740983606557377, + "grad_norm": 2.3714330196380615, + "learning_rate": 7.835595429195568e-06, + "loss": 4.9867, + "step": 7684 + }, + { + "epoch": 0.7410800385728061, + "grad_norm": 2.129798173904419, + "learning_rate": 7.830089660384895e-06, + "loss": 5.2985, + "step": 7685 + }, + { + "epoch": 0.7411764705882353, + "grad_norm": 1.7379404306411743, + "learning_rate": 7.824585467407461e-06, + "loss": 5.2756, + "step": 7686 + }, + { + "epoch": 0.7412729026036644, + "grad_norm": 2.295031785964966, + "learning_rate": 7.819082850768434e-06, + "loss": 5.0414, + "step": 7687 + }, + { + "epoch": 0.7413693346190935, + "grad_norm": 2.129909038543701, + "learning_rate": 7.813581810972834e-06, + "loss": 5.4295, + "step": 7688 + }, + { + "epoch": 0.7414657666345227, + "grad_norm": 2.1311748027801514, + "learning_rate": 7.80808234852555e-06, + "loss": 5.2847, + "step": 7689 + }, + { + "epoch": 0.7415621986499518, + "grad_norm": 1.9139288663864136, + "learning_rate": 7.802584463931295e-06, + "loss": 5.0672, + "step": 7690 + }, + { + "epoch": 0.741658630665381, + "grad_norm": 2.7453548908233643, + "learning_rate": 7.797088157694684e-06, + "loss": 4.745, + "step": 7691 + }, + { + "epoch": 0.74175506268081, + "grad_norm": 2.3097927570343018, + "learning_rate": 7.791593430320135e-06, + "loss": 4.9174, + "step": 7692 + }, + { + "epoch": 0.7418514946962391, + "grad_norm": 2.696316957473755, + "learning_rate": 7.786100282311978e-06, + "loss": 5.0478, + "step": 7693 + }, + { + "epoch": 0.7419479267116683, + "grad_norm": 2.442284107208252, + "learning_rate": 7.78060871417434e-06, + "loss": 5.1121, + "step": 7694 + }, + { + "epoch": 0.7420443587270974, + "grad_norm": 2.4125654697418213, + "learning_rate": 7.77511872641124e-06, + "loss": 5.1419, + "step": 7695 + }, + { + "epoch": 0.7421407907425265, + "grad_norm": 2.4334030151367188, + "learning_rate": 7.769630319526543e-06, + "loss": 5.0826, + "step": 7696 + }, + { + "epoch": 0.7422372227579557, + "grad_norm": 2.078918218612671, + "learning_rate": 7.764143494023979e-06, + "loss": 5.0352, + "step": 7697 + }, + { + "epoch": 0.7423336547733848, + "grad_norm": 1.8566197156906128, + "learning_rate": 7.758658250407094e-06, + "loss": 5.0677, + "step": 7698 + }, + { + "epoch": 0.7424300867888138, + "grad_norm": 1.6212269067764282, + "learning_rate": 7.75317458917935e-06, + "loss": 5.1723, + "step": 7699 + }, + { + "epoch": 0.742526518804243, + "grad_norm": 2.2717604637145996, + "learning_rate": 7.747692510844007e-06, + "loss": 5.0537, + "step": 7700 + }, + { + "epoch": 0.7426229508196721, + "grad_norm": 1.9512358903884888, + "learning_rate": 7.74221201590421e-06, + "loss": 5.103, + "step": 7701 + }, + { + "epoch": 0.7427193828351013, + "grad_norm": 1.8156152963638306, + "learning_rate": 7.736733104862954e-06, + "loss": 5.3119, + "step": 7702 + }, + { + "epoch": 0.7428158148505304, + "grad_norm": 1.864711046218872, + "learning_rate": 7.731255778223084e-06, + "loss": 5.033, + "step": 7703 + }, + { + "epoch": 0.7429122468659595, + "grad_norm": 2.2556509971618652, + "learning_rate": 7.725780036487306e-06, + "loss": 4.9608, + "step": 7704 + }, + { + "epoch": 0.7430086788813887, + "grad_norm": 1.9847184419631958, + "learning_rate": 7.720305880158176e-06, + "loss": 4.953, + "step": 7705 + }, + { + "epoch": 0.7431051108968177, + "grad_norm": 2.430082082748413, + "learning_rate": 7.714833309738103e-06, + "loss": 5.0292, + "step": 7706 + }, + { + "epoch": 0.7432015429122468, + "grad_norm": 2.0059683322906494, + "learning_rate": 7.709362325729358e-06, + "loss": 5.1904, + "step": 7707 + }, + { + "epoch": 0.743297974927676, + "grad_norm": 2.4006617069244385, + "learning_rate": 7.70389292863406e-06, + "loss": 4.782, + "step": 7708 + }, + { + "epoch": 0.7433944069431051, + "grad_norm": 2.615469455718994, + "learning_rate": 7.698425118954169e-06, + "loss": 4.6704, + "step": 7709 + }, + { + "epoch": 0.7434908389585342, + "grad_norm": 2.3391542434692383, + "learning_rate": 7.69295889719154e-06, + "loss": 5.1021, + "step": 7710 + }, + { + "epoch": 0.7435872709739634, + "grad_norm": 1.6063498258590698, + "learning_rate": 7.687494263847833e-06, + "loss": 5.2365, + "step": 7711 + }, + { + "epoch": 0.7436837029893925, + "grad_norm": 2.107360601425171, + "learning_rate": 7.682031219424597e-06, + "loss": 5.2667, + "step": 7712 + }, + { + "epoch": 0.7437801350048217, + "grad_norm": 1.8801472187042236, + "learning_rate": 7.676569764423219e-06, + "loss": 5.0731, + "step": 7713 + }, + { + "epoch": 0.7438765670202507, + "grad_norm": 1.7336595058441162, + "learning_rate": 7.671109899344946e-06, + "loss": 4.8946, + "step": 7714 + }, + { + "epoch": 0.7439729990356798, + "grad_norm": 2.064246416091919, + "learning_rate": 7.665651624690879e-06, + "loss": 5.0234, + "step": 7715 + }, + { + "epoch": 0.744069431051109, + "grad_norm": 2.123934507369995, + "learning_rate": 7.66019494096198e-06, + "loss": 4.9864, + "step": 7716 + }, + { + "epoch": 0.7441658630665381, + "grad_norm": 2.209055185317993, + "learning_rate": 7.654739848659028e-06, + "loss": 5.2668, + "step": 7717 + }, + { + "epoch": 0.7442622950819672, + "grad_norm": 2.3442039489746094, + "learning_rate": 7.649286348282722e-06, + "loss": 4.8927, + "step": 7718 + }, + { + "epoch": 0.7443587270973964, + "grad_norm": 2.504922389984131, + "learning_rate": 7.643834440333553e-06, + "loss": 4.8754, + "step": 7719 + }, + { + "epoch": 0.7444551591128254, + "grad_norm": 2.1605465412139893, + "learning_rate": 7.638384125311895e-06, + "loss": 5.0238, + "step": 7720 + }, + { + "epoch": 0.7445515911282545, + "grad_norm": 2.5360958576202393, + "learning_rate": 7.632935403717973e-06, + "loss": 4.7336, + "step": 7721 + }, + { + "epoch": 0.7446480231436837, + "grad_norm": 1.7660707235336304, + "learning_rate": 7.627488276051867e-06, + "loss": 4.9674, + "step": 7722 + }, + { + "epoch": 0.7447444551591128, + "grad_norm": 1.8675146102905273, + "learning_rate": 7.622042742813501e-06, + "loss": 5.3379, + "step": 7723 + }, + { + "epoch": 0.744840887174542, + "grad_norm": 1.8091938495635986, + "learning_rate": 7.616598804502667e-06, + "loss": 5.1625, + "step": 7724 + }, + { + "epoch": 0.7449373191899711, + "grad_norm": 2.2860822677612305, + "learning_rate": 7.6111564616189986e-06, + "loss": 5.0611, + "step": 7725 + }, + { + "epoch": 0.7450337512054002, + "grad_norm": 2.054969549179077, + "learning_rate": 7.605715714661996e-06, + "loss": 5.1095, + "step": 7726 + }, + { + "epoch": 0.7451301832208294, + "grad_norm": 2.1630983352661133, + "learning_rate": 7.600276564130987e-06, + "loss": 5.1148, + "step": 7727 + }, + { + "epoch": 0.7452266152362584, + "grad_norm": 2.181032180786133, + "learning_rate": 7.594839010525179e-06, + "loss": 4.9571, + "step": 7728 + }, + { + "epoch": 0.7453230472516875, + "grad_norm": 2.4972267150878906, + "learning_rate": 7.589403054343624e-06, + "loss": 5.0912, + "step": 7729 + }, + { + "epoch": 0.7454194792671167, + "grad_norm": 2.0185298919677734, + "learning_rate": 7.583968696085231e-06, + "loss": 4.8958, + "step": 7730 + }, + { + "epoch": 0.7455159112825458, + "grad_norm": 2.1461267471313477, + "learning_rate": 7.578535936248751e-06, + "loss": 5.3593, + "step": 7731 + }, + { + "epoch": 0.7456123432979749, + "grad_norm": 1.8246114253997803, + "learning_rate": 7.573104775332804e-06, + "loss": 4.8038, + "step": 7732 + }, + { + "epoch": 0.7457087753134041, + "grad_norm": 2.1647422313690186, + "learning_rate": 7.5676752138358565e-06, + "loss": 4.9756, + "step": 7733 + }, + { + "epoch": 0.7458052073288332, + "grad_norm": 2.7779035568237305, + "learning_rate": 7.562247252256211e-06, + "loss": 4.7138, + "step": 7734 + }, + { + "epoch": 0.7459016393442623, + "grad_norm": 2.442939043045044, + "learning_rate": 7.556820891092062e-06, + "loss": 5.2225, + "step": 7735 + }, + { + "epoch": 0.7459980713596914, + "grad_norm": 1.8301786184310913, + "learning_rate": 7.5513961308414065e-06, + "loss": 4.997, + "step": 7736 + }, + { + "epoch": 0.7460945033751205, + "grad_norm": 1.6143827438354492, + "learning_rate": 7.5459729720021574e-06, + "loss": 5.3648, + "step": 7737 + }, + { + "epoch": 0.7461909353905497, + "grad_norm": 2.0627427101135254, + "learning_rate": 7.540551415072017e-06, + "loss": 5.0918, + "step": 7738 + }, + { + "epoch": 0.7462873674059788, + "grad_norm": 1.9333943128585815, + "learning_rate": 7.535131460548578e-06, + "loss": 4.9308, + "step": 7739 + }, + { + "epoch": 0.7463837994214079, + "grad_norm": 1.9854811429977417, + "learning_rate": 7.529713108929279e-06, + "loss": 4.9982, + "step": 7740 + }, + { + "epoch": 0.7464802314368371, + "grad_norm": 1.7555737495422363, + "learning_rate": 7.524296360711413e-06, + "loss": 5.5365, + "step": 7741 + }, + { + "epoch": 0.7465766634522661, + "grad_norm": 2.851595163345337, + "learning_rate": 7.5188812163921065e-06, + "loss": 5.2217, + "step": 7742 + }, + { + "epoch": 0.7466730954676952, + "grad_norm": 1.7843648195266724, + "learning_rate": 7.513467676468381e-06, + "loss": 4.9181, + "step": 7743 + }, + { + "epoch": 0.7467695274831244, + "grad_norm": 1.4868662357330322, + "learning_rate": 7.508055741437062e-06, + "loss": 4.7779, + "step": 7744 + }, + { + "epoch": 0.7468659594985535, + "grad_norm": 2.023073673248291, + "learning_rate": 7.502645411794856e-06, + "loss": 5.311, + "step": 7745 + }, + { + "epoch": 0.7469623915139827, + "grad_norm": 2.2021474838256836, + "learning_rate": 7.49723668803832e-06, + "loss": 4.8867, + "step": 7746 + }, + { + "epoch": 0.7470588235294118, + "grad_norm": 1.626327395439148, + "learning_rate": 7.491829570663858e-06, + "loss": 4.9154, + "step": 7747 + }, + { + "epoch": 0.7471552555448409, + "grad_norm": 1.6094722747802734, + "learning_rate": 7.4864240601677265e-06, + "loss": 5.3418, + "step": 7748 + }, + { + "epoch": 0.74725168756027, + "grad_norm": 1.6779537200927734, + "learning_rate": 7.481020157046042e-06, + "loss": 5.0753, + "step": 7749 + }, + { + "epoch": 0.7473481195756991, + "grad_norm": 1.7135676145553589, + "learning_rate": 7.475617861794765e-06, + "loss": 4.699, + "step": 7750 + }, + { + "epoch": 0.7474445515911282, + "grad_norm": 2.2000608444213867, + "learning_rate": 7.470217174909711e-06, + "loss": 4.7291, + "step": 7751 + }, + { + "epoch": 0.7475409836065574, + "grad_norm": 1.6412074565887451, + "learning_rate": 7.4648180968865586e-06, + "loss": 5.0028, + "step": 7752 + }, + { + "epoch": 0.7476374156219865, + "grad_norm": 1.7524209022521973, + "learning_rate": 7.4594206282208e-06, + "loss": 4.9473, + "step": 7753 + }, + { + "epoch": 0.7477338476374156, + "grad_norm": 2.130018949508667, + "learning_rate": 7.454024769407844e-06, + "loss": 4.8649, + "step": 7754 + }, + { + "epoch": 0.7478302796528448, + "grad_norm": 2.114165782928467, + "learning_rate": 7.448630520942892e-06, + "loss": 4.855, + "step": 7755 + }, + { + "epoch": 0.7479267116682738, + "grad_norm": 2.124228000640869, + "learning_rate": 7.44323788332103e-06, + "loss": 4.7492, + "step": 7756 + }, + { + "epoch": 0.748023143683703, + "grad_norm": 1.8009347915649414, + "learning_rate": 7.437846857037184e-06, + "loss": 5.1494, + "step": 7757 + }, + { + "epoch": 0.7481195756991321, + "grad_norm": 2.1966969966888428, + "learning_rate": 7.432457442586138e-06, + "loss": 4.9826, + "step": 7758 + }, + { + "epoch": 0.7482160077145612, + "grad_norm": 1.9293837547302246, + "learning_rate": 7.427069640462528e-06, + "loss": 5.3618, + "step": 7759 + }, + { + "epoch": 0.7483124397299904, + "grad_norm": 2.504673957824707, + "learning_rate": 7.421683451160844e-06, + "loss": 5.2046, + "step": 7760 + }, + { + "epoch": 0.7484088717454195, + "grad_norm": 2.677147388458252, + "learning_rate": 7.416298875175401e-06, + "loss": 5.2956, + "step": 7761 + }, + { + "epoch": 0.7485053037608486, + "grad_norm": 1.8455289602279663, + "learning_rate": 7.410915913000421e-06, + "loss": 5.1941, + "step": 7762 + }, + { + "epoch": 0.7486017357762778, + "grad_norm": 2.6732218265533447, + "learning_rate": 7.405534565129923e-06, + "loss": 5.072, + "step": 7763 + }, + { + "epoch": 0.7486981677917068, + "grad_norm": 2.1364126205444336, + "learning_rate": 7.400154832057807e-06, + "loss": 4.7386, + "step": 7764 + }, + { + "epoch": 0.7487945998071359, + "grad_norm": 1.8287632465362549, + "learning_rate": 7.394776714277818e-06, + "loss": 4.9565, + "step": 7765 + }, + { + "epoch": 0.7488910318225651, + "grad_norm": 2.6188340187072754, + "learning_rate": 7.389400212283554e-06, + "loss": 5.0678, + "step": 7766 + }, + { + "epoch": 0.7489874638379942, + "grad_norm": 2.410909652709961, + "learning_rate": 7.384025326568464e-06, + "loss": 5.3838, + "step": 7767 + }, + { + "epoch": 0.7490838958534234, + "grad_norm": 2.222321033477783, + "learning_rate": 7.378652057625846e-06, + "loss": 4.8108, + "step": 7768 + }, + { + "epoch": 0.7491803278688525, + "grad_norm": 2.0281379222869873, + "learning_rate": 7.373280405948857e-06, + "loss": 4.8808, + "step": 7769 + }, + { + "epoch": 0.7492767598842816, + "grad_norm": 2.334927558898926, + "learning_rate": 7.367910372030495e-06, + "loss": 4.9713, + "step": 7770 + }, + { + "epoch": 0.7493731918997107, + "grad_norm": 1.9595152139663696, + "learning_rate": 7.362541956363625e-06, + "loss": 5.2557, + "step": 7771 + }, + { + "epoch": 0.7494696239151398, + "grad_norm": 1.792446494102478, + "learning_rate": 7.357175159440932e-06, + "loss": 5.2041, + "step": 7772 + }, + { + "epoch": 0.7495660559305689, + "grad_norm": 3.9603612422943115, + "learning_rate": 7.351809981755004e-06, + "loss": 4.7816, + "step": 7773 + }, + { + "epoch": 0.7496624879459981, + "grad_norm": 2.2008371353149414, + "learning_rate": 7.346446423798225e-06, + "loss": 5.2244, + "step": 7774 + }, + { + "epoch": 0.7497589199614272, + "grad_norm": 2.668809175491333, + "learning_rate": 7.341084486062866e-06, + "loss": 5.0986, + "step": 7775 + }, + { + "epoch": 0.7498553519768563, + "grad_norm": 2.2539103031158447, + "learning_rate": 7.335724169041036e-06, + "loss": 4.7898, + "step": 7776 + }, + { + "epoch": 0.7499517839922855, + "grad_norm": 1.8721729516983032, + "learning_rate": 7.3303654732247125e-06, + "loss": 5.2509, + "step": 7777 + }, + { + "epoch": 0.7500482160077145, + "grad_norm": 1.6619539260864258, + "learning_rate": 7.325008399105679e-06, + "loss": 5.1777, + "step": 7778 + }, + { + "epoch": 0.7501446480231437, + "grad_norm": 1.9916404485702515, + "learning_rate": 7.319652947175637e-06, + "loss": 5.4121, + "step": 7779 + }, + { + "epoch": 0.7502410800385728, + "grad_norm": 2.4407079219818115, + "learning_rate": 7.314299117926071e-06, + "loss": 4.7984, + "step": 7780 + }, + { + "epoch": 0.7503375120540019, + "grad_norm": 1.7666264772415161, + "learning_rate": 7.308946911848383e-06, + "loss": 4.9259, + "step": 7781 + }, + { + "epoch": 0.7504339440694311, + "grad_norm": 2.1518383026123047, + "learning_rate": 7.3035963294337626e-06, + "loss": 5.4013, + "step": 7782 + }, + { + "epoch": 0.7505303760848602, + "grad_norm": 1.7328917980194092, + "learning_rate": 7.298247371173289e-06, + "loss": 5.2974, + "step": 7783 + }, + { + "epoch": 0.7506268081002893, + "grad_norm": 2.8991639614105225, + "learning_rate": 7.292900037557887e-06, + "loss": 5.327, + "step": 7784 + }, + { + "epoch": 0.7507232401157184, + "grad_norm": 1.5256037712097168, + "learning_rate": 7.287554329078333e-06, + "loss": 5.0466, + "step": 7785 + }, + { + "epoch": 0.7508196721311475, + "grad_norm": 2.007275342941284, + "learning_rate": 7.282210246225224e-06, + "loss": 5.0615, + "step": 7786 + }, + { + "epoch": 0.7509161041465766, + "grad_norm": 2.842711925506592, + "learning_rate": 7.27686778948907e-06, + "loss": 5.4228, + "step": 7787 + }, + { + "epoch": 0.7510125361620058, + "grad_norm": 2.4742164611816406, + "learning_rate": 7.271526959360167e-06, + "loss": 5.3205, + "step": 7788 + }, + { + "epoch": 0.7511089681774349, + "grad_norm": 2.2078099250793457, + "learning_rate": 7.266187756328701e-06, + "loss": 4.8682, + "step": 7789 + }, + { + "epoch": 0.7512054001928641, + "grad_norm": 2.5592503547668457, + "learning_rate": 7.260850180884696e-06, + "loss": 4.8613, + "step": 7790 + }, + { + "epoch": 0.7513018322082932, + "grad_norm": 2.514561414718628, + "learning_rate": 7.255514233518026e-06, + "loss": 4.8087, + "step": 7791 + }, + { + "epoch": 0.7513982642237222, + "grad_norm": 2.0645689964294434, + "learning_rate": 7.250179914718422e-06, + "loss": 5.1773, + "step": 7792 + }, + { + "epoch": 0.7514946962391514, + "grad_norm": 1.9843626022338867, + "learning_rate": 7.24484722497546e-06, + "loss": 5.1185, + "step": 7793 + }, + { + "epoch": 0.7515911282545805, + "grad_norm": 2.7204885482788086, + "learning_rate": 7.239516164778564e-06, + "loss": 4.9252, + "step": 7794 + }, + { + "epoch": 0.7516875602700096, + "grad_norm": 2.6080305576324463, + "learning_rate": 7.234186734617016e-06, + "loss": 4.8202, + "step": 7795 + }, + { + "epoch": 0.7517839922854388, + "grad_norm": 1.9680886268615723, + "learning_rate": 7.228858934979948e-06, + "loss": 5.217, + "step": 7796 + }, + { + "epoch": 0.7518804243008679, + "grad_norm": 1.704102873802185, + "learning_rate": 7.223532766356322e-06, + "loss": 5.2481, + "step": 7797 + }, + { + "epoch": 0.751976856316297, + "grad_norm": 1.3144434690475464, + "learning_rate": 7.2182082292349965e-06, + "loss": 5.4745, + "step": 7798 + }, + { + "epoch": 0.7520732883317262, + "grad_norm": 1.4136830568313599, + "learning_rate": 7.2128853241046235e-06, + "loss": 5.4124, + "step": 7799 + }, + { + "epoch": 0.7521697203471552, + "grad_norm": 2.565084934234619, + "learning_rate": 7.207564051453744e-06, + "loss": 5.4968, + "step": 7800 + }, + { + "epoch": 0.7522661523625844, + "grad_norm": 2.2421796321868896, + "learning_rate": 7.202244411770737e-06, + "loss": 5.4841, + "step": 7801 + }, + { + "epoch": 0.7523625843780135, + "grad_norm": 2.028075695037842, + "learning_rate": 7.19692640554383e-06, + "loss": 5.39, + "step": 7802 + }, + { + "epoch": 0.7524590163934426, + "grad_norm": 1.5839084386825562, + "learning_rate": 7.191610033261109e-06, + "loss": 5.2465, + "step": 7803 + }, + { + "epoch": 0.7525554484088718, + "grad_norm": 1.5394465923309326, + "learning_rate": 7.186295295410506e-06, + "loss": 5.5157, + "step": 7804 + }, + { + "epoch": 0.7526518804243009, + "grad_norm": 1.541796326637268, + "learning_rate": 7.180982192479779e-06, + "loss": 5.5786, + "step": 7805 + }, + { + "epoch": 0.75274831243973, + "grad_norm": 1.8301453590393066, + "learning_rate": 7.175670724956593e-06, + "loss": 5.4568, + "step": 7806 + }, + { + "epoch": 0.7528447444551591, + "grad_norm": 1.638710379600525, + "learning_rate": 7.170360893328401e-06, + "loss": 5.5024, + "step": 7807 + }, + { + "epoch": 0.7529411764705882, + "grad_norm": 1.6652607917785645, + "learning_rate": 7.165052698082542e-06, + "loss": 5.2922, + "step": 7808 + }, + { + "epoch": 0.7530376084860173, + "grad_norm": 1.502957820892334, + "learning_rate": 7.159746139706194e-06, + "loss": 5.1232, + "step": 7809 + }, + { + "epoch": 0.7531340405014465, + "grad_norm": 1.4900604486465454, + "learning_rate": 7.154441218686389e-06, + "loss": 5.5371, + "step": 7810 + }, + { + "epoch": 0.7532304725168756, + "grad_norm": 1.9222413301467896, + "learning_rate": 7.149137935510003e-06, + "loss": 5.5663, + "step": 7811 + }, + { + "epoch": 0.7533269045323048, + "grad_norm": 1.8208197355270386, + "learning_rate": 7.143836290663769e-06, + "loss": 5.401, + "step": 7812 + }, + { + "epoch": 0.7534233365477339, + "grad_norm": 2.178601026535034, + "learning_rate": 7.138536284634262e-06, + "loss": 5.5167, + "step": 7813 + }, + { + "epoch": 0.7535197685631629, + "grad_norm": 2.071408987045288, + "learning_rate": 7.133237917907909e-06, + "loss": 5.4461, + "step": 7814 + }, + { + "epoch": 0.7536162005785921, + "grad_norm": 2.213562488555908, + "learning_rate": 7.127941190970999e-06, + "loss": 5.4106, + "step": 7815 + }, + { + "epoch": 0.7537126325940212, + "grad_norm": 1.7244542837142944, + "learning_rate": 7.122646104309633e-06, + "loss": 5.4208, + "step": 7816 + }, + { + "epoch": 0.7538090646094503, + "grad_norm": 2.396327495574951, + "learning_rate": 7.117352658409821e-06, + "loss": 5.4618, + "step": 7817 + }, + { + "epoch": 0.7539054966248795, + "grad_norm": 2.135376453399658, + "learning_rate": 7.112060853757363e-06, + "loss": 5.3169, + "step": 7818 + }, + { + "epoch": 0.7540019286403086, + "grad_norm": 3.2455759048461914, + "learning_rate": 7.106770690837941e-06, + "loss": 5.4204, + "step": 7819 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 2.5616114139556885, + "learning_rate": 7.1014821701370855e-06, + "loss": 5.2815, + "step": 7820 + }, + { + "epoch": 0.7541947926711668, + "grad_norm": 2.2841854095458984, + "learning_rate": 7.096195292140173e-06, + "loss": 5.5703, + "step": 7821 + }, + { + "epoch": 0.7542912246865959, + "grad_norm": 2.3416502475738525, + "learning_rate": 7.090910057332406e-06, + "loss": 5.2406, + "step": 7822 + }, + { + "epoch": 0.7543876567020251, + "grad_norm": 1.9742361307144165, + "learning_rate": 7.085626466198889e-06, + "loss": 5.425, + "step": 7823 + }, + { + "epoch": 0.7544840887174542, + "grad_norm": 2.430588960647583, + "learning_rate": 7.080344519224508e-06, + "loss": 5.4292, + "step": 7824 + }, + { + "epoch": 0.7545805207328833, + "grad_norm": 2.200115203857422, + "learning_rate": 7.075064216894067e-06, + "loss": 5.4829, + "step": 7825 + }, + { + "epoch": 0.7546769527483125, + "grad_norm": 1.8534332513809204, + "learning_rate": 7.0697855596921634e-06, + "loss": 5.4395, + "step": 7826 + }, + { + "epoch": 0.7547733847637416, + "grad_norm": 1.9878209829330444, + "learning_rate": 7.064508548103271e-06, + "loss": 5.2547, + "step": 7827 + }, + { + "epoch": 0.7548698167791706, + "grad_norm": 3.1632888317108154, + "learning_rate": 7.059233182611711e-06, + "loss": 5.0864, + "step": 7828 + }, + { + "epoch": 0.7549662487945998, + "grad_norm": 2.6348884105682373, + "learning_rate": 7.053959463701645e-06, + "loss": 5.2371, + "step": 7829 + }, + { + "epoch": 0.7550626808100289, + "grad_norm": 2.170807361602783, + "learning_rate": 7.048687391857095e-06, + "loss": 5.295, + "step": 7830 + }, + { + "epoch": 0.755159112825458, + "grad_norm": 2.5100955963134766, + "learning_rate": 7.0434169675619196e-06, + "loss": 5.365, + "step": 7831 + }, + { + "epoch": 0.7552555448408872, + "grad_norm": 2.4390971660614014, + "learning_rate": 7.0381481912998425e-06, + "loss": 5.1376, + "step": 7832 + }, + { + "epoch": 0.7553519768563163, + "grad_norm": 2.2469053268432617, + "learning_rate": 7.032881063554408e-06, + "loss": 5.1302, + "step": 7833 + }, + { + "epoch": 0.7554484088717455, + "grad_norm": 2.2546184062957764, + "learning_rate": 7.0276155848090385e-06, + "loss": 5.581, + "step": 7834 + }, + { + "epoch": 0.7555448408871746, + "grad_norm": 2.2009899616241455, + "learning_rate": 7.022351755546988e-06, + "loss": 5.3187, + "step": 7835 + }, + { + "epoch": 0.7556412729026036, + "grad_norm": 2.23724627494812, + "learning_rate": 7.017089576251365e-06, + "loss": 5.4562, + "step": 7836 + }, + { + "epoch": 0.7557377049180328, + "grad_norm": 2.7563376426696777, + "learning_rate": 7.01182904740513e-06, + "loss": 5.2642, + "step": 7837 + }, + { + "epoch": 0.7558341369334619, + "grad_norm": 1.779081106185913, + "learning_rate": 7.006570169491083e-06, + "loss": 5.3873, + "step": 7838 + }, + { + "epoch": 0.755930568948891, + "grad_norm": 2.6316041946411133, + "learning_rate": 7.001312942991883e-06, + "loss": 5.5599, + "step": 7839 + }, + { + "epoch": 0.7560270009643202, + "grad_norm": 1.452599287033081, + "learning_rate": 6.996057368390033e-06, + "loss": 5.3314, + "step": 7840 + }, + { + "epoch": 0.7561234329797493, + "grad_norm": 2.5110106468200684, + "learning_rate": 6.990803446167865e-06, + "loss": 5.0742, + "step": 7841 + }, + { + "epoch": 0.7562198649951783, + "grad_norm": 2.0247528553009033, + "learning_rate": 6.985551176807603e-06, + "loss": 5.6215, + "step": 7842 + }, + { + "epoch": 0.7563162970106075, + "grad_norm": 2.464146375656128, + "learning_rate": 6.980300560791278e-06, + "loss": 5.3392, + "step": 7843 + }, + { + "epoch": 0.7564127290260366, + "grad_norm": 1.9979605674743652, + "learning_rate": 6.975051598600787e-06, + "loss": 5.3248, + "step": 7844 + }, + { + "epoch": 0.7565091610414658, + "grad_norm": 1.6189055442810059, + "learning_rate": 6.969804290717874e-06, + "loss": 5.4437, + "step": 7845 + }, + { + "epoch": 0.7566055930568949, + "grad_norm": 1.8531765937805176, + "learning_rate": 6.964558637624135e-06, + "loss": 5.1735, + "step": 7846 + }, + { + "epoch": 0.756702025072324, + "grad_norm": 2.773867607116699, + "learning_rate": 6.959314639801004e-06, + "loss": 5.2855, + "step": 7847 + }, + { + "epoch": 0.7567984570877532, + "grad_norm": 3.8007946014404297, + "learning_rate": 6.954072297729777e-06, + "loss": 5.262, + "step": 7848 + }, + { + "epoch": 0.7568948891031823, + "grad_norm": 1.9919875860214233, + "learning_rate": 6.948831611891571e-06, + "loss": 5.3123, + "step": 7849 + }, + { + "epoch": 0.7569913211186113, + "grad_norm": 2.333167791366577, + "learning_rate": 6.943592582767397e-06, + "loss": 5.4609, + "step": 7850 + }, + { + "epoch": 0.7570877531340405, + "grad_norm": 2.6167073249816895, + "learning_rate": 6.938355210838066e-06, + "loss": 5.3966, + "step": 7851 + }, + { + "epoch": 0.7571841851494696, + "grad_norm": 2.496473789215088, + "learning_rate": 6.933119496584259e-06, + "loss": 5.3919, + "step": 7852 + }, + { + "epoch": 0.7572806171648987, + "grad_norm": 1.6527730226516724, + "learning_rate": 6.927885440486512e-06, + "loss": 5.4396, + "step": 7853 + }, + { + "epoch": 0.7573770491803279, + "grad_norm": 1.7858532667160034, + "learning_rate": 6.922653043025193e-06, + "loss": 5.6026, + "step": 7854 + }, + { + "epoch": 0.757473481195757, + "grad_norm": 1.7915178537368774, + "learning_rate": 6.917422304680532e-06, + "loss": 5.5448, + "step": 7855 + }, + { + "epoch": 0.7575699132111862, + "grad_norm": 2.231684446334839, + "learning_rate": 6.9121932259325925e-06, + "loss": 5.5667, + "step": 7856 + }, + { + "epoch": 0.7576663452266152, + "grad_norm": 2.094564437866211, + "learning_rate": 6.906965807261298e-06, + "loss": 5.3241, + "step": 7857 + }, + { + "epoch": 0.7577627772420443, + "grad_norm": 1.604681134223938, + "learning_rate": 6.90174004914641e-06, + "loss": 5.2837, + "step": 7858 + }, + { + "epoch": 0.7578592092574735, + "grad_norm": 2.4054453372955322, + "learning_rate": 6.896515952067556e-06, + "loss": 5.581, + "step": 7859 + }, + { + "epoch": 0.7579556412729026, + "grad_norm": 3.2367336750030518, + "learning_rate": 6.891293516504169e-06, + "loss": 5.2281, + "step": 7860 + }, + { + "epoch": 0.7580520732883317, + "grad_norm": 2.0130932331085205, + "learning_rate": 6.88607274293559e-06, + "loss": 5.688, + "step": 7861 + }, + { + "epoch": 0.7581485053037609, + "grad_norm": 2.315951347351074, + "learning_rate": 6.880853631840953e-06, + "loss": 5.1276, + "step": 7862 + }, + { + "epoch": 0.75824493731919, + "grad_norm": 2.948784828186035, + "learning_rate": 6.875636183699269e-06, + "loss": 5.44, + "step": 7863 + }, + { + "epoch": 0.758341369334619, + "grad_norm": 2.9663314819335938, + "learning_rate": 6.870420398989389e-06, + "loss": 5.1052, + "step": 7864 + }, + { + "epoch": 0.7584378013500482, + "grad_norm": 2.328395366668701, + "learning_rate": 6.8652062781900175e-06, + "loss": 5.4701, + "step": 7865 + }, + { + "epoch": 0.7585342333654773, + "grad_norm": 2.221959352493286, + "learning_rate": 6.8599938217796785e-06, + "loss": 5.3728, + "step": 7866 + }, + { + "epoch": 0.7586306653809065, + "grad_norm": 2.1934351921081543, + "learning_rate": 6.8547830302367945e-06, + "loss": 4.9841, + "step": 7867 + }, + { + "epoch": 0.7587270973963356, + "grad_norm": 1.670630693435669, + "learning_rate": 6.8495739040395774e-06, + "loss": 5.4845, + "step": 7868 + }, + { + "epoch": 0.7588235294117647, + "grad_norm": 1.889986515045166, + "learning_rate": 6.844366443666142e-06, + "loss": 5.4341, + "step": 7869 + }, + { + "epoch": 0.7589199614271939, + "grad_norm": 1.9492933750152588, + "learning_rate": 6.839160649594401e-06, + "loss": 5.4323, + "step": 7870 + }, + { + "epoch": 0.759016393442623, + "grad_norm": 2.222216844558716, + "learning_rate": 6.833956522302143e-06, + "loss": 5.5729, + "step": 7871 + }, + { + "epoch": 0.759112825458052, + "grad_norm": 1.6525352001190186, + "learning_rate": 6.828754062266995e-06, + "loss": 5.411, + "step": 7872 + }, + { + "epoch": 0.7592092574734812, + "grad_norm": 2.003157615661621, + "learning_rate": 6.8235532699664375e-06, + "loss": 5.5104, + "step": 7873 + }, + { + "epoch": 0.7593056894889103, + "grad_norm": 2.071263551712036, + "learning_rate": 6.818354145877787e-06, + "loss": 5.4816, + "step": 7874 + }, + { + "epoch": 0.7594021215043394, + "grad_norm": 2.6929070949554443, + "learning_rate": 6.813156690478215e-06, + "loss": 5.4132, + "step": 7875 + }, + { + "epoch": 0.7594985535197686, + "grad_norm": 1.6647672653198242, + "learning_rate": 6.8079609042447454e-06, + "loss": 5.4119, + "step": 7876 + }, + { + "epoch": 0.7595949855351977, + "grad_norm": 1.5925025939941406, + "learning_rate": 6.802766787654219e-06, + "loss": 5.2572, + "step": 7877 + }, + { + "epoch": 0.7596914175506269, + "grad_norm": 1.691794753074646, + "learning_rate": 6.797574341183374e-06, + "loss": 5.4904, + "step": 7878 + }, + { + "epoch": 0.7597878495660559, + "grad_norm": 1.6960018873214722, + "learning_rate": 6.792383565308735e-06, + "loss": 5.5687, + "step": 7879 + }, + { + "epoch": 0.759884281581485, + "grad_norm": 1.583528995513916, + "learning_rate": 6.787194460506741e-06, + "loss": 5.3483, + "step": 7880 + }, + { + "epoch": 0.7599807135969142, + "grad_norm": 1.562453269958496, + "learning_rate": 6.782007027253614e-06, + "loss": 5.4321, + "step": 7881 + }, + { + "epoch": 0.7600771456123433, + "grad_norm": 2.0571095943450928, + "learning_rate": 6.77682126602546e-06, + "loss": 5.6494, + "step": 7882 + }, + { + "epoch": 0.7601735776277724, + "grad_norm": 1.6422430276870728, + "learning_rate": 6.771637177298221e-06, + "loss": 5.289, + "step": 7883 + }, + { + "epoch": 0.7602700096432016, + "grad_norm": 1.779820442199707, + "learning_rate": 6.766454761547694e-06, + "loss": 5.4416, + "step": 7884 + }, + { + "epoch": 0.7603664416586307, + "grad_norm": 1.555501103401184, + "learning_rate": 6.761274019249492e-06, + "loss": 5.3595, + "step": 7885 + }, + { + "epoch": 0.7604628736740597, + "grad_norm": 2.369541645050049, + "learning_rate": 6.756094950879127e-06, + "loss": 5.3736, + "step": 7886 + }, + { + "epoch": 0.7605593056894889, + "grad_norm": 3.0728161334991455, + "learning_rate": 6.750917556911906e-06, + "loss": 5.2132, + "step": 7887 + }, + { + "epoch": 0.760655737704918, + "grad_norm": 2.2137198448181152, + "learning_rate": 6.745741837823011e-06, + "loss": 5.5526, + "step": 7888 + }, + { + "epoch": 0.7607521697203472, + "grad_norm": 2.2679874897003174, + "learning_rate": 6.740567794087463e-06, + "loss": 5.4877, + "step": 7889 + }, + { + "epoch": 0.7608486017357763, + "grad_norm": 1.6596803665161133, + "learning_rate": 6.7353954261801275e-06, + "loss": 5.6205, + "step": 7890 + }, + { + "epoch": 0.7609450337512054, + "grad_norm": 1.5377495288848877, + "learning_rate": 6.730224734575721e-06, + "loss": 5.405, + "step": 7891 + }, + { + "epoch": 0.7610414657666346, + "grad_norm": 2.0766336917877197, + "learning_rate": 6.725055719748807e-06, + "loss": 5.438, + "step": 7892 + }, + { + "epoch": 0.7611378977820636, + "grad_norm": 2.2453455924987793, + "learning_rate": 6.719888382173775e-06, + "loss": 5.472, + "step": 7893 + }, + { + "epoch": 0.7612343297974927, + "grad_norm": 3.2741763591766357, + "learning_rate": 6.714722722324901e-06, + "loss": 5.3201, + "step": 7894 + }, + { + "epoch": 0.7613307618129219, + "grad_norm": 2.047308921813965, + "learning_rate": 6.709558740676261e-06, + "loss": 5.4773, + "step": 7895 + }, + { + "epoch": 0.761427193828351, + "grad_norm": 1.86739182472229, + "learning_rate": 6.70439643770181e-06, + "loss": 5.3167, + "step": 7896 + }, + { + "epoch": 0.7615236258437801, + "grad_norm": 2.328495979309082, + "learning_rate": 6.699235813875334e-06, + "loss": 5.6792, + "step": 7897 + }, + { + "epoch": 0.7616200578592093, + "grad_norm": 1.642271637916565, + "learning_rate": 6.694076869670474e-06, + "loss": 5.182, + "step": 7898 + }, + { + "epoch": 0.7617164898746384, + "grad_norm": 2.0891950130462646, + "learning_rate": 6.6889196055607065e-06, + "loss": 5.1445, + "step": 7899 + }, + { + "epoch": 0.7618129218900676, + "grad_norm": 2.0893352031707764, + "learning_rate": 6.683764022019359e-06, + "loss": 5.275, + "step": 7900 + }, + { + "epoch": 0.7619093539054966, + "grad_norm": 1.6164714097976685, + "learning_rate": 6.678610119519607e-06, + "loss": 5.3005, + "step": 7901 + }, + { + "epoch": 0.7620057859209257, + "grad_norm": 2.1225733757019043, + "learning_rate": 6.673457898534466e-06, + "loss": 5.4448, + "step": 7902 + }, + { + "epoch": 0.7621022179363549, + "grad_norm": 1.7696400880813599, + "learning_rate": 6.6683073595368125e-06, + "loss": 5.1379, + "step": 7903 + }, + { + "epoch": 0.762198649951784, + "grad_norm": 1.7732999324798584, + "learning_rate": 6.663158502999331e-06, + "loss": 5.4859, + "step": 7904 + }, + { + "epoch": 0.7622950819672131, + "grad_norm": 2.249516010284424, + "learning_rate": 6.658011329394609e-06, + "loss": 5.4892, + "step": 7905 + }, + { + "epoch": 0.7623915139826423, + "grad_norm": 1.644970417022705, + "learning_rate": 6.652865839195024e-06, + "loss": 5.4984, + "step": 7906 + }, + { + "epoch": 0.7624879459980713, + "grad_norm": 2.1936886310577393, + "learning_rate": 6.64772203287283e-06, + "loss": 5.3233, + "step": 7907 + }, + { + "epoch": 0.7625843780135004, + "grad_norm": 1.543492078781128, + "learning_rate": 6.642579910900121e-06, + "loss": 5.4707, + "step": 7908 + }, + { + "epoch": 0.7626808100289296, + "grad_norm": 2.1119186878204346, + "learning_rate": 6.637439473748841e-06, + "loss": 5.3055, + "step": 7909 + }, + { + "epoch": 0.7627772420443587, + "grad_norm": 1.9528309106826782, + "learning_rate": 6.6323007218907496e-06, + "loss": 5.5625, + "step": 7910 + }, + { + "epoch": 0.7628736740597879, + "grad_norm": 1.6335896253585815, + "learning_rate": 6.6271636557975084e-06, + "loss": 5.4524, + "step": 7911 + }, + { + "epoch": 0.762970106075217, + "grad_norm": 1.8427172899246216, + "learning_rate": 6.622028275940556e-06, + "loss": 5.4006, + "step": 7912 + }, + { + "epoch": 0.7630665380906461, + "grad_norm": 1.928955078125, + "learning_rate": 6.616894582791244e-06, + "loss": 5.4631, + "step": 7913 + }, + { + "epoch": 0.7631629701060753, + "grad_norm": 1.8795092105865479, + "learning_rate": 6.611762576820713e-06, + "loss": 5.2026, + "step": 7914 + }, + { + "epoch": 0.7632594021215043, + "grad_norm": 2.10746693611145, + "learning_rate": 6.606632258499981e-06, + "loss": 5.4337, + "step": 7915 + }, + { + "epoch": 0.7633558341369334, + "grad_norm": 1.617790699005127, + "learning_rate": 6.601503628299902e-06, + "loss": 5.5237, + "step": 7916 + }, + { + "epoch": 0.7634522661523626, + "grad_norm": 2.1018471717834473, + "learning_rate": 6.596376686691178e-06, + "loss": 5.2401, + "step": 7917 + }, + { + "epoch": 0.7635486981677917, + "grad_norm": 1.8427331447601318, + "learning_rate": 6.591251434144347e-06, + "loss": 5.4355, + "step": 7918 + }, + { + "epoch": 0.7636451301832208, + "grad_norm": 1.737480640411377, + "learning_rate": 6.586127871129802e-06, + "loss": 5.2377, + "step": 7919 + }, + { + "epoch": 0.76374156219865, + "grad_norm": 2.046557664871216, + "learning_rate": 6.581005998117787e-06, + "loss": 5.3092, + "step": 7920 + }, + { + "epoch": 0.763837994214079, + "grad_norm": 2.2407262325286865, + "learning_rate": 6.575885815578353e-06, + "loss": 5.1589, + "step": 7921 + }, + { + "epoch": 0.7639344262295082, + "grad_norm": 1.774170160293579, + "learning_rate": 6.570767323981461e-06, + "loss": 5.4037, + "step": 7922 + }, + { + "epoch": 0.7640308582449373, + "grad_norm": 2.317420721054077, + "learning_rate": 6.565650523796843e-06, + "loss": 5.5033, + "step": 7923 + }, + { + "epoch": 0.7641272902603664, + "grad_norm": 2.930422782897949, + "learning_rate": 6.560535415494148e-06, + "loss": 5.3085, + "step": 7924 + }, + { + "epoch": 0.7642237222757956, + "grad_norm": 2.305297374725342, + "learning_rate": 6.55542199954281e-06, + "loss": 5.3388, + "step": 7925 + }, + { + "epoch": 0.7643201542912247, + "grad_norm": 1.9056237936019897, + "learning_rate": 6.5503102764121385e-06, + "loss": 5.2939, + "step": 7926 + }, + { + "epoch": 0.7644165863066538, + "grad_norm": 1.7816277742385864, + "learning_rate": 6.545200246571281e-06, + "loss": 5.3131, + "step": 7927 + }, + { + "epoch": 0.764513018322083, + "grad_norm": 1.8464347124099731, + "learning_rate": 6.54009191048924e-06, + "loss": 5.3907, + "step": 7928 + }, + { + "epoch": 0.764609450337512, + "grad_norm": 1.9328206777572632, + "learning_rate": 6.534985268634827e-06, + "loss": 5.2212, + "step": 7929 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 2.5139760971069336, + "learning_rate": 6.529880321476756e-06, + "loss": 5.3037, + "step": 7930 + }, + { + "epoch": 0.7648023143683703, + "grad_norm": 2.4219887256622314, + "learning_rate": 6.524777069483526e-06, + "loss": 5.3732, + "step": 7931 + }, + { + "epoch": 0.7648987463837994, + "grad_norm": 2.5595340728759766, + "learning_rate": 6.519675513123519e-06, + "loss": 5.3373, + "step": 7932 + }, + { + "epoch": 0.7649951783992286, + "grad_norm": 3.0012354850769043, + "learning_rate": 6.514575652864946e-06, + "loss": 5.5091, + "step": 7933 + }, + { + "epoch": 0.7650916104146577, + "grad_norm": 2.2066705226898193, + "learning_rate": 6.5094774891758695e-06, + "loss": 5.4212, + "step": 7934 + }, + { + "epoch": 0.7651880424300868, + "grad_norm": 2.591848850250244, + "learning_rate": 6.504381022524192e-06, + "loss": 5.3684, + "step": 7935 + }, + { + "epoch": 0.765284474445516, + "grad_norm": 3.3092193603515625, + "learning_rate": 6.499286253377657e-06, + "loss": 5.5237, + "step": 7936 + }, + { + "epoch": 0.765380906460945, + "grad_norm": 2.6700031757354736, + "learning_rate": 6.494193182203862e-06, + "loss": 5.4776, + "step": 7937 + }, + { + "epoch": 0.7654773384763741, + "grad_norm": 3.040289878845215, + "learning_rate": 6.489101809470243e-06, + "loss": 5.5392, + "step": 7938 + }, + { + "epoch": 0.7655737704918033, + "grad_norm": 2.386781692504883, + "learning_rate": 6.484012135644074e-06, + "loss": 5.5621, + "step": 7939 + }, + { + "epoch": 0.7656702025072324, + "grad_norm": 1.6309713125228882, + "learning_rate": 6.478924161192479e-06, + "loss": 5.4857, + "step": 7940 + }, + { + "epoch": 0.7657666345226615, + "grad_norm": 2.0137784481048584, + "learning_rate": 6.473837886582429e-06, + "loss": 5.3391, + "step": 7941 + }, + { + "epoch": 0.7658630665380907, + "grad_norm": 2.5869812965393066, + "learning_rate": 6.468753312280737e-06, + "loss": 5.2344, + "step": 7942 + }, + { + "epoch": 0.7659594985535197, + "grad_norm": 2.5568184852600098, + "learning_rate": 6.463670438754055e-06, + "loss": 5.0963, + "step": 7943 + }, + { + "epoch": 0.7660559305689489, + "grad_norm": 2.558476209640503, + "learning_rate": 6.458589266468887e-06, + "loss": 5.2108, + "step": 7944 + }, + { + "epoch": 0.766152362584378, + "grad_norm": 3.317772388458252, + "learning_rate": 6.453509795891577e-06, + "loss": 5.4527, + "step": 7945 + }, + { + "epoch": 0.7662487945998071, + "grad_norm": 3.414198398590088, + "learning_rate": 6.4484320274883105e-06, + "loss": 5.1572, + "step": 7946 + }, + { + "epoch": 0.7663452266152363, + "grad_norm": 1.9176915884017944, + "learning_rate": 6.443355961725125e-06, + "loss": 5.5438, + "step": 7947 + }, + { + "epoch": 0.7664416586306654, + "grad_norm": 2.599219799041748, + "learning_rate": 6.438281599067875e-06, + "loss": 5.3384, + "step": 7948 + }, + { + "epoch": 0.7665380906460945, + "grad_norm": 2.726928472518921, + "learning_rate": 6.43320893998231e-06, + "loss": 5.2716, + "step": 7949 + }, + { + "epoch": 0.7666345226615237, + "grad_norm": 1.6399154663085938, + "learning_rate": 6.428137984933968e-06, + "loss": 5.2688, + "step": 7950 + }, + { + "epoch": 0.7667309546769527, + "grad_norm": 2.2525641918182373, + "learning_rate": 6.423068734388265e-06, + "loss": 5.2622, + "step": 7951 + }, + { + "epoch": 0.7668273866923818, + "grad_norm": 2.6548705101013184, + "learning_rate": 6.4180011888104495e-06, + "loss": 5.2902, + "step": 7952 + }, + { + "epoch": 0.766923818707811, + "grad_norm": 2.264279842376709, + "learning_rate": 6.412935348665619e-06, + "loss": 5.7141, + "step": 7953 + }, + { + "epoch": 0.7670202507232401, + "grad_norm": 1.6241872310638428, + "learning_rate": 6.407871214418693e-06, + "loss": 5.4385, + "step": 7954 + }, + { + "epoch": 0.7671166827386693, + "grad_norm": 1.6876581907272339, + "learning_rate": 6.402808786534478e-06, + "loss": 5.4318, + "step": 7955 + }, + { + "epoch": 0.7672131147540984, + "grad_norm": 1.7843433618545532, + "learning_rate": 6.397748065477566e-06, + "loss": 5.172, + "step": 7956 + }, + { + "epoch": 0.7673095467695275, + "grad_norm": 1.5119595527648926, + "learning_rate": 6.392689051712458e-06, + "loss": 5.4774, + "step": 7957 + }, + { + "epoch": 0.7674059787849566, + "grad_norm": 1.4411157369613647, + "learning_rate": 6.3876317457034404e-06, + "loss": 5.4787, + "step": 7958 + }, + { + "epoch": 0.7675024108003857, + "grad_norm": 2.3299810886383057, + "learning_rate": 6.382576147914671e-06, + "loss": 5.3233, + "step": 7959 + }, + { + "epoch": 0.7675988428158148, + "grad_norm": 2.3837389945983887, + "learning_rate": 6.377522258810148e-06, + "loss": 5.4065, + "step": 7960 + }, + { + "epoch": 0.767695274831244, + "grad_norm": 1.8664299249649048, + "learning_rate": 6.372470078853712e-06, + "loss": 5.2785, + "step": 7961 + }, + { + "epoch": 0.7677917068466731, + "grad_norm": 1.7921850681304932, + "learning_rate": 6.367419608509048e-06, + "loss": 5.4334, + "step": 7962 + }, + { + "epoch": 0.7678881388621022, + "grad_norm": 2.2146646976470947, + "learning_rate": 6.362370848239677e-06, + "loss": 5.2828, + "step": 7963 + }, + { + "epoch": 0.7679845708775314, + "grad_norm": 2.694770097732544, + "learning_rate": 6.35732379850898e-06, + "loss": 5.3913, + "step": 7964 + }, + { + "epoch": 0.7680810028929604, + "grad_norm": 1.8230751752853394, + "learning_rate": 6.352278459780142e-06, + "loss": 5.212, + "step": 7965 + }, + { + "epoch": 0.7681774349083896, + "grad_norm": 1.8788238763809204, + "learning_rate": 6.34723483251625e-06, + "loss": 5.3063, + "step": 7966 + }, + { + "epoch": 0.7682738669238187, + "grad_norm": 2.1896331310272217, + "learning_rate": 6.342192917180173e-06, + "loss": 5.1322, + "step": 7967 + }, + { + "epoch": 0.7683702989392478, + "grad_norm": 1.9103590250015259, + "learning_rate": 6.337152714234682e-06, + "loss": 5.2855, + "step": 7968 + }, + { + "epoch": 0.768466730954677, + "grad_norm": 1.9720524549484253, + "learning_rate": 6.332114224142335e-06, + "loss": 5.4323, + "step": 7969 + }, + { + "epoch": 0.7685631629701061, + "grad_norm": 2.301344394683838, + "learning_rate": 6.32707744736557e-06, + "loss": 5.176, + "step": 7970 + }, + { + "epoch": 0.7686595949855352, + "grad_norm": 1.7324962615966797, + "learning_rate": 6.322042384366655e-06, + "loss": 5.2818, + "step": 7971 + }, + { + "epoch": 0.7687560270009643, + "grad_norm": 3.249171018600464, + "learning_rate": 6.317009035607707e-06, + "loss": 5.0004, + "step": 7972 + }, + { + "epoch": 0.7688524590163934, + "grad_norm": 1.5440531969070435, + "learning_rate": 6.311977401550662e-06, + "loss": 5.2786, + "step": 7973 + }, + { + "epoch": 0.7689488910318225, + "grad_norm": 1.7900439500808716, + "learning_rate": 6.306947482657341e-06, + "loss": 5.4066, + "step": 7974 + }, + { + "epoch": 0.7690453230472517, + "grad_norm": 2.0425214767456055, + "learning_rate": 6.301919279389367e-06, + "loss": 5.3492, + "step": 7975 + }, + { + "epoch": 0.7691417550626808, + "grad_norm": 1.7499761581420898, + "learning_rate": 6.2968927922082286e-06, + "loss": 5.3123, + "step": 7976 + }, + { + "epoch": 0.76923818707811, + "grad_norm": 1.7727774381637573, + "learning_rate": 6.29186802157525e-06, + "loss": 5.1475, + "step": 7977 + }, + { + "epoch": 0.7693346190935391, + "grad_norm": 1.8737833499908447, + "learning_rate": 6.2868449679515965e-06, + "loss": 5.1298, + "step": 7978 + }, + { + "epoch": 0.7694310511089681, + "grad_norm": 1.547932744026184, + "learning_rate": 6.281823631798281e-06, + "loss": 5.3803, + "step": 7979 + }, + { + "epoch": 0.7695274831243973, + "grad_norm": 1.9394092559814453, + "learning_rate": 6.276804013576154e-06, + "loss": 5.2866, + "step": 7980 + }, + { + "epoch": 0.7696239151398264, + "grad_norm": 1.7687098979949951, + "learning_rate": 6.271786113745911e-06, + "loss": 5.236, + "step": 7981 + }, + { + "epoch": 0.7697203471552555, + "grad_norm": 2.242271900177002, + "learning_rate": 6.266769932768088e-06, + "loss": 5.4598, + "step": 7982 + }, + { + "epoch": 0.7698167791706847, + "grad_norm": 1.7652866840362549, + "learning_rate": 6.26175547110307e-06, + "loss": 5.348, + "step": 7983 + }, + { + "epoch": 0.7699132111861138, + "grad_norm": 2.26987361907959, + "learning_rate": 6.256742729211057e-06, + "loss": 5.6011, + "step": 7984 + }, + { + "epoch": 0.7700096432015429, + "grad_norm": 1.7259833812713623, + "learning_rate": 6.2517317075521415e-06, + "loss": 5.0689, + "step": 7985 + }, + { + "epoch": 0.770106075216972, + "grad_norm": 2.1387994289398193, + "learning_rate": 6.246722406586209e-06, + "loss": 5.1659, + "step": 7986 + }, + { + "epoch": 0.7702025072324011, + "grad_norm": 1.5523736476898193, + "learning_rate": 6.2417148267730125e-06, + "loss": 5.4399, + "step": 7987 + }, + { + "epoch": 0.7702989392478303, + "grad_norm": 2.1388323307037354, + "learning_rate": 6.236708968572139e-06, + "loss": 5.2985, + "step": 7988 + }, + { + "epoch": 0.7703953712632594, + "grad_norm": 2.333319902420044, + "learning_rate": 6.231704832443025e-06, + "loss": 5.5273, + "step": 7989 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 1.6801718473434448, + "learning_rate": 6.22670241884494e-06, + "loss": 5.4325, + "step": 7990 + }, + { + "epoch": 0.7705882352941177, + "grad_norm": 1.519476294517517, + "learning_rate": 6.221701728237009e-06, + "loss": 5.3168, + "step": 7991 + }, + { + "epoch": 0.7706846673095468, + "grad_norm": 1.933477759361267, + "learning_rate": 6.216702761078166e-06, + "loss": 5.403, + "step": 7992 + }, + { + "epoch": 0.7707810993249758, + "grad_norm": 1.7359880208969116, + "learning_rate": 6.21170551782724e-06, + "loss": 5.2526, + "step": 7993 + }, + { + "epoch": 0.770877531340405, + "grad_norm": 1.8370789289474487, + "learning_rate": 6.206709998942847e-06, + "loss": 5.2262, + "step": 7994 + }, + { + "epoch": 0.7709739633558341, + "grad_norm": 1.5261131525039673, + "learning_rate": 6.201716204883484e-06, + "loss": 5.1675, + "step": 7995 + }, + { + "epoch": 0.7710703953712632, + "grad_norm": 1.9907951354980469, + "learning_rate": 6.196724136107468e-06, + "loss": 5.0718, + "step": 7996 + }, + { + "epoch": 0.7711668273866924, + "grad_norm": 1.5178488492965698, + "learning_rate": 6.1917337930729766e-06, + "loss": 5.3341, + "step": 7997 + }, + { + "epoch": 0.7712632594021215, + "grad_norm": 1.8374247550964355, + "learning_rate": 6.186745176237993e-06, + "loss": 5.2118, + "step": 7998 + }, + { + "epoch": 0.7713596914175507, + "grad_norm": 2.4665615558624268, + "learning_rate": 6.181758286060396e-06, + "loss": 5.4025, + "step": 7999 + }, + { + "epoch": 0.7714561234329798, + "grad_norm": 1.8832859992980957, + "learning_rate": 6.176773122997848e-06, + "loss": 5.3949, + "step": 8000 + }, + { + "epoch": 0.7715525554484088, + "grad_norm": 2.192945957183838, + "learning_rate": 6.171789687507909e-06, + "loss": 5.4508, + "step": 8001 + }, + { + "epoch": 0.771648987463838, + "grad_norm": 2.7523910999298096, + "learning_rate": 6.166807980047931e-06, + "loss": 5.294, + "step": 8002 + }, + { + "epoch": 0.7717454194792671, + "grad_norm": 2.7633137702941895, + "learning_rate": 6.161828001075137e-06, + "loss": 5.4541, + "step": 8003 + }, + { + "epoch": 0.7718418514946962, + "grad_norm": 1.9192981719970703, + "learning_rate": 6.156849751046584e-06, + "loss": 5.3254, + "step": 8004 + }, + { + "epoch": 0.7719382835101254, + "grad_norm": 1.6491150856018066, + "learning_rate": 6.1518732304191695e-06, + "loss": 5.323, + "step": 8005 + }, + { + "epoch": 0.7720347155255545, + "grad_norm": 1.848685383796692, + "learning_rate": 6.146898439649629e-06, + "loss": 5.2618, + "step": 8006 + }, + { + "epoch": 0.7721311475409836, + "grad_norm": 1.8609693050384521, + "learning_rate": 6.141925379194546e-06, + "loss": 5.3842, + "step": 8007 + }, + { + "epoch": 0.7722275795564127, + "grad_norm": 2.290116310119629, + "learning_rate": 6.136954049510352e-06, + "loss": 5.3172, + "step": 8008 + }, + { + "epoch": 0.7723240115718418, + "grad_norm": 2.2897331714630127, + "learning_rate": 6.13198445105328e-06, + "loss": 5.3163, + "step": 8009 + }, + { + "epoch": 0.772420443587271, + "grad_norm": 1.7328846454620361, + "learning_rate": 6.127016584279468e-06, + "loss": 5.4472, + "step": 8010 + }, + { + "epoch": 0.7725168756027001, + "grad_norm": 2.3054752349853516, + "learning_rate": 6.122050449644831e-06, + "loss": 5.3306, + "step": 8011 + }, + { + "epoch": 0.7726133076181292, + "grad_norm": 2.953409194946289, + "learning_rate": 6.117086047605183e-06, + "loss": 5.6221, + "step": 8012 + }, + { + "epoch": 0.7727097396335584, + "grad_norm": 2.4530978202819824, + "learning_rate": 6.11212337861613e-06, + "loss": 5.2036, + "step": 8013 + }, + { + "epoch": 0.7728061716489875, + "grad_norm": 1.8745038509368896, + "learning_rate": 6.107162443133147e-06, + "loss": 5.1744, + "step": 8014 + }, + { + "epoch": 0.7729026036644165, + "grad_norm": 2.1568603515625, + "learning_rate": 6.1022032416115375e-06, + "loss": 5.0375, + "step": 8015 + }, + { + "epoch": 0.7729990356798457, + "grad_norm": 1.795969009399414, + "learning_rate": 6.097245774506466e-06, + "loss": 5.2438, + "step": 8016 + }, + { + "epoch": 0.7730954676952748, + "grad_norm": 2.607517957687378, + "learning_rate": 6.092290042272896e-06, + "loss": 5.4595, + "step": 8017 + }, + { + "epoch": 0.7731918997107039, + "grad_norm": 2.892169713973999, + "learning_rate": 6.087336045365688e-06, + "loss": 5.5989, + "step": 8018 + }, + { + "epoch": 0.7732883317261331, + "grad_norm": 1.9979581832885742, + "learning_rate": 6.082383784239495e-06, + "loss": 5.4216, + "step": 8019 + }, + { + "epoch": 0.7733847637415622, + "grad_norm": 1.583822250366211, + "learning_rate": 6.077433259348833e-06, + "loss": 5.2532, + "step": 8020 + }, + { + "epoch": 0.7734811957569914, + "grad_norm": 2.3236725330352783, + "learning_rate": 6.072484471148057e-06, + "loss": 5.7213, + "step": 8021 + }, + { + "epoch": 0.7735776277724205, + "grad_norm": 1.9931857585906982, + "learning_rate": 6.06753742009136e-06, + "loss": 5.3111, + "step": 8022 + }, + { + "epoch": 0.7736740597878495, + "grad_norm": 2.4972598552703857, + "learning_rate": 6.062592106632775e-06, + "loss": 5.3651, + "step": 8023 + }, + { + "epoch": 0.7737704918032787, + "grad_norm": 2.185671806335449, + "learning_rate": 6.057648531226182e-06, + "loss": 5.1936, + "step": 8024 + }, + { + "epoch": 0.7738669238187078, + "grad_norm": 1.6999658346176147, + "learning_rate": 6.052706694325288e-06, + "loss": 5.2834, + "step": 8025 + }, + { + "epoch": 0.7739633558341369, + "grad_norm": 1.5856751203536987, + "learning_rate": 6.047766596383658e-06, + "loss": 5.4714, + "step": 8026 + }, + { + "epoch": 0.7740597878495661, + "grad_norm": 2.2764008045196533, + "learning_rate": 6.042828237854689e-06, + "loss": 5.0373, + "step": 8027 + }, + { + "epoch": 0.7741562198649952, + "grad_norm": 1.9208329916000366, + "learning_rate": 6.037891619191596e-06, + "loss": 5.1418, + "step": 8028 + }, + { + "epoch": 0.7742526518804242, + "grad_norm": 1.6544181108474731, + "learning_rate": 6.032956740847487e-06, + "loss": 5.297, + "step": 8029 + }, + { + "epoch": 0.7743490838958534, + "grad_norm": 1.5042994022369385, + "learning_rate": 6.028023603275257e-06, + "loss": 5.3111, + "step": 8030 + }, + { + "epoch": 0.7744455159112825, + "grad_norm": 1.6475129127502441, + "learning_rate": 6.023092206927669e-06, + "loss": 5.5025, + "step": 8031 + }, + { + "epoch": 0.7745419479267117, + "grad_norm": 1.5226333141326904, + "learning_rate": 6.01816255225732e-06, + "loss": 5.2563, + "step": 8032 + }, + { + "epoch": 0.7746383799421408, + "grad_norm": 2.340359926223755, + "learning_rate": 6.013234639716653e-06, + "loss": 5.2085, + "step": 8033 + }, + { + "epoch": 0.7747348119575699, + "grad_norm": 1.7441036701202393, + "learning_rate": 6.008308469757939e-06, + "loss": 5.4501, + "step": 8034 + }, + { + "epoch": 0.7748312439729991, + "grad_norm": 2.7011196613311768, + "learning_rate": 6.003384042833307e-06, + "loss": 5.0736, + "step": 8035 + }, + { + "epoch": 0.7749276759884282, + "grad_norm": 2.2258620262145996, + "learning_rate": 5.9984613593946915e-06, + "loss": 5.0179, + "step": 8036 + }, + { + "epoch": 0.7750241080038572, + "grad_norm": 1.9032831192016602, + "learning_rate": 5.99354041989392e-06, + "loss": 5.0094, + "step": 8037 + }, + { + "epoch": 0.7751205400192864, + "grad_norm": 1.7536133527755737, + "learning_rate": 5.988621224782609e-06, + "loss": 5.1251, + "step": 8038 + }, + { + "epoch": 0.7752169720347155, + "grad_norm": 1.859471321105957, + "learning_rate": 5.983703774512242e-06, + "loss": 5.4215, + "step": 8039 + }, + { + "epoch": 0.7753134040501446, + "grad_norm": 1.8518965244293213, + "learning_rate": 5.978788069534136e-06, + "loss": 5.3753, + "step": 8040 + }, + { + "epoch": 0.7754098360655738, + "grad_norm": 2.021864652633667, + "learning_rate": 5.9738741102994515e-06, + "loss": 5.2446, + "step": 8041 + }, + { + "epoch": 0.7755062680810029, + "grad_norm": 2.5700294971466064, + "learning_rate": 5.9689618972591846e-06, + "loss": 5.3438, + "step": 8042 + }, + { + "epoch": 0.7756027000964321, + "grad_norm": 2.2384893894195557, + "learning_rate": 5.964051430864176e-06, + "loss": 5.362, + "step": 8043 + }, + { + "epoch": 0.7756991321118611, + "grad_norm": 2.190469264984131, + "learning_rate": 5.959142711565085e-06, + "loss": 5.3956, + "step": 8044 + }, + { + "epoch": 0.7757955641272902, + "grad_norm": 2.4889376163482666, + "learning_rate": 5.954235739812455e-06, + "loss": 5.2806, + "step": 8045 + }, + { + "epoch": 0.7758919961427194, + "grad_norm": 1.8038581609725952, + "learning_rate": 5.94933051605662e-06, + "loss": 5.4414, + "step": 8046 + }, + { + "epoch": 0.7759884281581485, + "grad_norm": 2.027495861053467, + "learning_rate": 5.944427040747783e-06, + "loss": 5.3988, + "step": 8047 + }, + { + "epoch": 0.7760848601735776, + "grad_norm": 1.7477847337722778, + "learning_rate": 5.939525314335975e-06, + "loss": 5.2614, + "step": 8048 + }, + { + "epoch": 0.7761812921890068, + "grad_norm": 1.6793546676635742, + "learning_rate": 5.93462533727108e-06, + "loss": 5.3015, + "step": 8049 + }, + { + "epoch": 0.7762777242044359, + "grad_norm": 1.7315444946289062, + "learning_rate": 5.9297271100028e-06, + "loss": 5.2119, + "step": 8050 + }, + { + "epoch": 0.7763741562198649, + "grad_norm": 1.9075218439102173, + "learning_rate": 5.924830632980699e-06, + "loss": 5.3141, + "step": 8051 + }, + { + "epoch": 0.7764705882352941, + "grad_norm": 1.6928671598434448, + "learning_rate": 5.91993590665417e-06, + "loss": 5.2231, + "step": 8052 + }, + { + "epoch": 0.7765670202507232, + "grad_norm": 1.7300994396209717, + "learning_rate": 5.915042931472425e-06, + "loss": 5.3191, + "step": 8053 + }, + { + "epoch": 0.7766634522661524, + "grad_norm": 1.7315107583999634, + "learning_rate": 5.910151707884565e-06, + "loss": 5.249, + "step": 8054 + }, + { + "epoch": 0.7767598842815815, + "grad_norm": 1.7644611597061157, + "learning_rate": 5.90526223633947e-06, + "loss": 5.5559, + "step": 8055 + }, + { + "epoch": 0.7768563162970106, + "grad_norm": 2.0113277435302734, + "learning_rate": 5.9003745172859194e-06, + "loss": 5.3616, + "step": 8056 + }, + { + "epoch": 0.7769527483124398, + "grad_norm": 2.8345978260040283, + "learning_rate": 5.8954885511724815e-06, + "loss": 5.4646, + "step": 8057 + }, + { + "epoch": 0.7770491803278688, + "grad_norm": 2.9965813159942627, + "learning_rate": 5.890604338447592e-06, + "loss": 5.5211, + "step": 8058 + }, + { + "epoch": 0.7771456123432979, + "grad_norm": 1.8161767721176147, + "learning_rate": 5.885721879559514e-06, + "loss": 5.4338, + "step": 8059 + }, + { + "epoch": 0.7772420443587271, + "grad_norm": 1.8574659824371338, + "learning_rate": 5.880841174956364e-06, + "loss": 5.3928, + "step": 8060 + }, + { + "epoch": 0.7773384763741562, + "grad_norm": 2.0417590141296387, + "learning_rate": 5.875962225086065e-06, + "loss": 5.4865, + "step": 8061 + }, + { + "epoch": 0.7774349083895853, + "grad_norm": 1.9530264139175415, + "learning_rate": 5.871085030396431e-06, + "loss": 5.1899, + "step": 8062 + }, + { + "epoch": 0.7775313404050145, + "grad_norm": 2.069955587387085, + "learning_rate": 5.866209591335059e-06, + "loss": 5.1047, + "step": 8063 + }, + { + "epoch": 0.7776277724204436, + "grad_norm": 1.695143461227417, + "learning_rate": 5.861335908349422e-06, + "loss": 5.0882, + "step": 8064 + }, + { + "epoch": 0.7777242044358728, + "grad_norm": 1.442732810974121, + "learning_rate": 5.85646398188682e-06, + "loss": 5.4836, + "step": 8065 + }, + { + "epoch": 0.7778206364513018, + "grad_norm": 1.8753366470336914, + "learning_rate": 5.85159381239439e-06, + "loss": 5.4934, + "step": 8066 + }, + { + "epoch": 0.7779170684667309, + "grad_norm": 2.432513952255249, + "learning_rate": 5.846725400319114e-06, + "loss": 5.6784, + "step": 8067 + }, + { + "epoch": 0.7780135004821601, + "grad_norm": 3.385059118270874, + "learning_rate": 5.841858746107809e-06, + "loss": 5.3983, + "step": 8068 + }, + { + "epoch": 0.7781099324975892, + "grad_norm": 3.038038969039917, + "learning_rate": 5.8369938502071255e-06, + "loss": 5.3931, + "step": 8069 + }, + { + "epoch": 0.7782063645130183, + "grad_norm": 1.7226436138153076, + "learning_rate": 5.832130713063561e-06, + "loss": 5.4192, + "step": 8070 + }, + { + "epoch": 0.7783027965284475, + "grad_norm": 2.4032301902770996, + "learning_rate": 5.827269335123453e-06, + "loss": 5.4707, + "step": 8071 + }, + { + "epoch": 0.7783992285438766, + "grad_norm": 2.6641812324523926, + "learning_rate": 5.822409716832957e-06, + "loss": 5.4441, + "step": 8072 + }, + { + "epoch": 0.7784956605593056, + "grad_norm": 2.356501817703247, + "learning_rate": 5.817551858638107e-06, + "loss": 5.3075, + "step": 8073 + }, + { + "epoch": 0.7785920925747348, + "grad_norm": 2.50941801071167, + "learning_rate": 5.812695760984729e-06, + "loss": 5.5219, + "step": 8074 + }, + { + "epoch": 0.7786885245901639, + "grad_norm": 2.769298553466797, + "learning_rate": 5.807841424318519e-06, + "loss": 5.2047, + "step": 8075 + }, + { + "epoch": 0.7787849566055931, + "grad_norm": 2.237783670425415, + "learning_rate": 5.8029888490850005e-06, + "loss": 5.4285, + "step": 8076 + }, + { + "epoch": 0.7788813886210222, + "grad_norm": 2.08324933052063, + "learning_rate": 5.798138035729539e-06, + "loss": 5.3727, + "step": 8077 + }, + { + "epoch": 0.7789778206364513, + "grad_norm": 2.2267603874206543, + "learning_rate": 5.793288984697334e-06, + "loss": 5.4971, + "step": 8078 + }, + { + "epoch": 0.7790742526518805, + "grad_norm": 2.7557194232940674, + "learning_rate": 5.7884416964334335e-06, + "loss": 5.3582, + "step": 8079 + }, + { + "epoch": 0.7791706846673095, + "grad_norm": 1.8165007829666138, + "learning_rate": 5.7835961713826935e-06, + "loss": 5.3659, + "step": 8080 + }, + { + "epoch": 0.7792671166827386, + "grad_norm": 1.725317358970642, + "learning_rate": 5.778752409989862e-06, + "loss": 5.3623, + "step": 8081 + }, + { + "epoch": 0.7793635486981678, + "grad_norm": 1.8485981225967407, + "learning_rate": 5.773910412699468e-06, + "loss": 5.4536, + "step": 8082 + }, + { + "epoch": 0.7794599807135969, + "grad_norm": 1.6482313871383667, + "learning_rate": 5.769070179955913e-06, + "loss": 5.4792, + "step": 8083 + }, + { + "epoch": 0.779556412729026, + "grad_norm": 2.0572521686553955, + "learning_rate": 5.76423171220343e-06, + "loss": 5.4612, + "step": 8084 + }, + { + "epoch": 0.7796528447444552, + "grad_norm": 2.383953094482422, + "learning_rate": 5.7593950098860815e-06, + "loss": 5.5223, + "step": 8085 + }, + { + "epoch": 0.7797492767598843, + "grad_norm": 2.199842929840088, + "learning_rate": 5.754560073447779e-06, + "loss": 5.3966, + "step": 8086 + }, + { + "epoch": 0.7798457087753135, + "grad_norm": 2.405684471130371, + "learning_rate": 5.749726903332267e-06, + "loss": 5.2459, + "step": 8087 + }, + { + "epoch": 0.7799421407907425, + "grad_norm": 1.9678117036819458, + "learning_rate": 5.744895499983125e-06, + "loss": 5.2506, + "step": 8088 + }, + { + "epoch": 0.7800385728061716, + "grad_norm": 1.800691843032837, + "learning_rate": 5.740065863843774e-06, + "loss": 5.4772, + "step": 8089 + }, + { + "epoch": 0.7801350048216008, + "grad_norm": 1.7082865238189697, + "learning_rate": 5.735237995357481e-06, + "loss": 5.3358, + "step": 8090 + }, + { + "epoch": 0.7802314368370299, + "grad_norm": 1.5744884014129639, + "learning_rate": 5.730411894967319e-06, + "loss": 5.3384, + "step": 8091 + }, + { + "epoch": 0.780327868852459, + "grad_norm": 1.7561637163162231, + "learning_rate": 5.7255875631162525e-06, + "loss": 5.4207, + "step": 8092 + }, + { + "epoch": 0.7804243008678882, + "grad_norm": 2.3893678188323975, + "learning_rate": 5.7207650002470274e-06, + "loss": 5.5859, + "step": 8093 + }, + { + "epoch": 0.7805207328833172, + "grad_norm": 1.7144454717636108, + "learning_rate": 5.7159442068022624e-06, + "loss": 5.4072, + "step": 8094 + }, + { + "epoch": 0.7806171648987463, + "grad_norm": 1.627066969871521, + "learning_rate": 5.7111251832244026e-06, + "loss": 5.2994, + "step": 8095 + }, + { + "epoch": 0.7807135969141755, + "grad_norm": 1.8639702796936035, + "learning_rate": 5.706307929955742e-06, + "loss": 5.457, + "step": 8096 + }, + { + "epoch": 0.7808100289296046, + "grad_norm": 1.8060137033462524, + "learning_rate": 5.701492447438378e-06, + "loss": 5.2916, + "step": 8097 + }, + { + "epoch": 0.7809064609450338, + "grad_norm": 3.336390972137451, + "learning_rate": 5.696678736114297e-06, + "loss": 4.9283, + "step": 8098 + }, + { + "epoch": 0.7810028929604629, + "grad_norm": 3.2935755252838135, + "learning_rate": 5.691866796425269e-06, + "loss": 4.9554, + "step": 8099 + }, + { + "epoch": 0.781099324975892, + "grad_norm": 2.5179038047790527, + "learning_rate": 5.687056628812956e-06, + "loss": 5.3949, + "step": 8100 + }, + { + "epoch": 0.7811957569913212, + "grad_norm": 2.085972309112549, + "learning_rate": 5.6822482337188095e-06, + "loss": 5.4939, + "step": 8101 + }, + { + "epoch": 0.7812921890067502, + "grad_norm": 1.4917659759521484, + "learning_rate": 5.677441611584145e-06, + "loss": 5.4239, + "step": 8102 + }, + { + "epoch": 0.7813886210221793, + "grad_norm": 1.8087034225463867, + "learning_rate": 5.672636762850106e-06, + "loss": 5.5367, + "step": 8103 + }, + { + "epoch": 0.7814850530376085, + "grad_norm": 1.768520712852478, + "learning_rate": 5.667833687957683e-06, + "loss": 5.3433, + "step": 8104 + }, + { + "epoch": 0.7815814850530376, + "grad_norm": 1.7662379741668701, + "learning_rate": 5.663032387347678e-06, + "loss": 5.445, + "step": 8105 + }, + { + "epoch": 0.7816779170684667, + "grad_norm": 1.9702867269515991, + "learning_rate": 5.6582328614607715e-06, + "loss": 5.3933, + "step": 8106 + }, + { + "epoch": 0.7817743490838959, + "grad_norm": 2.1487393379211426, + "learning_rate": 5.653435110737443e-06, + "loss": 5.3262, + "step": 8107 + }, + { + "epoch": 0.781870781099325, + "grad_norm": 1.813610315322876, + "learning_rate": 5.6486391356180255e-06, + "loss": 5.4927, + "step": 8108 + }, + { + "epoch": 0.7819672131147541, + "grad_norm": 2.938547372817993, + "learning_rate": 5.643844936542692e-06, + "loss": 5.4561, + "step": 8109 + }, + { + "epoch": 0.7820636451301832, + "grad_norm": 2.091245412826538, + "learning_rate": 5.639052513951448e-06, + "loss": 5.3373, + "step": 8110 + }, + { + "epoch": 0.7821600771456123, + "grad_norm": 2.5004541873931885, + "learning_rate": 5.634261868284133e-06, + "loss": 4.9963, + "step": 8111 + }, + { + "epoch": 0.7822565091610415, + "grad_norm": 3.5023961067199707, + "learning_rate": 5.629472999980429e-06, + "loss": 5.2152, + "step": 8112 + }, + { + "epoch": 0.7823529411764706, + "grad_norm": 2.8302478790283203, + "learning_rate": 5.624685909479852e-06, + "loss": 5.1917, + "step": 8113 + }, + { + "epoch": 0.7824493731918997, + "grad_norm": 2.111382484436035, + "learning_rate": 5.619900597221753e-06, + "loss": 5.2542, + "step": 8114 + }, + { + "epoch": 0.7825458052073289, + "grad_norm": 3.0018715858459473, + "learning_rate": 5.615117063645334e-06, + "loss": 5.6648, + "step": 8115 + }, + { + "epoch": 0.7826422372227579, + "grad_norm": 2.442234516143799, + "learning_rate": 5.610335309189596e-06, + "loss": 5.3552, + "step": 8116 + }, + { + "epoch": 0.782738669238187, + "grad_norm": 2.6410903930664062, + "learning_rate": 5.605555334293433e-06, + "loss": 5.5248, + "step": 8117 + }, + { + "epoch": 0.7828351012536162, + "grad_norm": 2.1538007259368896, + "learning_rate": 5.6007771393955246e-06, + "loss": 5.3508, + "step": 8118 + }, + { + "epoch": 0.7829315332690453, + "grad_norm": 1.8997554779052734, + "learning_rate": 5.596000724934414e-06, + "loss": 5.6695, + "step": 8119 + }, + { + "epoch": 0.7830279652844745, + "grad_norm": 1.8692208528518677, + "learning_rate": 5.591226091348475e-06, + "loss": 5.3266, + "step": 8120 + }, + { + "epoch": 0.7831243972999036, + "grad_norm": 2.2675764560699463, + "learning_rate": 5.586453239075915e-06, + "loss": 5.1118, + "step": 8121 + }, + { + "epoch": 0.7832208293153327, + "grad_norm": 2.3992719650268555, + "learning_rate": 5.581682168554783e-06, + "loss": 5.2814, + "step": 8122 + }, + { + "epoch": 0.7833172613307618, + "grad_norm": 2.5442659854888916, + "learning_rate": 5.5769128802229705e-06, + "loss": 5.44, + "step": 8123 + }, + { + "epoch": 0.7834136933461909, + "grad_norm": 2.1472880840301514, + "learning_rate": 5.57214537451817e-06, + "loss": 5.4494, + "step": 8124 + }, + { + "epoch": 0.78351012536162, + "grad_norm": 2.161715030670166, + "learning_rate": 5.567379651877971e-06, + "loss": 5.5149, + "step": 8125 + }, + { + "epoch": 0.7836065573770492, + "grad_norm": 1.8691421747207642, + "learning_rate": 5.562615712739744e-06, + "loss": 5.2813, + "step": 8126 + }, + { + "epoch": 0.7837029893924783, + "grad_norm": 1.9217948913574219, + "learning_rate": 5.5578535575407236e-06, + "loss": 5.2124, + "step": 8127 + }, + { + "epoch": 0.7837994214079074, + "grad_norm": 1.651638388633728, + "learning_rate": 5.553093186717972e-06, + "loss": 5.441, + "step": 8128 + }, + { + "epoch": 0.7838958534233366, + "grad_norm": 2.0279853343963623, + "learning_rate": 5.5483346007083966e-06, + "loss": 5.4289, + "step": 8129 + }, + { + "epoch": 0.7839922854387656, + "grad_norm": 1.8390076160430908, + "learning_rate": 5.543577799948726e-06, + "loss": 5.5082, + "step": 8130 + }, + { + "epoch": 0.7840887174541948, + "grad_norm": 2.4792420864105225, + "learning_rate": 5.538822784875541e-06, + "loss": 5.612, + "step": 8131 + }, + { + "epoch": 0.7841851494696239, + "grad_norm": 1.8829344511032104, + "learning_rate": 5.534069555925248e-06, + "loss": 5.5094, + "step": 8132 + }, + { + "epoch": 0.784281581485053, + "grad_norm": 1.9293467998504639, + "learning_rate": 5.52931811353409e-06, + "loss": 5.3956, + "step": 8133 + }, + { + "epoch": 0.7843780135004822, + "grad_norm": 2.4455485343933105, + "learning_rate": 5.5245684581381605e-06, + "loss": 5.4985, + "step": 8134 + }, + { + "epoch": 0.7844744455159113, + "grad_norm": 1.657894253730774, + "learning_rate": 5.51982059017335e-06, + "loss": 5.5122, + "step": 8135 + }, + { + "epoch": 0.7845708775313404, + "grad_norm": 1.8495961427688599, + "learning_rate": 5.515074510075446e-06, + "loss": 5.405, + "step": 8136 + }, + { + "epoch": 0.7846673095467696, + "grad_norm": 2.266766309738159, + "learning_rate": 5.510330218280016e-06, + "loss": 5.1504, + "step": 8137 + }, + { + "epoch": 0.7847637415621986, + "grad_norm": 2.64729642868042, + "learning_rate": 5.5055877152224875e-06, + "loss": 5.3222, + "step": 8138 + }, + { + "epoch": 0.7848601735776277, + "grad_norm": 2.3525021076202393, + "learning_rate": 5.5008470013381276e-06, + "loss": 5.433, + "step": 8139 + }, + { + "epoch": 0.7849566055930569, + "grad_norm": 2.7412426471710205, + "learning_rate": 5.496108077062034e-06, + "loss": 5.5213, + "step": 8140 + }, + { + "epoch": 0.785053037608486, + "grad_norm": 2.778233051300049, + "learning_rate": 5.4913709428291244e-06, + "loss": 5.4628, + "step": 8141 + }, + { + "epoch": 0.7851494696239152, + "grad_norm": 1.6963509321212769, + "learning_rate": 5.486635599074191e-06, + "loss": 5.4041, + "step": 8142 + }, + { + "epoch": 0.7852459016393443, + "grad_norm": 1.674825668334961, + "learning_rate": 5.481902046231813e-06, + "loss": 5.5124, + "step": 8143 + }, + { + "epoch": 0.7853423336547734, + "grad_norm": 2.4519522190093994, + "learning_rate": 5.4771702847364545e-06, + "loss": 5.4338, + "step": 8144 + }, + { + "epoch": 0.7854387656702025, + "grad_norm": 2.0889954566955566, + "learning_rate": 5.472440315022373e-06, + "loss": 5.4681, + "step": 8145 + }, + { + "epoch": 0.7855351976856316, + "grad_norm": 2.0264132022857666, + "learning_rate": 5.467712137523684e-06, + "loss": 5.1981, + "step": 8146 + }, + { + "epoch": 0.7856316297010607, + "grad_norm": 2.1723697185516357, + "learning_rate": 5.462985752674335e-06, + "loss": 5.0985, + "step": 8147 + }, + { + "epoch": 0.7857280617164899, + "grad_norm": 1.6890631914138794, + "learning_rate": 5.4582611609081155e-06, + "loss": 5.137, + "step": 8148 + }, + { + "epoch": 0.785824493731919, + "grad_norm": 1.8020288944244385, + "learning_rate": 5.453538362658619e-06, + "loss": 5.3103, + "step": 8149 + }, + { + "epoch": 0.7859209257473481, + "grad_norm": 2.026069402694702, + "learning_rate": 5.448817358359329e-06, + "loss": 5.1502, + "step": 8150 + }, + { + "epoch": 0.7860173577627773, + "grad_norm": 1.9060909748077393, + "learning_rate": 5.4440981484435104e-06, + "loss": 5.4511, + "step": 8151 + }, + { + "epoch": 0.7861137897782063, + "grad_norm": 1.9911190271377563, + "learning_rate": 5.439380733344293e-06, + "loss": 5.4347, + "step": 8152 + }, + { + "epoch": 0.7862102217936355, + "grad_norm": 2.0693347454071045, + "learning_rate": 5.434665113494638e-06, + "loss": 5.3892, + "step": 8153 + }, + { + "epoch": 0.7863066538090646, + "grad_norm": 2.355477809906006, + "learning_rate": 5.42995128932734e-06, + "loss": 5.3657, + "step": 8154 + }, + { + "epoch": 0.7864030858244937, + "grad_norm": 1.9392789602279663, + "learning_rate": 5.425239261275025e-06, + "loss": 5.3442, + "step": 8155 + }, + { + "epoch": 0.7864995178399229, + "grad_norm": 2.248522996902466, + "learning_rate": 5.420529029770155e-06, + "loss": 5.3074, + "step": 8156 + }, + { + "epoch": 0.786595949855352, + "grad_norm": 2.590822219848633, + "learning_rate": 5.415820595245036e-06, + "loss": 5.3642, + "step": 8157 + }, + { + "epoch": 0.786692381870781, + "grad_norm": 1.5270553827285767, + "learning_rate": 5.411113958131797e-06, + "loss": 5.2939, + "step": 8158 + }, + { + "epoch": 0.7867888138862102, + "grad_norm": 2.18890643119812, + "learning_rate": 5.406409118862416e-06, + "loss": 5.313, + "step": 8159 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 2.342061996459961, + "learning_rate": 5.4017060778686766e-06, + "loss": 5.4306, + "step": 8160 + }, + { + "epoch": 0.7869816779170684, + "grad_norm": 2.0352015495300293, + "learning_rate": 5.397004835582242e-06, + "loss": 5.3383, + "step": 8161 + }, + { + "epoch": 0.7870781099324976, + "grad_norm": 1.8704562187194824, + "learning_rate": 5.392305392434574e-06, + "loss": 5.3571, + "step": 8162 + }, + { + "epoch": 0.7871745419479267, + "grad_norm": 2.184507369995117, + "learning_rate": 5.387607748856982e-06, + "loss": 5.5796, + "step": 8163 + }, + { + "epoch": 0.7872709739633559, + "grad_norm": 2.438213348388672, + "learning_rate": 5.382911905280614e-06, + "loss": 5.4102, + "step": 8164 + }, + { + "epoch": 0.787367405978785, + "grad_norm": 3.997951030731201, + "learning_rate": 5.378217862136442e-06, + "loss": 5.176, + "step": 8165 + }, + { + "epoch": 0.787463837994214, + "grad_norm": 3.7912888526916504, + "learning_rate": 5.373525619855288e-06, + "loss": 5.1808, + "step": 8166 + }, + { + "epoch": 0.7875602700096432, + "grad_norm": 2.8028998374938965, + "learning_rate": 5.3688351788678035e-06, + "loss": 5.2174, + "step": 8167 + }, + { + "epoch": 0.7876567020250723, + "grad_norm": 2.6162190437316895, + "learning_rate": 5.364146539604447e-06, + "loss": 5.1995, + "step": 8168 + }, + { + "epoch": 0.7877531340405014, + "grad_norm": 2.8254826068878174, + "learning_rate": 5.35945970249557e-06, + "loss": 5.3884, + "step": 8169 + }, + { + "epoch": 0.7878495660559306, + "grad_norm": 3.454096555709839, + "learning_rate": 5.354774667971302e-06, + "loss": 5.3294, + "step": 8170 + }, + { + "epoch": 0.7879459980713597, + "grad_norm": 3.5147626399993896, + "learning_rate": 5.350091436461633e-06, + "loss": 5.2532, + "step": 8171 + }, + { + "epoch": 0.7880424300867888, + "grad_norm": 3.6803433895111084, + "learning_rate": 5.345410008396387e-06, + "loss": 5.3424, + "step": 8172 + }, + { + "epoch": 0.788138862102218, + "grad_norm": 2.922015905380249, + "learning_rate": 5.340730384205222e-06, + "loss": 5.4674, + "step": 8173 + }, + { + "epoch": 0.788235294117647, + "grad_norm": 2.7606518268585205, + "learning_rate": 5.336052564317623e-06, + "loss": 5.1068, + "step": 8174 + }, + { + "epoch": 0.7883317261330762, + "grad_norm": 1.5232652425765991, + "learning_rate": 5.33137654916292e-06, + "loss": 5.1866, + "step": 8175 + }, + { + "epoch": 0.7884281581485053, + "grad_norm": 1.896502137184143, + "learning_rate": 5.326702339170267e-06, + "loss": 5.2535, + "step": 8176 + }, + { + "epoch": 0.7885245901639344, + "grad_norm": 2.8603665828704834, + "learning_rate": 5.32202993476866e-06, + "loss": 5.3791, + "step": 8177 + }, + { + "epoch": 0.7886210221793636, + "grad_norm": 3.061082601547241, + "learning_rate": 5.317359336386931e-06, + "loss": 5.3841, + "step": 8178 + }, + { + "epoch": 0.7887174541947927, + "grad_norm": 2.8110034465789795, + "learning_rate": 5.312690544453727e-06, + "loss": 5.3091, + "step": 8179 + }, + { + "epoch": 0.7888138862102217, + "grad_norm": 2.998701810836792, + "learning_rate": 5.308023559397565e-06, + "loss": 5.4574, + "step": 8180 + }, + { + "epoch": 0.7889103182256509, + "grad_norm": 2.6837496757507324, + "learning_rate": 5.303358381646756e-06, + "loss": 5.0608, + "step": 8181 + }, + { + "epoch": 0.78900675024108, + "grad_norm": 2.3846654891967773, + "learning_rate": 5.298695011629473e-06, + "loss": 5.2824, + "step": 8182 + }, + { + "epoch": 0.7891031822565091, + "grad_norm": 1.5567331314086914, + "learning_rate": 5.294033449773714e-06, + "loss": 5.3789, + "step": 8183 + }, + { + "epoch": 0.7891996142719383, + "grad_norm": 2.3902971744537354, + "learning_rate": 5.2893736965073185e-06, + "loss": 5.5035, + "step": 8184 + }, + { + "epoch": 0.7892960462873674, + "grad_norm": 2.1671934127807617, + "learning_rate": 5.284715752257929e-06, + "loss": 5.4007, + "step": 8185 + }, + { + "epoch": 0.7893924783027966, + "grad_norm": 2.159506320953369, + "learning_rate": 5.280059617453079e-06, + "loss": 5.3254, + "step": 8186 + }, + { + "epoch": 0.7894889103182257, + "grad_norm": 2.3040177822113037, + "learning_rate": 5.275405292520069e-06, + "loss": 5.3279, + "step": 8187 + }, + { + "epoch": 0.7895853423336547, + "grad_norm": 1.4987772703170776, + "learning_rate": 5.270752777886101e-06, + "loss": 5.3531, + "step": 8188 + }, + { + "epoch": 0.7896817743490839, + "grad_norm": 2.061269760131836, + "learning_rate": 5.266102073978152e-06, + "loss": 5.5404, + "step": 8189 + }, + { + "epoch": 0.789778206364513, + "grad_norm": 1.731158971786499, + "learning_rate": 5.261453181223067e-06, + "loss": 5.4086, + "step": 8190 + }, + { + "epoch": 0.7898746383799421, + "grad_norm": 1.750083088874817, + "learning_rate": 5.256806100047515e-06, + "loss": 5.4498, + "step": 8191 + }, + { + "epoch": 0.7899710703953713, + "grad_norm": 1.7564178705215454, + "learning_rate": 5.252160830877997e-06, + "loss": 5.4461, + "step": 8192 + }, + { + "epoch": 0.7900675024108004, + "grad_norm": 1.7636908292770386, + "learning_rate": 5.247517374140854e-06, + "loss": 5.5186, + "step": 8193 + }, + { + "epoch": 0.7901639344262295, + "grad_norm": 1.7182683944702148, + "learning_rate": 5.242875730262253e-06, + "loss": 5.319, + "step": 8194 + }, + { + "epoch": 0.7902603664416586, + "grad_norm": 1.3446539640426636, + "learning_rate": 5.2382358996682116e-06, + "loss": 5.3184, + "step": 8195 + }, + { + "epoch": 0.7903567984570877, + "grad_norm": 2.035614013671875, + "learning_rate": 5.233597882784538e-06, + "loss": 5.4615, + "step": 8196 + }, + { + "epoch": 0.7904532304725169, + "grad_norm": 1.8769824504852295, + "learning_rate": 5.228961680036939e-06, + "loss": 5.4496, + "step": 8197 + }, + { + "epoch": 0.790549662487946, + "grad_norm": 1.718842625617981, + "learning_rate": 5.224327291850895e-06, + "loss": 5.5444, + "step": 8198 + }, + { + "epoch": 0.7906460945033751, + "grad_norm": 1.3199455738067627, + "learning_rate": 5.219694718651752e-06, + "loss": 5.4507, + "step": 8199 + }, + { + "epoch": 0.7907425265188043, + "grad_norm": 1.5529230833053589, + "learning_rate": 5.2150639608646825e-06, + "loss": 5.4183, + "step": 8200 + }, + { + "epoch": 0.7908389585342334, + "grad_norm": 1.408331274986267, + "learning_rate": 5.210435018914691e-06, + "loss": 5.4602, + "step": 8201 + }, + { + "epoch": 0.7909353905496624, + "grad_norm": 1.4602899551391602, + "learning_rate": 5.205807893226619e-06, + "loss": 5.6059, + "step": 8202 + }, + { + "epoch": 0.7910318225650916, + "grad_norm": 1.7132411003112793, + "learning_rate": 5.2011825842251394e-06, + "loss": 5.3091, + "step": 8203 + }, + { + "epoch": 0.7911282545805207, + "grad_norm": 1.2011140584945679, + "learning_rate": 5.19655909233474e-06, + "loss": 5.519, + "step": 8204 + }, + { + "epoch": 0.7912246865959498, + "grad_norm": 1.2500596046447754, + "learning_rate": 5.191937417979789e-06, + "loss": 5.4996, + "step": 8205 + }, + { + "epoch": 0.791321118611379, + "grad_norm": 1.387010931968689, + "learning_rate": 5.187317561584434e-06, + "loss": 5.5341, + "step": 8206 + }, + { + "epoch": 0.7914175506268081, + "grad_norm": 1.4294500350952148, + "learning_rate": 5.18269952357269e-06, + "loss": 5.5083, + "step": 8207 + }, + { + "epoch": 0.7915139826422373, + "grad_norm": 1.4267059564590454, + "learning_rate": 5.178083304368389e-06, + "loss": 5.3776, + "step": 8208 + }, + { + "epoch": 0.7916104146576664, + "grad_norm": 1.5815984010696411, + "learning_rate": 5.1734689043952065e-06, + "loss": 5.5692, + "step": 8209 + }, + { + "epoch": 0.7917068466730954, + "grad_norm": 1.4950369596481323, + "learning_rate": 5.168856324076648e-06, + "loss": 5.5786, + "step": 8210 + }, + { + "epoch": 0.7918032786885246, + "grad_norm": 1.5146677494049072, + "learning_rate": 5.164245563836051e-06, + "loss": 5.646, + "step": 8211 + }, + { + "epoch": 0.7918997107039537, + "grad_norm": 1.7093918323516846, + "learning_rate": 5.159636624096572e-06, + "loss": 5.6074, + "step": 8212 + }, + { + "epoch": 0.7919961427193828, + "grad_norm": 2.1383323669433594, + "learning_rate": 5.1550295052812365e-06, + "loss": 5.5824, + "step": 8213 + }, + { + "epoch": 0.792092574734812, + "grad_norm": 1.8158122301101685, + "learning_rate": 5.1504242078128595e-06, + "loss": 5.745, + "step": 8214 + }, + { + "epoch": 0.7921890067502411, + "grad_norm": 1.8103454113006592, + "learning_rate": 5.145820732114121e-06, + "loss": 5.7042, + "step": 8215 + }, + { + "epoch": 0.7922854387656703, + "grad_norm": 1.9272814989089966, + "learning_rate": 5.141219078607515e-06, + "loss": 5.654, + "step": 8216 + }, + { + "epoch": 0.7923818707810993, + "grad_norm": 1.5526013374328613, + "learning_rate": 5.136619247715382e-06, + "loss": 5.6602, + "step": 8217 + }, + { + "epoch": 0.7924783027965284, + "grad_norm": 1.611610770225525, + "learning_rate": 5.132021239859888e-06, + "loss": 5.6188, + "step": 8218 + }, + { + "epoch": 0.7925747348119576, + "grad_norm": 2.2096168994903564, + "learning_rate": 5.127425055463031e-06, + "loss": 5.447, + "step": 8219 + }, + { + "epoch": 0.7926711668273867, + "grad_norm": 2.4979476928710938, + "learning_rate": 5.12283069494664e-06, + "loss": 5.2534, + "step": 8220 + }, + { + "epoch": 0.7927675988428158, + "grad_norm": 2.4422311782836914, + "learning_rate": 5.1182381587323855e-06, + "loss": 5.0625, + "step": 8221 + }, + { + "epoch": 0.792864030858245, + "grad_norm": 2.1045444011688232, + "learning_rate": 5.113647447241768e-06, + "loss": 5.3967, + "step": 8222 + }, + { + "epoch": 0.792960462873674, + "grad_norm": 1.9626014232635498, + "learning_rate": 5.1090585608960985e-06, + "loss": 5.4205, + "step": 8223 + }, + { + "epoch": 0.7930568948891031, + "grad_norm": 1.668393850326538, + "learning_rate": 5.104471500116567e-06, + "loss": 5.5832, + "step": 8224 + }, + { + "epoch": 0.7931533269045323, + "grad_norm": 1.7639532089233398, + "learning_rate": 5.0998862653241465e-06, + "loss": 5.3832, + "step": 8225 + }, + { + "epoch": 0.7932497589199614, + "grad_norm": 2.132340908050537, + "learning_rate": 5.095302856939671e-06, + "loss": 5.0695, + "step": 8226 + }, + { + "epoch": 0.7933461909353906, + "grad_norm": 1.8409860134124756, + "learning_rate": 5.0907212753838e-06, + "loss": 5.1406, + "step": 8227 + }, + { + "epoch": 0.7934426229508197, + "grad_norm": 2.8577463626861572, + "learning_rate": 5.086141521077029e-06, + "loss": 5.2353, + "step": 8228 + }, + { + "epoch": 0.7935390549662488, + "grad_norm": 2.699998617172241, + "learning_rate": 5.081563594439676e-06, + "loss": 5.1319, + "step": 8229 + }, + { + "epoch": 0.793635486981678, + "grad_norm": 2.1517996788024902, + "learning_rate": 5.076987495891908e-06, + "loss": 5.4914, + "step": 8230 + }, + { + "epoch": 0.793731918997107, + "grad_norm": 2.263718366622925, + "learning_rate": 5.072413225853695e-06, + "loss": 5.2809, + "step": 8231 + }, + { + "epoch": 0.7938283510125361, + "grad_norm": 2.4104130268096924, + "learning_rate": 5.067840784744881e-06, + "loss": 5.4774, + "step": 8232 + }, + { + "epoch": 0.7939247830279653, + "grad_norm": 2.2108590602874756, + "learning_rate": 5.063270172985102e-06, + "loss": 5.3516, + "step": 8233 + }, + { + "epoch": 0.7940212150433944, + "grad_norm": 1.928504467010498, + "learning_rate": 5.058701390993845e-06, + "loss": 5.2676, + "step": 8234 + }, + { + "epoch": 0.7941176470588235, + "grad_norm": 1.8047722578048706, + "learning_rate": 5.054134439190433e-06, + "loss": 5.4267, + "step": 8235 + }, + { + "epoch": 0.7942140790742527, + "grad_norm": 2.082773208618164, + "learning_rate": 5.049569317994013e-06, + "loss": 5.4857, + "step": 8236 + }, + { + "epoch": 0.7943105110896818, + "grad_norm": 2.1746459007263184, + "learning_rate": 5.045006027823565e-06, + "loss": 5.6262, + "step": 8237 + }, + { + "epoch": 0.794406943105111, + "grad_norm": 1.564111590385437, + "learning_rate": 5.0404445690979046e-06, + "loss": 5.4001, + "step": 8238 + }, + { + "epoch": 0.79450337512054, + "grad_norm": 1.565464735031128, + "learning_rate": 5.035884942235674e-06, + "loss": 5.5071, + "step": 8239 + }, + { + "epoch": 0.7945998071359691, + "grad_norm": 1.5208145380020142, + "learning_rate": 5.031327147655354e-06, + "loss": 5.4437, + "step": 8240 + }, + { + "epoch": 0.7946962391513983, + "grad_norm": 1.4641425609588623, + "learning_rate": 5.026771185775256e-06, + "loss": 5.2696, + "step": 8241 + }, + { + "epoch": 0.7947926711668274, + "grad_norm": 2.0455970764160156, + "learning_rate": 5.022217057013501e-06, + "loss": 5.1856, + "step": 8242 + }, + { + "epoch": 0.7948891031822565, + "grad_norm": 1.5952181816101074, + "learning_rate": 5.01766476178809e-06, + "loss": 5.3076, + "step": 8243 + }, + { + "epoch": 0.7949855351976857, + "grad_norm": 2.086437702178955, + "learning_rate": 5.013114300516805e-06, + "loss": 5.1937, + "step": 8244 + }, + { + "epoch": 0.7950819672131147, + "grad_norm": 2.001352310180664, + "learning_rate": 5.008565673617291e-06, + "loss": 5.2199, + "step": 8245 + }, + { + "epoch": 0.7951783992285438, + "grad_norm": 1.7103413343429565, + "learning_rate": 5.004018881507016e-06, + "loss": 5.1286, + "step": 8246 + }, + { + "epoch": 0.795274831243973, + "grad_norm": 2.0969293117523193, + "learning_rate": 4.9994739246032814e-06, + "loss": 5.4355, + "step": 8247 + }, + { + "epoch": 0.7953712632594021, + "grad_norm": 1.845260500907898, + "learning_rate": 4.9949308033232e-06, + "loss": 5.2863, + "step": 8248 + }, + { + "epoch": 0.7954676952748313, + "grad_norm": 2.161904811859131, + "learning_rate": 4.99038951808376e-06, + "loss": 5.3205, + "step": 8249 + }, + { + "epoch": 0.7955641272902604, + "grad_norm": 1.213966727256775, + "learning_rate": 4.985850069301731e-06, + "loss": 5.2747, + "step": 8250 + }, + { + "epoch": 0.7956605593056895, + "grad_norm": 1.9511752128601074, + "learning_rate": 4.981312457393761e-06, + "loss": 5.4685, + "step": 8251 + }, + { + "epoch": 0.7957569913211187, + "grad_norm": 1.5028783082962036, + "learning_rate": 4.976776682776288e-06, + "loss": 5.3451, + "step": 8252 + }, + { + "epoch": 0.7958534233365477, + "grad_norm": 2.2754452228546143, + "learning_rate": 4.9722427458656075e-06, + "loss": 5.4044, + "step": 8253 + }, + { + "epoch": 0.7959498553519768, + "grad_norm": 1.885406255722046, + "learning_rate": 4.9677106470778365e-06, + "loss": 5.5227, + "step": 8254 + }, + { + "epoch": 0.796046287367406, + "grad_norm": 1.7206393480300903, + "learning_rate": 4.9631803868289335e-06, + "loss": 5.3602, + "step": 8255 + }, + { + "epoch": 0.7961427193828351, + "grad_norm": 1.7598206996917725, + "learning_rate": 4.9586519655346605e-06, + "loss": 5.3229, + "step": 8256 + }, + { + "epoch": 0.7962391513982642, + "grad_norm": 2.0632224082946777, + "learning_rate": 4.954125383610656e-06, + "loss": 5.2287, + "step": 8257 + }, + { + "epoch": 0.7963355834136934, + "grad_norm": 1.5497208833694458, + "learning_rate": 4.949600641472346e-06, + "loss": 5.4141, + "step": 8258 + }, + { + "epoch": 0.7964320154291225, + "grad_norm": 2.1413910388946533, + "learning_rate": 4.945077739535009e-06, + "loss": 5.271, + "step": 8259 + }, + { + "epoch": 0.7965284474445516, + "grad_norm": 2.6742119789123535, + "learning_rate": 4.940556678213753e-06, + "loss": 5.3059, + "step": 8260 + }, + { + "epoch": 0.7966248794599807, + "grad_norm": 1.5744106769561768, + "learning_rate": 4.9360374579235205e-06, + "loss": 5.0164, + "step": 8261 + }, + { + "epoch": 0.7967213114754098, + "grad_norm": 1.3300763368606567, + "learning_rate": 4.931520079079072e-06, + "loss": 5.4229, + "step": 8262 + }, + { + "epoch": 0.796817743490839, + "grad_norm": 1.737249493598938, + "learning_rate": 4.927004542095009e-06, + "loss": 5.2934, + "step": 8263 + }, + { + "epoch": 0.7969141755062681, + "grad_norm": 1.6015474796295166, + "learning_rate": 4.922490847385766e-06, + "loss": 5.554, + "step": 8264 + }, + { + "epoch": 0.7970106075216972, + "grad_norm": 2.3364105224609375, + "learning_rate": 4.917978995365599e-06, + "loss": 5.1258, + "step": 8265 + }, + { + "epoch": 0.7971070395371264, + "grad_norm": 2.283517360687256, + "learning_rate": 4.913468986448614e-06, + "loss": 5.1148, + "step": 8266 + }, + { + "epoch": 0.7972034715525554, + "grad_norm": 2.293440103530884, + "learning_rate": 4.908960821048705e-06, + "loss": 5.0064, + "step": 8267 + }, + { + "epoch": 0.7972999035679845, + "grad_norm": 1.5322850942611694, + "learning_rate": 4.904454499579658e-06, + "loss": 5.2678, + "step": 8268 + }, + { + "epoch": 0.7973963355834137, + "grad_norm": 1.8664579391479492, + "learning_rate": 4.899950022455038e-06, + "loss": 5.3935, + "step": 8269 + }, + { + "epoch": 0.7974927675988428, + "grad_norm": 1.8277729749679565, + "learning_rate": 4.895447390088265e-06, + "loss": 5.2151, + "step": 8270 + }, + { + "epoch": 0.797589199614272, + "grad_norm": 1.595493197441101, + "learning_rate": 4.8909466028925876e-06, + "loss": 5.2905, + "step": 8271 + }, + { + "epoch": 0.7976856316297011, + "grad_norm": 2.1767737865448, + "learning_rate": 4.88644766128108e-06, + "loss": 5.3889, + "step": 8272 + }, + { + "epoch": 0.7977820636451302, + "grad_norm": 1.7958823442459106, + "learning_rate": 4.881950565666649e-06, + "loss": 5.3578, + "step": 8273 + }, + { + "epoch": 0.7978784956605594, + "grad_norm": 2.252964735031128, + "learning_rate": 4.87745531646204e-06, + "loss": 5.1521, + "step": 8274 + }, + { + "epoch": 0.7979749276759884, + "grad_norm": 1.897870421409607, + "learning_rate": 4.872961914079804e-06, + "loss": 5.5091, + "step": 8275 + }, + { + "epoch": 0.7980713596914175, + "grad_norm": 2.3863654136657715, + "learning_rate": 4.868470358932362e-06, + "loss": 5.4389, + "step": 8276 + }, + { + "epoch": 0.7981677917068467, + "grad_norm": 1.8569822311401367, + "learning_rate": 4.863980651431929e-06, + "loss": 5.317, + "step": 8277 + }, + { + "epoch": 0.7982642237222758, + "grad_norm": 2.185532808303833, + "learning_rate": 4.8594927919905665e-06, + "loss": 5.2965, + "step": 8278 + }, + { + "epoch": 0.7983606557377049, + "grad_norm": 2.0189929008483887, + "learning_rate": 4.855006781020166e-06, + "loss": 5.2601, + "step": 8279 + }, + { + "epoch": 0.7984570877531341, + "grad_norm": 1.6603074073791504, + "learning_rate": 4.850522618932449e-06, + "loss": 5.2519, + "step": 8280 + }, + { + "epoch": 0.7985535197685631, + "grad_norm": 2.6044909954071045, + "learning_rate": 4.846040306138969e-06, + "loss": 5.2981, + "step": 8281 + }, + { + "epoch": 0.7986499517839923, + "grad_norm": 2.8488354682922363, + "learning_rate": 4.8415598430510994e-06, + "loss": 5.4027, + "step": 8282 + }, + { + "epoch": 0.7987463837994214, + "grad_norm": 2.574483633041382, + "learning_rate": 4.8370812300800596e-06, + "loss": 5.4274, + "step": 8283 + }, + { + "epoch": 0.7988428158148505, + "grad_norm": 2.6465508937835693, + "learning_rate": 4.832604467636886e-06, + "loss": 5.4572, + "step": 8284 + }, + { + "epoch": 0.7989392478302797, + "grad_norm": 1.5481455326080322, + "learning_rate": 4.828129556132463e-06, + "loss": 5.3523, + "step": 8285 + }, + { + "epoch": 0.7990356798457088, + "grad_norm": 1.9618581533432007, + "learning_rate": 4.823656495977466e-06, + "loss": 5.3091, + "step": 8286 + }, + { + "epoch": 0.7991321118611379, + "grad_norm": 3.1020617485046387, + "learning_rate": 4.819185287582453e-06, + "loss": 5.2039, + "step": 8287 + }, + { + "epoch": 0.799228543876567, + "grad_norm": 2.4119184017181396, + "learning_rate": 4.814715931357774e-06, + "loss": 5.2568, + "step": 8288 + }, + { + "epoch": 0.7993249758919961, + "grad_norm": 2.181210517883301, + "learning_rate": 4.810248427713621e-06, + "loss": 5.3562, + "step": 8289 + }, + { + "epoch": 0.7994214079074252, + "grad_norm": 1.5284695625305176, + "learning_rate": 4.805782777060017e-06, + "loss": 5.2555, + "step": 8290 + }, + { + "epoch": 0.7995178399228544, + "grad_norm": 1.3345435857772827, + "learning_rate": 4.8013189798068205e-06, + "loss": 5.1989, + "step": 8291 + }, + { + "epoch": 0.7996142719382835, + "grad_norm": 2.2807230949401855, + "learning_rate": 4.7968570363636975e-06, + "loss": 5.1605, + "step": 8292 + }, + { + "epoch": 0.7997107039537127, + "grad_norm": 2.1053857803344727, + "learning_rate": 4.792396947140179e-06, + "loss": 5.2252, + "step": 8293 + }, + { + "epoch": 0.7998071359691418, + "grad_norm": 2.631824016571045, + "learning_rate": 4.787938712545587e-06, + "loss": 5.6668, + "step": 8294 + }, + { + "epoch": 0.7999035679845709, + "grad_norm": 1.5796315670013428, + "learning_rate": 4.783482332989114e-06, + "loss": 5.3607, + "step": 8295 + }, + { + "epoch": 0.8, + "grad_norm": 1.9434245824813843, + "learning_rate": 4.779027808879747e-06, + "loss": 4.9281, + "step": 8296 + }, + { + "epoch": 0.8000964320154291, + "grad_norm": 1.8969290256500244, + "learning_rate": 4.7745751406263165e-06, + "loss": 5.4772, + "step": 8297 + }, + { + "epoch": 0.8001928640308582, + "grad_norm": 1.5854851007461548, + "learning_rate": 4.770124328637488e-06, + "loss": 5.5172, + "step": 8298 + }, + { + "epoch": 0.8002892960462874, + "grad_norm": 1.4275331497192383, + "learning_rate": 4.765675373321746e-06, + "loss": 5.4171, + "step": 8299 + }, + { + "epoch": 0.8003857280617165, + "grad_norm": 1.462019920349121, + "learning_rate": 4.761228275087418e-06, + "loss": 5.4437, + "step": 8300 + }, + { + "epoch": 0.8004821600771456, + "grad_norm": 1.7195851802825928, + "learning_rate": 4.756783034342646e-06, + "loss": 5.5717, + "step": 8301 + }, + { + "epoch": 0.8005785920925748, + "grad_norm": 2.3961057662963867, + "learning_rate": 4.752339651495421e-06, + "loss": 5.5508, + "step": 8302 + }, + { + "epoch": 0.8006750241080038, + "grad_norm": 2.034095525741577, + "learning_rate": 4.747898126953535e-06, + "loss": 5.4831, + "step": 8303 + }, + { + "epoch": 0.800771456123433, + "grad_norm": 1.9915522336959839, + "learning_rate": 4.743458461124631e-06, + "loss": 5.3069, + "step": 8304 + }, + { + "epoch": 0.8008678881388621, + "grad_norm": 2.4703495502471924, + "learning_rate": 4.739020654416179e-06, + "loss": 5.2088, + "step": 8305 + }, + { + "epoch": 0.8009643201542912, + "grad_norm": 2.0650484561920166, + "learning_rate": 4.7345847072354735e-06, + "loss": 5.5477, + "step": 8306 + }, + { + "epoch": 0.8010607521697204, + "grad_norm": 1.7207677364349365, + "learning_rate": 4.73015061998964e-06, + "loss": 5.3209, + "step": 8307 + }, + { + "epoch": 0.8011571841851495, + "grad_norm": 1.4799929857254028, + "learning_rate": 4.725718393085635e-06, + "loss": 5.5778, + "step": 8308 + }, + { + "epoch": 0.8012536162005786, + "grad_norm": 2.313948631286621, + "learning_rate": 4.7212880269302425e-06, + "loss": 5.2824, + "step": 8309 + }, + { + "epoch": 0.8013500482160077, + "grad_norm": 1.9508775472640991, + "learning_rate": 4.716859521930083e-06, + "loss": 5.1181, + "step": 8310 + }, + { + "epoch": 0.8014464802314368, + "grad_norm": 1.784595251083374, + "learning_rate": 4.71243287849158e-06, + "loss": 5.3617, + "step": 8311 + }, + { + "epoch": 0.8015429122468659, + "grad_norm": 1.753365159034729, + "learning_rate": 4.70800809702103e-06, + "loss": 5.2528, + "step": 8312 + }, + { + "epoch": 0.8016393442622951, + "grad_norm": 1.5051957368850708, + "learning_rate": 4.703585177924514e-06, + "loss": 5.4087, + "step": 8313 + }, + { + "epoch": 0.8017357762777242, + "grad_norm": 2.101379156112671, + "learning_rate": 4.6991641216079685e-06, + "loss": 5.2237, + "step": 8314 + }, + { + "epoch": 0.8018322082931534, + "grad_norm": 1.9982516765594482, + "learning_rate": 4.6947449284771545e-06, + "loss": 5.2467, + "step": 8315 + }, + { + "epoch": 0.8019286403085825, + "grad_norm": 2.578899621963501, + "learning_rate": 4.6903275989376606e-06, + "loss": 5.1403, + "step": 8316 + }, + { + "epoch": 0.8020250723240115, + "grad_norm": 1.4210747480392456, + "learning_rate": 4.685912133394901e-06, + "loss": 5.3073, + "step": 8317 + }, + { + "epoch": 0.8021215043394407, + "grad_norm": 1.9051433801651, + "learning_rate": 4.68149853225413e-06, + "loss": 5.3432, + "step": 8318 + }, + { + "epoch": 0.8022179363548698, + "grad_norm": 1.4908527135849, + "learning_rate": 4.677086795920405e-06, + "loss": 5.3001, + "step": 8319 + }, + { + "epoch": 0.8023143683702989, + "grad_norm": 2.5710270404815674, + "learning_rate": 4.6726769247986495e-06, + "loss": 5.1285, + "step": 8320 + }, + { + "epoch": 0.8024108003857281, + "grad_norm": 1.8322161436080933, + "learning_rate": 4.668268919293584e-06, + "loss": 5.1026, + "step": 8321 + }, + { + "epoch": 0.8025072324011572, + "grad_norm": 1.5488042831420898, + "learning_rate": 4.663862779809769e-06, + "loss": 5.2775, + "step": 8322 + }, + { + "epoch": 0.8026036644165863, + "grad_norm": 1.8448419570922852, + "learning_rate": 4.659458506751602e-06, + "loss": 5.272, + "step": 8323 + }, + { + "epoch": 0.8027000964320155, + "grad_norm": 1.886016845703125, + "learning_rate": 4.655056100523297e-06, + "loss": 5.1741, + "step": 8324 + }, + { + "epoch": 0.8027965284474445, + "grad_norm": 1.5082029104232788, + "learning_rate": 4.650655561528902e-06, + "loss": 5.4213, + "step": 8325 + }, + { + "epoch": 0.8028929604628737, + "grad_norm": 1.805802345275879, + "learning_rate": 4.646256890172293e-06, + "loss": 5.3271, + "step": 8326 + }, + { + "epoch": 0.8029893924783028, + "grad_norm": 2.386030673980713, + "learning_rate": 4.641860086857178e-06, + "loss": 5.2855, + "step": 8327 + }, + { + "epoch": 0.8030858244937319, + "grad_norm": 3.6562511920928955, + "learning_rate": 4.6374651519870835e-06, + "loss": 5.202, + "step": 8328 + }, + { + "epoch": 0.8031822565091611, + "grad_norm": 2.584418773651123, + "learning_rate": 4.633072085965385e-06, + "loss": 5.467, + "step": 8329 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 1.6800307035446167, + "learning_rate": 4.628680889195247e-06, + "loss": 5.5267, + "step": 8330 + }, + { + "epoch": 0.8033751205400192, + "grad_norm": 1.9072834253311157, + "learning_rate": 4.624291562079719e-06, + "loss": 5.5564, + "step": 8331 + }, + { + "epoch": 0.8034715525554484, + "grad_norm": 2.1802399158477783, + "learning_rate": 4.619904105021624e-06, + "loss": 5.2704, + "step": 8332 + }, + { + "epoch": 0.8035679845708775, + "grad_norm": 2.4447078704833984, + "learning_rate": 4.615518518423651e-06, + "loss": 5.1735, + "step": 8333 + }, + { + "epoch": 0.8036644165863066, + "grad_norm": 2.029158353805542, + "learning_rate": 4.611134802688297e-06, + "loss": 5.5398, + "step": 8334 + }, + { + "epoch": 0.8037608486017358, + "grad_norm": 1.670362949371338, + "learning_rate": 4.6067529582178996e-06, + "loss": 5.5097, + "step": 8335 + }, + { + "epoch": 0.8038572806171649, + "grad_norm": 1.3384279012680054, + "learning_rate": 4.602372985414607e-06, + "loss": 5.2268, + "step": 8336 + }, + { + "epoch": 0.8039537126325941, + "grad_norm": 1.7458466291427612, + "learning_rate": 4.597994884680429e-06, + "loss": 5.3914, + "step": 8337 + }, + { + "epoch": 0.8040501446480232, + "grad_norm": 1.5138611793518066, + "learning_rate": 4.593618656417154e-06, + "loss": 5.4463, + "step": 8338 + }, + { + "epoch": 0.8041465766634522, + "grad_norm": 1.6070963144302368, + "learning_rate": 4.589244301026458e-06, + "loss": 5.3436, + "step": 8339 + }, + { + "epoch": 0.8042430086788814, + "grad_norm": 1.9965664148330688, + "learning_rate": 4.584871818909789e-06, + "loss": 5.1622, + "step": 8340 + }, + { + "epoch": 0.8043394406943105, + "grad_norm": 2.0178098678588867, + "learning_rate": 4.5805012104684604e-06, + "loss": 5.2043, + "step": 8341 + }, + { + "epoch": 0.8044358727097396, + "grad_norm": 1.9921296834945679, + "learning_rate": 4.576132476103598e-06, + "loss": 5.2368, + "step": 8342 + }, + { + "epoch": 0.8045323047251688, + "grad_norm": 2.0088868141174316, + "learning_rate": 4.571765616216158e-06, + "loss": 5.3383, + "step": 8343 + }, + { + "epoch": 0.8046287367405979, + "grad_norm": 1.8907527923583984, + "learning_rate": 4.5674006312069276e-06, + "loss": 5.604, + "step": 8344 + }, + { + "epoch": 0.804725168756027, + "grad_norm": 2.125643491744995, + "learning_rate": 4.563037521476518e-06, + "loss": 5.6776, + "step": 8345 + }, + { + "epoch": 0.8048216007714561, + "grad_norm": 1.9110143184661865, + "learning_rate": 4.558676287425378e-06, + "loss": 5.4483, + "step": 8346 + }, + { + "epoch": 0.8049180327868852, + "grad_norm": 1.9522032737731934, + "learning_rate": 4.554316929453756e-06, + "loss": 5.3301, + "step": 8347 + }, + { + "epoch": 0.8050144648023144, + "grad_norm": 1.6629881858825684, + "learning_rate": 4.549959447961777e-06, + "loss": 5.1497, + "step": 8348 + }, + { + "epoch": 0.8051108968177435, + "grad_norm": 1.950198769569397, + "learning_rate": 4.545603843349333e-06, + "loss": 5.2427, + "step": 8349 + }, + { + "epoch": 0.8052073288331726, + "grad_norm": 1.694627285003662, + "learning_rate": 4.541250116016207e-06, + "loss": 5.3832, + "step": 8350 + }, + { + "epoch": 0.8053037608486018, + "grad_norm": 2.0922000408172607, + "learning_rate": 4.536898266361961e-06, + "loss": 5.371, + "step": 8351 + }, + { + "epoch": 0.8054001928640309, + "grad_norm": 1.6638298034667969, + "learning_rate": 4.5325482947860045e-06, + "loss": 5.5579, + "step": 8352 + }, + { + "epoch": 0.8054966248794599, + "grad_norm": 1.5774444341659546, + "learning_rate": 4.528200201687574e-06, + "loss": 5.5749, + "step": 8353 + }, + { + "epoch": 0.8055930568948891, + "grad_norm": 1.7741270065307617, + "learning_rate": 4.523853987465737e-06, + "loss": 5.3369, + "step": 8354 + }, + { + "epoch": 0.8056894889103182, + "grad_norm": 1.697285771369934, + "learning_rate": 4.519509652519369e-06, + "loss": 5.4695, + "step": 8355 + }, + { + "epoch": 0.8057859209257473, + "grad_norm": 1.6511272192001343, + "learning_rate": 4.515167197247208e-06, + "loss": 5.4738, + "step": 8356 + }, + { + "epoch": 0.8058823529411765, + "grad_norm": 1.634980320930481, + "learning_rate": 4.510826622047784e-06, + "loss": 5.5278, + "step": 8357 + }, + { + "epoch": 0.8059787849566056, + "grad_norm": 1.831929087638855, + "learning_rate": 4.5064879273194745e-06, + "loss": 5.6491, + "step": 8358 + }, + { + "epoch": 0.8060752169720348, + "grad_norm": 1.7214475870132446, + "learning_rate": 4.502151113460479e-06, + "loss": 5.2073, + "step": 8359 + }, + { + "epoch": 0.8061716489874639, + "grad_norm": 1.5255870819091797, + "learning_rate": 4.497816180868827e-06, + "loss": 5.4141, + "step": 8360 + }, + { + "epoch": 0.8062680810028929, + "grad_norm": 1.3721483945846558, + "learning_rate": 4.493483129942369e-06, + "loss": 5.412, + "step": 8361 + }, + { + "epoch": 0.8063645130183221, + "grad_norm": 1.640354037284851, + "learning_rate": 4.489151961078799e-06, + "loss": 5.1388, + "step": 8362 + }, + { + "epoch": 0.8064609450337512, + "grad_norm": 1.8631590604782104, + "learning_rate": 4.484822674675604e-06, + "loss": 5.5571, + "step": 8363 + }, + { + "epoch": 0.8065573770491803, + "grad_norm": 2.900637149810791, + "learning_rate": 4.4804952711301475e-06, + "loss": 5.3107, + "step": 8364 + }, + { + "epoch": 0.8066538090646095, + "grad_norm": 2.548086166381836, + "learning_rate": 4.476169750839571e-06, + "loss": 5.3064, + "step": 8365 + }, + { + "epoch": 0.8067502410800386, + "grad_norm": 2.028170108795166, + "learning_rate": 4.471846114200876e-06, + "loss": 5.384, + "step": 8366 + }, + { + "epoch": 0.8068466730954676, + "grad_norm": 2.790290355682373, + "learning_rate": 4.467524361610876e-06, + "loss": 5.3439, + "step": 8367 + }, + { + "epoch": 0.8069431051108968, + "grad_norm": 1.8551281690597534, + "learning_rate": 4.463204493466222e-06, + "loss": 5.2721, + "step": 8368 + }, + { + "epoch": 0.8070395371263259, + "grad_norm": 2.1321818828582764, + "learning_rate": 4.458886510163379e-06, + "loss": 5.4466, + "step": 8369 + }, + { + "epoch": 0.8071359691417551, + "grad_norm": 2.132803440093994, + "learning_rate": 4.454570412098652e-06, + "loss": 5.3931, + "step": 8370 + }, + { + "epoch": 0.8072324011571842, + "grad_norm": 1.8760459423065186, + "learning_rate": 4.450256199668168e-06, + "loss": 5.4333, + "step": 8371 + }, + { + "epoch": 0.8073288331726133, + "grad_norm": 2.2348291873931885, + "learning_rate": 4.4459438732678745e-06, + "loss": 5.1631, + "step": 8372 + }, + { + "epoch": 0.8074252651880425, + "grad_norm": 1.6730623245239258, + "learning_rate": 4.44163343329356e-06, + "loss": 5.0672, + "step": 8373 + }, + { + "epoch": 0.8075216972034716, + "grad_norm": 2.7504119873046875, + "learning_rate": 4.4373248801408145e-06, + "loss": 5.4656, + "step": 8374 + }, + { + "epoch": 0.8076181292189006, + "grad_norm": 1.623645305633545, + "learning_rate": 4.4330182142050954e-06, + "loss": 5.4008, + "step": 8375 + }, + { + "epoch": 0.8077145612343298, + "grad_norm": 1.7359988689422607, + "learning_rate": 4.4287134358816454e-06, + "loss": 5.3819, + "step": 8376 + }, + { + "epoch": 0.8078109932497589, + "grad_norm": 1.475975513458252, + "learning_rate": 4.4244105455655546e-06, + "loss": 5.2296, + "step": 8377 + }, + { + "epoch": 0.807907425265188, + "grad_norm": 2.375635862350464, + "learning_rate": 4.420109543651743e-06, + "loss": 5.2899, + "step": 8378 + }, + { + "epoch": 0.8080038572806172, + "grad_norm": 2.897284746170044, + "learning_rate": 4.4158104305349505e-06, + "loss": 5.4413, + "step": 8379 + }, + { + "epoch": 0.8081002892960463, + "grad_norm": 2.4448719024658203, + "learning_rate": 4.411513206609732e-06, + "loss": 5.4794, + "step": 8380 + }, + { + "epoch": 0.8081967213114755, + "grad_norm": 2.1005470752716064, + "learning_rate": 4.407217872270503e-06, + "loss": 5.266, + "step": 8381 + }, + { + "epoch": 0.8082931533269045, + "grad_norm": 2.2180263996124268, + "learning_rate": 4.402924427911459e-06, + "loss": 5.2026, + "step": 8382 + }, + { + "epoch": 0.8083895853423336, + "grad_norm": 1.6899741888046265, + "learning_rate": 4.3986328739266725e-06, + "loss": 5.4346, + "step": 8383 + }, + { + "epoch": 0.8084860173577628, + "grad_norm": 1.2716604471206665, + "learning_rate": 4.394343210709998e-06, + "loss": 5.4691, + "step": 8384 + }, + { + "epoch": 0.8085824493731919, + "grad_norm": 1.4693126678466797, + "learning_rate": 4.390055438655141e-06, + "loss": 5.2957, + "step": 8385 + }, + { + "epoch": 0.808678881388621, + "grad_norm": 1.791194200515747, + "learning_rate": 4.385769558155631e-06, + "loss": 5.329, + "step": 8386 + }, + { + "epoch": 0.8087753134040502, + "grad_norm": 2.0176358222961426, + "learning_rate": 4.3814855696048204e-06, + "loss": 5.1613, + "step": 8387 + }, + { + "epoch": 0.8088717454194793, + "grad_norm": 1.6236380338668823, + "learning_rate": 4.377203473395886e-06, + "loss": 5.3044, + "step": 8388 + }, + { + "epoch": 0.8089681774349083, + "grad_norm": 1.610203504562378, + "learning_rate": 4.372923269921833e-06, + "loss": 5.6126, + "step": 8389 + }, + { + "epoch": 0.8090646094503375, + "grad_norm": 1.4778625965118408, + "learning_rate": 4.368644959575499e-06, + "loss": 5.3984, + "step": 8390 + }, + { + "epoch": 0.8091610414657666, + "grad_norm": 1.4511287212371826, + "learning_rate": 4.364368542749528e-06, + "loss": 5.0742, + "step": 8391 + }, + { + "epoch": 0.8092574734811958, + "grad_norm": 1.8950238227844238, + "learning_rate": 4.360094019836428e-06, + "loss": 5.225, + "step": 8392 + }, + { + "epoch": 0.8093539054966249, + "grad_norm": 1.4183170795440674, + "learning_rate": 4.355821391228479e-06, + "loss": 5.3616, + "step": 8393 + }, + { + "epoch": 0.809450337512054, + "grad_norm": 1.9486538171768188, + "learning_rate": 4.351550657317849e-06, + "loss": 5.426, + "step": 8394 + }, + { + "epoch": 0.8095467695274832, + "grad_norm": 2.711817741394043, + "learning_rate": 4.347281818496476e-06, + "loss": 5.4549, + "step": 8395 + }, + { + "epoch": 0.8096432015429122, + "grad_norm": 1.629373550415039, + "learning_rate": 4.343014875156162e-06, + "loss": 5.3373, + "step": 8396 + }, + { + "epoch": 0.8097396335583413, + "grad_norm": 1.5447078943252563, + "learning_rate": 4.338749827688515e-06, + "loss": 5.4051, + "step": 8397 + }, + { + "epoch": 0.8098360655737705, + "grad_norm": 1.9293932914733887, + "learning_rate": 4.334486676484986e-06, + "loss": 5.4973, + "step": 8398 + }, + { + "epoch": 0.8099324975891996, + "grad_norm": 1.7803810834884644, + "learning_rate": 4.330225421936823e-06, + "loss": 5.4276, + "step": 8399 + }, + { + "epoch": 0.8100289296046287, + "grad_norm": 1.9946227073669434, + "learning_rate": 4.325966064435144e-06, + "loss": 5.5247, + "step": 8400 + }, + { + "epoch": 0.8101253616200579, + "grad_norm": 2.272580146789551, + "learning_rate": 4.321708604370847e-06, + "loss": 5.5634, + "step": 8401 + }, + { + "epoch": 0.810221793635487, + "grad_norm": 1.8527711629867554, + "learning_rate": 4.317453042134684e-06, + "loss": 5.6563, + "step": 8402 + }, + { + "epoch": 0.8103182256509162, + "grad_norm": 1.4267529249191284, + "learning_rate": 4.313199378117222e-06, + "loss": 5.451, + "step": 8403 + }, + { + "epoch": 0.8104146576663452, + "grad_norm": 1.6892119646072388, + "learning_rate": 4.308947612708863e-06, + "loss": 4.9875, + "step": 8404 + }, + { + "epoch": 0.8105110896817743, + "grad_norm": 1.4486936330795288, + "learning_rate": 4.304697746299824e-06, + "loss": 5.4128, + "step": 8405 + }, + { + "epoch": 0.8106075216972035, + "grad_norm": 1.8472551107406616, + "learning_rate": 4.300449779280155e-06, + "loss": 5.5088, + "step": 8406 + }, + { + "epoch": 0.8107039537126326, + "grad_norm": 1.3316534757614136, + "learning_rate": 4.296203712039726e-06, + "loss": 5.2328, + "step": 8407 + }, + { + "epoch": 0.8108003857280617, + "grad_norm": 1.6534088850021362, + "learning_rate": 4.291959544968246e-06, + "loss": 5.2676, + "step": 8408 + }, + { + "epoch": 0.8108968177434909, + "grad_norm": 1.7725309133529663, + "learning_rate": 4.287717278455225e-06, + "loss": 5.1334, + "step": 8409 + }, + { + "epoch": 0.81099324975892, + "grad_norm": 1.612363338470459, + "learning_rate": 4.28347691289002e-06, + "loss": 5.2785, + "step": 8410 + }, + { + "epoch": 0.811089681774349, + "grad_norm": 2.1301462650299072, + "learning_rate": 4.279238448661807e-06, + "loss": 5.5394, + "step": 8411 + }, + { + "epoch": 0.8111861137897782, + "grad_norm": 1.4092590808868408, + "learning_rate": 4.275001886159585e-06, + "loss": 5.5318, + "step": 8412 + }, + { + "epoch": 0.8112825458052073, + "grad_norm": 1.5770865678787231, + "learning_rate": 4.27076722577218e-06, + "loss": 5.5859, + "step": 8413 + }, + { + "epoch": 0.8113789778206365, + "grad_norm": 2.027367353439331, + "learning_rate": 4.266534467888247e-06, + "loss": 5.596, + "step": 8414 + }, + { + "epoch": 0.8114754098360656, + "grad_norm": 1.693069577217102, + "learning_rate": 4.262303612896262e-06, + "loss": 5.6363, + "step": 8415 + }, + { + "epoch": 0.8115718418514947, + "grad_norm": 1.797494888305664, + "learning_rate": 4.258074661184527e-06, + "loss": 5.2553, + "step": 8416 + }, + { + "epoch": 0.8116682738669239, + "grad_norm": 1.9120694398880005, + "learning_rate": 4.253847613141176e-06, + "loss": 5.413, + "step": 8417 + }, + { + "epoch": 0.8117647058823529, + "grad_norm": 2.018561363220215, + "learning_rate": 4.2496224691541465e-06, + "loss": 5.0967, + "step": 8418 + }, + { + "epoch": 0.811861137897782, + "grad_norm": 1.6676373481750488, + "learning_rate": 4.245399229611238e-06, + "loss": 5.3166, + "step": 8419 + }, + { + "epoch": 0.8119575699132112, + "grad_norm": 1.7361814975738525, + "learning_rate": 4.241177894900037e-06, + "loss": 5.4996, + "step": 8420 + }, + { + "epoch": 0.8120540019286403, + "grad_norm": 1.610198736190796, + "learning_rate": 4.236958465407978e-06, + "loss": 5.1821, + "step": 8421 + }, + { + "epoch": 0.8121504339440694, + "grad_norm": 2.2631256580352783, + "learning_rate": 4.232740941522317e-06, + "loss": 5.3989, + "step": 8422 + }, + { + "epoch": 0.8122468659594986, + "grad_norm": 1.9212526082992554, + "learning_rate": 4.228525323630139e-06, + "loss": 5.0792, + "step": 8423 + }, + { + "epoch": 0.8123432979749277, + "grad_norm": 1.3736730813980103, + "learning_rate": 4.2243116121183285e-06, + "loss": 5.4568, + "step": 8424 + }, + { + "epoch": 0.8124397299903569, + "grad_norm": 1.6877923011779785, + "learning_rate": 4.22009980737364e-06, + "loss": 5.3492, + "step": 8425 + }, + { + "epoch": 0.8125361620057859, + "grad_norm": 1.8612737655639648, + "learning_rate": 4.215889909782603e-06, + "loss": 5.1999, + "step": 8426 + }, + { + "epoch": 0.812632594021215, + "grad_norm": 1.9246728420257568, + "learning_rate": 4.21168191973162e-06, + "loss": 5.2744, + "step": 8427 + }, + { + "epoch": 0.8127290260366442, + "grad_norm": 1.9836934804916382, + "learning_rate": 4.207475837606878e-06, + "loss": 5.3214, + "step": 8428 + }, + { + "epoch": 0.8128254580520733, + "grad_norm": 2.1023671627044678, + "learning_rate": 4.203271663794414e-06, + "loss": 5.4163, + "step": 8429 + }, + { + "epoch": 0.8129218900675024, + "grad_norm": 2.6604793071746826, + "learning_rate": 4.199069398680078e-06, + "loss": 5.4839, + "step": 8430 + }, + { + "epoch": 0.8130183220829316, + "grad_norm": 1.8466097116470337, + "learning_rate": 4.194869042649552e-06, + "loss": 5.421, + "step": 8431 + }, + { + "epoch": 0.8131147540983606, + "grad_norm": 2.1714675426483154, + "learning_rate": 4.190670596088339e-06, + "loss": 5.2397, + "step": 8432 + }, + { + "epoch": 0.8132111861137897, + "grad_norm": 1.7176117897033691, + "learning_rate": 4.186474059381768e-06, + "loss": 5.4048, + "step": 8433 + }, + { + "epoch": 0.8133076181292189, + "grad_norm": 1.4215129613876343, + "learning_rate": 4.182279432914993e-06, + "loss": 5.2753, + "step": 8434 + }, + { + "epoch": 0.813404050144648, + "grad_norm": 1.8749583959579468, + "learning_rate": 4.178086717072979e-06, + "loss": 5.3957, + "step": 8435 + }, + { + "epoch": 0.8135004821600772, + "grad_norm": 1.8011828660964966, + "learning_rate": 4.173895912240549e-06, + "loss": 5.1571, + "step": 8436 + }, + { + "epoch": 0.8135969141755063, + "grad_norm": 1.4105064868927002, + "learning_rate": 4.16970701880231e-06, + "loss": 5.193, + "step": 8437 + }, + { + "epoch": 0.8136933461909354, + "grad_norm": 1.4831466674804688, + "learning_rate": 4.165520037142731e-06, + "loss": 5.4057, + "step": 8438 + }, + { + "epoch": 0.8137897782063646, + "grad_norm": 1.456889271736145, + "learning_rate": 4.1613349676460775e-06, + "loss": 5.5372, + "step": 8439 + }, + { + "epoch": 0.8138862102217936, + "grad_norm": 1.6181327104568481, + "learning_rate": 4.157151810696453e-06, + "loss": 5.0205, + "step": 8440 + }, + { + "epoch": 0.8139826422372227, + "grad_norm": 1.6165015697479248, + "learning_rate": 4.152970566677777e-06, + "loss": 5.3761, + "step": 8441 + }, + { + "epoch": 0.8140790742526519, + "grad_norm": 1.761871099472046, + "learning_rate": 4.148791235973815e-06, + "loss": 5.3423, + "step": 8442 + }, + { + "epoch": 0.814175506268081, + "grad_norm": 1.4200199842453003, + "learning_rate": 4.1446138189681155e-06, + "loss": 5.6138, + "step": 8443 + }, + { + "epoch": 0.8142719382835101, + "grad_norm": 1.5783532857894897, + "learning_rate": 4.140438316044104e-06, + "loss": 5.3656, + "step": 8444 + }, + { + "epoch": 0.8143683702989393, + "grad_norm": 2.2382822036743164, + "learning_rate": 4.136264727584982e-06, + "loss": 5.5628, + "step": 8445 + }, + { + "epoch": 0.8144648023143684, + "grad_norm": 2.3198659420013428, + "learning_rate": 4.1320930539738066e-06, + "loss": 5.4671, + "step": 8446 + }, + { + "epoch": 0.8145612343297975, + "grad_norm": 1.41427481174469, + "learning_rate": 4.127923295593444e-06, + "loss": 5.226, + "step": 8447 + }, + { + "epoch": 0.8146576663452266, + "grad_norm": 1.5489786863327026, + "learning_rate": 4.123755452826594e-06, + "loss": 5.4498, + "step": 8448 + }, + { + "epoch": 0.8147540983606557, + "grad_norm": 1.5151747465133667, + "learning_rate": 4.119589526055773e-06, + "loss": 5.1261, + "step": 8449 + }, + { + "epoch": 0.8148505303760849, + "grad_norm": 1.6481211185455322, + "learning_rate": 4.115425515663327e-06, + "loss": 5.0574, + "step": 8450 + }, + { + "epoch": 0.814946962391514, + "grad_norm": 1.5244829654693604, + "learning_rate": 4.1112634220314194e-06, + "loss": 5.3026, + "step": 8451 + }, + { + "epoch": 0.8150433944069431, + "grad_norm": 1.4081951379776, + "learning_rate": 4.107103245542049e-06, + "loss": 5.4497, + "step": 8452 + }, + { + "epoch": 0.8151398264223723, + "grad_norm": 1.2846310138702393, + "learning_rate": 4.1029449865770315e-06, + "loss": 5.4563, + "step": 8453 + }, + { + "epoch": 0.8152362584378013, + "grad_norm": 1.8077144622802734, + "learning_rate": 4.098788645517989e-06, + "loss": 5.4022, + "step": 8454 + }, + { + "epoch": 0.8153326904532304, + "grad_norm": 1.7855055332183838, + "learning_rate": 4.094634222746416e-06, + "loss": 5.2251, + "step": 8455 + }, + { + "epoch": 0.8154291224686596, + "grad_norm": 1.4216816425323486, + "learning_rate": 4.090481718643574e-06, + "loss": 5.4688, + "step": 8456 + }, + { + "epoch": 0.8155255544840887, + "grad_norm": 1.6669899225234985, + "learning_rate": 4.086331133590587e-06, + "loss": 5.358, + "step": 8457 + }, + { + "epoch": 0.8156219864995179, + "grad_norm": 1.4941524267196655, + "learning_rate": 4.082182467968387e-06, + "loss": 5.4207, + "step": 8458 + }, + { + "epoch": 0.815718418514947, + "grad_norm": 1.5840681791305542, + "learning_rate": 4.078035722157733e-06, + "loss": 5.4805, + "step": 8459 + }, + { + "epoch": 0.8158148505303761, + "grad_norm": 1.1502903699874878, + "learning_rate": 4.073890896539212e-06, + "loss": 5.3268, + "step": 8460 + }, + { + "epoch": 0.8159112825458052, + "grad_norm": 1.499258279800415, + "learning_rate": 4.0697479914932335e-06, + "loss": 5.3783, + "step": 8461 + }, + { + "epoch": 0.8160077145612343, + "grad_norm": 1.6802701950073242, + "learning_rate": 4.065607007400013e-06, + "loss": 5.102, + "step": 8462 + }, + { + "epoch": 0.8161041465766634, + "grad_norm": 1.6389330625534058, + "learning_rate": 4.061467944639624e-06, + "loss": 5.4557, + "step": 8463 + }, + { + "epoch": 0.8162005785920926, + "grad_norm": 1.6418476104736328, + "learning_rate": 4.057330803591933e-06, + "loss": 5.0726, + "step": 8464 + }, + { + "epoch": 0.8162970106075217, + "grad_norm": 1.8326398134231567, + "learning_rate": 4.053195584636643e-06, + "loss": 5.437, + "step": 8465 + }, + { + "epoch": 0.8163934426229508, + "grad_norm": 1.772873878479004, + "learning_rate": 4.049062288153283e-06, + "loss": 5.3654, + "step": 8466 + }, + { + "epoch": 0.81648987463838, + "grad_norm": 1.6214743852615356, + "learning_rate": 4.044930914521202e-06, + "loss": 5.3137, + "step": 8467 + }, + { + "epoch": 0.816586306653809, + "grad_norm": 2.488140106201172, + "learning_rate": 4.040801464119562e-06, + "loss": 5.211, + "step": 8468 + }, + { + "epoch": 0.8166827386692382, + "grad_norm": 2.503756046295166, + "learning_rate": 4.036673937327376e-06, + "loss": 5.2359, + "step": 8469 + }, + { + "epoch": 0.8167791706846673, + "grad_norm": 1.6002191305160522, + "learning_rate": 4.032548334523445e-06, + "loss": 5.2701, + "step": 8470 + }, + { + "epoch": 0.8168756027000964, + "grad_norm": 2.3097035884857178, + "learning_rate": 4.0284246560864295e-06, + "loss": 5.05, + "step": 8471 + }, + { + "epoch": 0.8169720347155256, + "grad_norm": 2.1519181728363037, + "learning_rate": 4.024302902394786e-06, + "loss": 5.38, + "step": 8472 + }, + { + "epoch": 0.8170684667309547, + "grad_norm": 1.6291686296463013, + "learning_rate": 4.020183073826802e-06, + "loss": 5.3637, + "step": 8473 + }, + { + "epoch": 0.8171648987463838, + "grad_norm": 1.6962943077087402, + "learning_rate": 4.016065170760594e-06, + "loss": 5.3977, + "step": 8474 + }, + { + "epoch": 0.817261330761813, + "grad_norm": 1.474258542060852, + "learning_rate": 4.011949193574099e-06, + "loss": 5.0596, + "step": 8475 + }, + { + "epoch": 0.817357762777242, + "grad_norm": 1.435381293296814, + "learning_rate": 4.007835142645072e-06, + "loss": 5.2282, + "step": 8476 + }, + { + "epoch": 0.8174541947926711, + "grad_norm": 1.6384496688842773, + "learning_rate": 4.003723018351102e-06, + "loss": 5.2998, + "step": 8477 + }, + { + "epoch": 0.8175506268081003, + "grad_norm": 1.4228888750076294, + "learning_rate": 3.999612821069595e-06, + "loss": 5.3987, + "step": 8478 + }, + { + "epoch": 0.8176470588235294, + "grad_norm": 1.5355722904205322, + "learning_rate": 3.9955045511777645e-06, + "loss": 5.3092, + "step": 8479 + }, + { + "epoch": 0.8177434908389586, + "grad_norm": 1.730719804763794, + "learning_rate": 3.991398209052686e-06, + "loss": 5.4488, + "step": 8480 + }, + { + "epoch": 0.8178399228543877, + "grad_norm": 1.527720332145691, + "learning_rate": 3.987293795071209e-06, + "loss": 5.415, + "step": 8481 + }, + { + "epoch": 0.8179363548698168, + "grad_norm": 1.1284018754959106, + "learning_rate": 3.98319130961006e-06, + "loss": 5.3646, + "step": 8482 + }, + { + "epoch": 0.8180327868852459, + "grad_norm": 1.523292064666748, + "learning_rate": 3.9790907530457385e-06, + "loss": 5.307, + "step": 8483 + }, + { + "epoch": 0.818129218900675, + "grad_norm": 2.2117764949798584, + "learning_rate": 3.974992125754592e-06, + "loss": 5.4552, + "step": 8484 + }, + { + "epoch": 0.8182256509161041, + "grad_norm": 1.5934650897979736, + "learning_rate": 3.970895428112795e-06, + "loss": 5.4495, + "step": 8485 + }, + { + "epoch": 0.8183220829315333, + "grad_norm": 2.4011406898498535, + "learning_rate": 3.966800660496337e-06, + "loss": 5.6129, + "step": 8486 + }, + { + "epoch": 0.8184185149469624, + "grad_norm": 1.478804111480713, + "learning_rate": 3.9627078232810145e-06, + "loss": 5.3497, + "step": 8487 + }, + { + "epoch": 0.8185149469623915, + "grad_norm": 1.6074410676956177, + "learning_rate": 3.9586169168424865e-06, + "loss": 5.4799, + "step": 8488 + }, + { + "epoch": 0.8186113789778207, + "grad_norm": 1.5906213521957397, + "learning_rate": 3.954527941556196e-06, + "loss": 5.4668, + "step": 8489 + }, + { + "epoch": 0.8187078109932497, + "grad_norm": 2.2089571952819824, + "learning_rate": 3.950440897797428e-06, + "loss": 5.5029, + "step": 8490 + }, + { + "epoch": 0.8188042430086789, + "grad_norm": 1.7660610675811768, + "learning_rate": 3.946355785941286e-06, + "loss": 5.4963, + "step": 8491 + }, + { + "epoch": 0.818900675024108, + "grad_norm": 1.6686534881591797, + "learning_rate": 3.9422726063626956e-06, + "loss": 5.144, + "step": 8492 + }, + { + "epoch": 0.8189971070395371, + "grad_norm": 1.2829374074935913, + "learning_rate": 3.938191359436408e-06, + "loss": 5.1525, + "step": 8493 + }, + { + "epoch": 0.8190935390549663, + "grad_norm": 1.792547583580017, + "learning_rate": 3.934112045536997e-06, + "loss": 5.1718, + "step": 8494 + }, + { + "epoch": 0.8191899710703954, + "grad_norm": 1.7005075216293335, + "learning_rate": 3.930034665038854e-06, + "loss": 5.307, + "step": 8495 + }, + { + "epoch": 0.8192864030858245, + "grad_norm": 1.7884712219238281, + "learning_rate": 3.925959218316194e-06, + "loss": 5.4712, + "step": 8496 + }, + { + "epoch": 0.8193828351012536, + "grad_norm": 1.7676575183868408, + "learning_rate": 3.921885705743067e-06, + "loss": 5.1341, + "step": 8497 + }, + { + "epoch": 0.8194792671166827, + "grad_norm": 1.2784380912780762, + "learning_rate": 3.917814127693314e-06, + "loss": 5.4096, + "step": 8498 + }, + { + "epoch": 0.8195756991321118, + "grad_norm": 1.8464665412902832, + "learning_rate": 3.913744484540646e-06, + "loss": 5.4233, + "step": 8499 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 1.8958032131195068, + "learning_rate": 3.90967677665855e-06, + "loss": 5.592, + "step": 8500 + }, + { + "epoch": 0.8197685631629701, + "grad_norm": 1.9058643579483032, + "learning_rate": 3.90561100442036e-06, + "loss": 5.476, + "step": 8501 + }, + { + "epoch": 0.8198649951783993, + "grad_norm": 2.376222848892212, + "learning_rate": 3.90154716819923e-06, + "loss": 5.3365, + "step": 8502 + }, + { + "epoch": 0.8199614271938284, + "grad_norm": 2.3367671966552734, + "learning_rate": 3.897485268368134e-06, + "loss": 5.4168, + "step": 8503 + }, + { + "epoch": 0.8200578592092574, + "grad_norm": 1.8440359830856323, + "learning_rate": 3.893425305299864e-06, + "loss": 5.527, + "step": 8504 + }, + { + "epoch": 0.8201542912246866, + "grad_norm": 1.445265531539917, + "learning_rate": 3.889367279367051e-06, + "loss": 5.3331, + "step": 8505 + }, + { + "epoch": 0.8202507232401157, + "grad_norm": 1.7062791585922241, + "learning_rate": 3.885311190942112e-06, + "loss": 5.2689, + "step": 8506 + }, + { + "epoch": 0.8203471552555448, + "grad_norm": 1.4480444192886353, + "learning_rate": 3.881257040397338e-06, + "loss": 5.6354, + "step": 8507 + }, + { + "epoch": 0.820443587270974, + "grad_norm": 2.0283937454223633, + "learning_rate": 3.877204828104792e-06, + "loss": 5.2308, + "step": 8508 + }, + { + "epoch": 0.8205400192864031, + "grad_norm": 1.4375903606414795, + "learning_rate": 3.873154554436387e-06, + "loss": 5.3278, + "step": 8509 + }, + { + "epoch": 0.8206364513018322, + "grad_norm": 1.8892368078231812, + "learning_rate": 3.8691062197638574e-06, + "loss": 5.1355, + "step": 8510 + }, + { + "epoch": 0.8207328833172614, + "grad_norm": 1.577386498451233, + "learning_rate": 3.86505982445875e-06, + "loss": 5.3593, + "step": 8511 + }, + { + "epoch": 0.8208293153326904, + "grad_norm": 1.8832910060882568, + "learning_rate": 3.86101536889244e-06, + "loss": 5.3398, + "step": 8512 + }, + { + "epoch": 0.8209257473481196, + "grad_norm": 1.605860710144043, + "learning_rate": 3.856972853436125e-06, + "loss": 5.2697, + "step": 8513 + }, + { + "epoch": 0.8210221793635487, + "grad_norm": 1.6351351737976074, + "learning_rate": 3.852932278460808e-06, + "loss": 5.1828, + "step": 8514 + }, + { + "epoch": 0.8211186113789778, + "grad_norm": 3.092255115509033, + "learning_rate": 3.848893644337351e-06, + "loss": 5.0725, + "step": 8515 + }, + { + "epoch": 0.821215043394407, + "grad_norm": 2.229773998260498, + "learning_rate": 3.844856951436399e-06, + "loss": 5.0268, + "step": 8516 + }, + { + "epoch": 0.8213114754098361, + "grad_norm": 1.8128174543380737, + "learning_rate": 3.840822200128435e-06, + "loss": 5.3321, + "step": 8517 + }, + { + "epoch": 0.8214079074252651, + "grad_norm": 1.6126933097839355, + "learning_rate": 3.836789390783768e-06, + "loss": 5.4153, + "step": 8518 + }, + { + "epoch": 0.8215043394406943, + "grad_norm": 1.924379587173462, + "learning_rate": 3.832758523772522e-06, + "loss": 5.3106, + "step": 8519 + }, + { + "epoch": 0.8216007714561234, + "grad_norm": 1.9546083211898804, + "learning_rate": 3.828729599464645e-06, + "loss": 5.4437, + "step": 8520 + }, + { + "epoch": 0.8216972034715525, + "grad_norm": 1.9502668380737305, + "learning_rate": 3.824702618229911e-06, + "loss": 5.3606, + "step": 8521 + }, + { + "epoch": 0.8217936354869817, + "grad_norm": 1.7339500188827515, + "learning_rate": 3.820677580437912e-06, + "loss": 5.2539, + "step": 8522 + }, + { + "epoch": 0.8218900675024108, + "grad_norm": 1.2330776453018188, + "learning_rate": 3.816654486458046e-06, + "loss": 5.2815, + "step": 8523 + }, + { + "epoch": 0.82198649951784, + "grad_norm": 1.57437002658844, + "learning_rate": 3.8126333366595714e-06, + "loss": 5.332, + "step": 8524 + }, + { + "epoch": 0.8220829315332691, + "grad_norm": 2.434471607208252, + "learning_rate": 3.808614131411517e-06, + "loss": 5.1178, + "step": 8525 + }, + { + "epoch": 0.8221793635486981, + "grad_norm": 2.3072736263275146, + "learning_rate": 3.8045968710827913e-06, + "loss": 5.1497, + "step": 8526 + }, + { + "epoch": 0.8222757955641273, + "grad_norm": 2.094449758529663, + "learning_rate": 3.8005815560420684e-06, + "loss": 5.0715, + "step": 8527 + }, + { + "epoch": 0.8223722275795564, + "grad_norm": 2.162595510482788, + "learning_rate": 3.7965681866578784e-06, + "loss": 5.3844, + "step": 8528 + }, + { + "epoch": 0.8224686595949855, + "grad_norm": 1.332236409187317, + "learning_rate": 3.7925567632985633e-06, + "loss": 5.1436, + "step": 8529 + }, + { + "epoch": 0.8225650916104147, + "grad_norm": 1.4866318702697754, + "learning_rate": 3.788547286332292e-06, + "loss": 5.3219, + "step": 8530 + }, + { + "epoch": 0.8226615236258438, + "grad_norm": 1.9960576295852661, + "learning_rate": 3.784539756127034e-06, + "loss": 5.1915, + "step": 8531 + }, + { + "epoch": 0.8227579556412729, + "grad_norm": 1.3756725788116455, + "learning_rate": 3.780534173050615e-06, + "loss": 5.5281, + "step": 8532 + }, + { + "epoch": 0.822854387656702, + "grad_norm": 1.3162708282470703, + "learning_rate": 3.776530537470646e-06, + "loss": 5.4345, + "step": 8533 + }, + { + "epoch": 0.8229508196721311, + "grad_norm": 1.4289692640304565, + "learning_rate": 3.7725288497545834e-06, + "loss": 5.3477, + "step": 8534 + }, + { + "epoch": 0.8230472516875603, + "grad_norm": 1.8939791917800903, + "learning_rate": 3.7685291102696973e-06, + "loss": 5.2202, + "step": 8535 + }, + { + "epoch": 0.8231436837029894, + "grad_norm": 1.3407301902770996, + "learning_rate": 3.7645313193830773e-06, + "loss": 5.1692, + "step": 8536 + }, + { + "epoch": 0.8232401157184185, + "grad_norm": 1.4706910848617554, + "learning_rate": 3.760535477461635e-06, + "loss": 5.2448, + "step": 8537 + }, + { + "epoch": 0.8233365477338477, + "grad_norm": 2.1589572429656982, + "learning_rate": 3.756541584872106e-06, + "loss": 5.5339, + "step": 8538 + }, + { + "epoch": 0.8234329797492768, + "grad_norm": 1.8160746097564697, + "learning_rate": 3.752549641981046e-06, + "loss": 5.5208, + "step": 8539 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 1.991275429725647, + "learning_rate": 3.748559649154826e-06, + "loss": 5.2015, + "step": 8540 + }, + { + "epoch": 0.823625843780135, + "grad_norm": 1.5736563205718994, + "learning_rate": 3.7445716067596503e-06, + "loss": 5.3865, + "step": 8541 + }, + { + "epoch": 0.8237222757955641, + "grad_norm": 1.5982089042663574, + "learning_rate": 3.740585515161521e-06, + "loss": 5.3511, + "step": 8542 + }, + { + "epoch": 0.8238187078109932, + "grad_norm": 1.6349139213562012, + "learning_rate": 3.7366013747262978e-06, + "loss": 5.2285, + "step": 8543 + }, + { + "epoch": 0.8239151398264224, + "grad_norm": 2.2540194988250732, + "learning_rate": 3.7326191858196247e-06, + "loss": 5.2556, + "step": 8544 + }, + { + "epoch": 0.8240115718418515, + "grad_norm": 1.5831221342086792, + "learning_rate": 3.7286389488069865e-06, + "loss": 5.2942, + "step": 8545 + }, + { + "epoch": 0.8241080038572807, + "grad_norm": 3.0762808322906494, + "learning_rate": 3.7246606640536834e-06, + "loss": 5.3, + "step": 8546 + }, + { + "epoch": 0.8242044358727098, + "grad_norm": 1.4892032146453857, + "learning_rate": 3.7206843319248395e-06, + "loss": 5.3298, + "step": 8547 + }, + { + "epoch": 0.8243008678881388, + "grad_norm": 1.302190899848938, + "learning_rate": 3.7167099527853995e-06, + "loss": 5.2188, + "step": 8548 + }, + { + "epoch": 0.824397299903568, + "grad_norm": 1.60276460647583, + "learning_rate": 3.712737527000129e-06, + "loss": 5.4145, + "step": 8549 + }, + { + "epoch": 0.8244937319189971, + "grad_norm": 1.9335486888885498, + "learning_rate": 3.7087670549335956e-06, + "loss": 5.4391, + "step": 8550 + }, + { + "epoch": 0.8245901639344262, + "grad_norm": 1.5555092096328735, + "learning_rate": 3.704798536950227e-06, + "loss": 5.1606, + "step": 8551 + }, + { + "epoch": 0.8246865959498554, + "grad_norm": 1.7132959365844727, + "learning_rate": 3.7008319734142343e-06, + "loss": 5.2565, + "step": 8552 + }, + { + "epoch": 0.8247830279652845, + "grad_norm": 2.9777865409851074, + "learning_rate": 3.6968673646896685e-06, + "loss": 5.1893, + "step": 8553 + }, + { + "epoch": 0.8248794599807135, + "grad_norm": 2.7174644470214844, + "learning_rate": 3.6929047111403962e-06, + "loss": 5.1653, + "step": 8554 + }, + { + "epoch": 0.8249758919961427, + "grad_norm": 2.3112637996673584, + "learning_rate": 3.6889440131301024e-06, + "loss": 5.1694, + "step": 8555 + }, + { + "epoch": 0.8250723240115718, + "grad_norm": 2.2156829833984375, + "learning_rate": 3.684985271022301e-06, + "loss": 5.1486, + "step": 8556 + }, + { + "epoch": 0.825168756027001, + "grad_norm": 2.161741256713867, + "learning_rate": 3.681028485180313e-06, + "loss": 5.1301, + "step": 8557 + }, + { + "epoch": 0.8252651880424301, + "grad_norm": 1.7161362171173096, + "learning_rate": 3.677073655967295e-06, + "loss": 5.1987, + "step": 8558 + }, + { + "epoch": 0.8253616200578592, + "grad_norm": 2.110731363296509, + "learning_rate": 3.673120783746209e-06, + "loss": 5.3131, + "step": 8559 + }, + { + "epoch": 0.8254580520732884, + "grad_norm": 1.7631564140319824, + "learning_rate": 3.669169868879857e-06, + "loss": 5.3438, + "step": 8560 + }, + { + "epoch": 0.8255544840887175, + "grad_norm": 1.8265060186386108, + "learning_rate": 3.6652209117308266e-06, + "loss": 5.2688, + "step": 8561 + }, + { + "epoch": 0.8256509161041465, + "grad_norm": 1.5775563716888428, + "learning_rate": 3.6612739126615753e-06, + "loss": 5.4316, + "step": 8562 + }, + { + "epoch": 0.8257473481195757, + "grad_norm": 1.4082493782043457, + "learning_rate": 3.657328872034335e-06, + "loss": 5.2589, + "step": 8563 + }, + { + "epoch": 0.8258437801350048, + "grad_norm": 1.9559282064437866, + "learning_rate": 3.6533857902111835e-06, + "loss": 5.3569, + "step": 8564 + }, + { + "epoch": 0.8259402121504339, + "grad_norm": 2.1504666805267334, + "learning_rate": 3.6494446675540083e-06, + "loss": 5.1555, + "step": 8565 + }, + { + "epoch": 0.8260366441658631, + "grad_norm": 2.1694254875183105, + "learning_rate": 3.645505504424529e-06, + "loss": 5.3904, + "step": 8566 + }, + { + "epoch": 0.8261330761812922, + "grad_norm": 1.702282428741455, + "learning_rate": 3.6415683011842615e-06, + "loss": 5.2984, + "step": 8567 + }, + { + "epoch": 0.8262295081967214, + "grad_norm": 1.8527063131332397, + "learning_rate": 3.6376330581945783e-06, + "loss": 5.1947, + "step": 8568 + }, + { + "epoch": 0.8263259402121504, + "grad_norm": 1.6550989151000977, + "learning_rate": 3.6336997758166263e-06, + "loss": 5.1601, + "step": 8569 + }, + { + "epoch": 0.8264223722275795, + "grad_norm": 1.5771762132644653, + "learning_rate": 3.629768454411425e-06, + "loss": 5.0515, + "step": 8570 + }, + { + "epoch": 0.8265188042430087, + "grad_norm": 1.8359850645065308, + "learning_rate": 3.6258390943397667e-06, + "loss": 5.4036, + "step": 8571 + }, + { + "epoch": 0.8266152362584378, + "grad_norm": 2.200096368789673, + "learning_rate": 3.621911695962288e-06, + "loss": 5.206, + "step": 8572 + }, + { + "epoch": 0.8267116682738669, + "grad_norm": 1.4471431970596313, + "learning_rate": 3.6179862596394444e-06, + "loss": 5.4435, + "step": 8573 + }, + { + "epoch": 0.8268081002892961, + "grad_norm": 1.6323292255401611, + "learning_rate": 3.614062785731509e-06, + "loss": 5.2817, + "step": 8574 + }, + { + "epoch": 0.8269045323047252, + "grad_norm": 2.2107338905334473, + "learning_rate": 3.610141274598558e-06, + "loss": 5.274, + "step": 8575 + }, + { + "epoch": 0.8270009643201542, + "grad_norm": 1.4711014032363892, + "learning_rate": 3.6062217266005228e-06, + "loss": 5.0279, + "step": 8576 + }, + { + "epoch": 0.8270973963355834, + "grad_norm": 1.402116060256958, + "learning_rate": 3.6023041420971236e-06, + "loss": 5.4376, + "step": 8577 + }, + { + "epoch": 0.8271938283510125, + "grad_norm": 1.3867031335830688, + "learning_rate": 3.598388521447912e-06, + "loss": 5.3642, + "step": 8578 + }, + { + "epoch": 0.8272902603664417, + "grad_norm": 1.946570634841919, + "learning_rate": 3.5944748650122623e-06, + "loss": 5.207, + "step": 8579 + }, + { + "epoch": 0.8273866923818708, + "grad_norm": 1.6567718982696533, + "learning_rate": 3.5905631731493638e-06, + "loss": 5.283, + "step": 8580 + }, + { + "epoch": 0.8274831243972999, + "grad_norm": 1.6838682889938354, + "learning_rate": 3.5866534462182272e-06, + "loss": 5.3592, + "step": 8581 + }, + { + "epoch": 0.8275795564127291, + "grad_norm": 1.6713383197784424, + "learning_rate": 3.582745684577679e-06, + "loss": 5.4844, + "step": 8582 + }, + { + "epoch": 0.8276759884281581, + "grad_norm": 1.3187789916992188, + "learning_rate": 3.5788398885863744e-06, + "loss": 5.2986, + "step": 8583 + }, + { + "epoch": 0.8277724204435872, + "grad_norm": 1.9047659635543823, + "learning_rate": 3.574936058602779e-06, + "loss": 5.1325, + "step": 8584 + }, + { + "epoch": 0.8278688524590164, + "grad_norm": 1.7308496236801147, + "learning_rate": 3.5710341949851873e-06, + "loss": 5.2868, + "step": 8585 + }, + { + "epoch": 0.8279652844744455, + "grad_norm": 1.598899006843567, + "learning_rate": 3.56713429809169e-06, + "loss": 5.3578, + "step": 8586 + }, + { + "epoch": 0.8280617164898746, + "grad_norm": 1.3973374366760254, + "learning_rate": 3.56323636828024e-06, + "loss": 5.3749, + "step": 8587 + }, + { + "epoch": 0.8281581485053038, + "grad_norm": 1.4796127080917358, + "learning_rate": 3.559340405908565e-06, + "loss": 5.4533, + "step": 8588 + }, + { + "epoch": 0.8282545805207329, + "grad_norm": 1.5301483869552612, + "learning_rate": 3.5554464113342402e-06, + "loss": 5.211, + "step": 8589 + }, + { + "epoch": 0.8283510125361621, + "grad_norm": 1.3907175064086914, + "learning_rate": 3.5515543849146488e-06, + "loss": 5.3641, + "step": 8590 + }, + { + "epoch": 0.8284474445515911, + "grad_norm": 1.327587604522705, + "learning_rate": 3.547664327006997e-06, + "loss": 5.2575, + "step": 8591 + }, + { + "epoch": 0.8285438765670202, + "grad_norm": 1.5548031330108643, + "learning_rate": 3.5437762379683077e-06, + "loss": 5.2953, + "step": 8592 + }, + { + "epoch": 0.8286403085824494, + "grad_norm": 1.82388174533844, + "learning_rate": 3.5398901181554366e-06, + "loss": 5.2232, + "step": 8593 + }, + { + "epoch": 0.8287367405978785, + "grad_norm": 1.2289836406707764, + "learning_rate": 3.536005967925024e-06, + "loss": 5.3477, + "step": 8594 + }, + { + "epoch": 0.8288331726133076, + "grad_norm": 1.879461646080017, + "learning_rate": 3.532123787633576e-06, + "loss": 5.3639, + "step": 8595 + }, + { + "epoch": 0.8289296046287368, + "grad_norm": 1.5881009101867676, + "learning_rate": 3.52824357763738e-06, + "loss": 5.3536, + "step": 8596 + }, + { + "epoch": 0.8290260366441659, + "grad_norm": 1.9677038192749023, + "learning_rate": 3.5243653382925596e-06, + "loss": 5.4065, + "step": 8597 + }, + { + "epoch": 0.8291224686595949, + "grad_norm": 1.4255962371826172, + "learning_rate": 3.5204890699550577e-06, + "loss": 5.339, + "step": 8598 + }, + { + "epoch": 0.8292189006750241, + "grad_norm": 1.7717411518096924, + "learning_rate": 3.5166147729806335e-06, + "loss": 5.1313, + "step": 8599 + }, + { + "epoch": 0.8293153326904532, + "grad_norm": 1.723655104637146, + "learning_rate": 3.512742447724862e-06, + "loss": 5.4574, + "step": 8600 + }, + { + "epoch": 0.8294117647058824, + "grad_norm": 2.0263383388519287, + "learning_rate": 3.508872094543142e-06, + "loss": 5.3971, + "step": 8601 + }, + { + "epoch": 0.8295081967213115, + "grad_norm": 1.8939155340194702, + "learning_rate": 3.505003713790689e-06, + "loss": 5.5144, + "step": 8602 + }, + { + "epoch": 0.8296046287367406, + "grad_norm": 1.605765461921692, + "learning_rate": 3.501137305822541e-06, + "loss": 5.3506, + "step": 8603 + }, + { + "epoch": 0.8297010607521698, + "grad_norm": 1.4617685079574585, + "learning_rate": 3.4972728709935536e-06, + "loss": 5.2686, + "step": 8604 + }, + { + "epoch": 0.8297974927675988, + "grad_norm": 2.1938557624816895, + "learning_rate": 3.4934104096583875e-06, + "loss": 5.2551, + "step": 8605 + }, + { + "epoch": 0.8298939247830279, + "grad_norm": 1.6476824283599854, + "learning_rate": 3.4895499221715532e-06, + "loss": 5.2625, + "step": 8606 + }, + { + "epoch": 0.8299903567984571, + "grad_norm": 2.560152530670166, + "learning_rate": 3.4856914088873484e-06, + "loss": 5.7128, + "step": 8607 + }, + { + "epoch": 0.8300867888138862, + "grad_norm": 1.6636204719543457, + "learning_rate": 3.481834870159903e-06, + "loss": 5.3424, + "step": 8608 + }, + { + "epoch": 0.8301832208293153, + "grad_norm": 2.433943748474121, + "learning_rate": 3.4779803063431702e-06, + "loss": 5.2609, + "step": 8609 + }, + { + "epoch": 0.8302796528447445, + "grad_norm": 1.824268102645874, + "learning_rate": 3.4741277177909233e-06, + "loss": 5.3254, + "step": 8610 + }, + { + "epoch": 0.8303760848601736, + "grad_norm": 1.8062775135040283, + "learning_rate": 3.4702771048567285e-06, + "loss": 5.0812, + "step": 8611 + }, + { + "epoch": 0.8304725168756028, + "grad_norm": 2.721433639526367, + "learning_rate": 3.466428467894012e-06, + "loss": 5.4023, + "step": 8612 + }, + { + "epoch": 0.8305689488910318, + "grad_norm": 1.5723886489868164, + "learning_rate": 3.4625818072559766e-06, + "loss": 5.2747, + "step": 8613 + }, + { + "epoch": 0.8306653809064609, + "grad_norm": 1.7197296619415283, + "learning_rate": 3.4587371232956858e-06, + "loss": 5.2298, + "step": 8614 + }, + { + "epoch": 0.8307618129218901, + "grad_norm": 1.6937904357910156, + "learning_rate": 3.4548944163659837e-06, + "loss": 5.2307, + "step": 8615 + }, + { + "epoch": 0.8308582449373192, + "grad_norm": 1.7441610097885132, + "learning_rate": 3.4510536868195525e-06, + "loss": 5.1591, + "step": 8616 + }, + { + "epoch": 0.8309546769527483, + "grad_norm": 1.8434710502624512, + "learning_rate": 3.4472149350088937e-06, + "loss": 5.3838, + "step": 8617 + }, + { + "epoch": 0.8310511089681775, + "grad_norm": 1.4871487617492676, + "learning_rate": 3.4433781612863257e-06, + "loss": 5.57, + "step": 8618 + }, + { + "epoch": 0.8311475409836065, + "grad_norm": 1.9354028701782227, + "learning_rate": 3.4395433660039665e-06, + "loss": 5.3577, + "step": 8619 + }, + { + "epoch": 0.8312439729990356, + "grad_norm": 1.4551897048950195, + "learning_rate": 3.4357105495137933e-06, + "loss": 5.4503, + "step": 8620 + }, + { + "epoch": 0.8313404050144648, + "grad_norm": 1.6277092695236206, + "learning_rate": 3.4318797121675573e-06, + "loss": 5.5876, + "step": 8621 + }, + { + "epoch": 0.8314368370298939, + "grad_norm": 1.473209023475647, + "learning_rate": 3.4280508543168556e-06, + "loss": 5.2768, + "step": 8622 + }, + { + "epoch": 0.8315332690453231, + "grad_norm": 1.3579249382019043, + "learning_rate": 3.4242239763130955e-06, + "loss": 5.1877, + "step": 8623 + }, + { + "epoch": 0.8316297010607522, + "grad_norm": 1.5833014249801636, + "learning_rate": 3.420399078507505e-06, + "loss": 5.1515, + "step": 8624 + }, + { + "epoch": 0.8317261330761813, + "grad_norm": 2.0918023586273193, + "learning_rate": 3.4165761612511256e-06, + "loss": 5.141, + "step": 8625 + }, + { + "epoch": 0.8318225650916105, + "grad_norm": 1.5987786054611206, + "learning_rate": 3.412755224894823e-06, + "loss": 5.2896, + "step": 8626 + }, + { + "epoch": 0.8319189971070395, + "grad_norm": 1.4870052337646484, + "learning_rate": 3.4089362697892728e-06, + "loss": 5.578, + "step": 8627 + }, + { + "epoch": 0.8320154291224686, + "grad_norm": 1.6044206619262695, + "learning_rate": 3.4051192962849808e-06, + "loss": 5.4739, + "step": 8628 + }, + { + "epoch": 0.8321118611378978, + "grad_norm": 1.9497416019439697, + "learning_rate": 3.401304304732264e-06, + "loss": 5.336, + "step": 8629 + }, + { + "epoch": 0.8322082931533269, + "grad_norm": 1.6339755058288574, + "learning_rate": 3.3974912954812447e-06, + "loss": 4.9676, + "step": 8630 + }, + { + "epoch": 0.832304725168756, + "grad_norm": 1.3273961544036865, + "learning_rate": 3.393680268881896e-06, + "loss": 5.4334, + "step": 8631 + }, + { + "epoch": 0.8324011571841852, + "grad_norm": 1.4406189918518066, + "learning_rate": 3.3898712252839746e-06, + "loss": 5.636, + "step": 8632 + }, + { + "epoch": 0.8324975891996143, + "grad_norm": 1.4179598093032837, + "learning_rate": 3.3860641650370755e-06, + "loss": 5.4175, + "step": 8633 + }, + { + "epoch": 0.8325940212150434, + "grad_norm": 1.6809263229370117, + "learning_rate": 3.3822590884906024e-06, + "loss": 5.2277, + "step": 8634 + }, + { + "epoch": 0.8326904532304725, + "grad_norm": 1.8004729747772217, + "learning_rate": 3.3784559959937844e-06, + "loss": 5.4167, + "step": 8635 + }, + { + "epoch": 0.8327868852459016, + "grad_norm": 1.5567991733551025, + "learning_rate": 3.3746548878956645e-06, + "loss": 5.2751, + "step": 8636 + }, + { + "epoch": 0.8328833172613308, + "grad_norm": 1.632132887840271, + "learning_rate": 3.3708557645451053e-06, + "loss": 5.3146, + "step": 8637 + }, + { + "epoch": 0.8329797492767599, + "grad_norm": 1.6241663694381714, + "learning_rate": 3.3670586262907754e-06, + "loss": 5.3483, + "step": 8638 + }, + { + "epoch": 0.833076181292189, + "grad_norm": 1.3891810178756714, + "learning_rate": 3.3632634734811876e-06, + "loss": 5.288, + "step": 8639 + }, + { + "epoch": 0.8331726133076182, + "grad_norm": 1.429870367050171, + "learning_rate": 3.359470306464643e-06, + "loss": 5.3964, + "step": 8640 + }, + { + "epoch": 0.8332690453230472, + "grad_norm": 1.4185791015625, + "learning_rate": 3.3556791255892756e-06, + "loss": 5.3321, + "step": 8641 + }, + { + "epoch": 0.8333654773384763, + "grad_norm": 1.3524746894836426, + "learning_rate": 3.3518899312030423e-06, + "loss": 5.1479, + "step": 8642 + }, + { + "epoch": 0.8334619093539055, + "grad_norm": 1.3615882396697998, + "learning_rate": 3.3481027236537036e-06, + "loss": 5.1938, + "step": 8643 + }, + { + "epoch": 0.8335583413693346, + "grad_norm": 1.2675479650497437, + "learning_rate": 3.3443175032888492e-06, + "loss": 5.6772, + "step": 8644 + }, + { + "epoch": 0.8336547733847638, + "grad_norm": 1.2380472421646118, + "learning_rate": 3.340534270455881e-06, + "loss": 5.2402, + "step": 8645 + }, + { + "epoch": 0.8337512054001929, + "grad_norm": 1.3341552019119263, + "learning_rate": 3.336753025502018e-06, + "loss": 5.2859, + "step": 8646 + }, + { + "epoch": 0.833847637415622, + "grad_norm": 1.3842436075210571, + "learning_rate": 3.3329737687743e-06, + "loss": 5.2886, + "step": 8647 + }, + { + "epoch": 0.8339440694310511, + "grad_norm": 1.2200220823287964, + "learning_rate": 3.329196500619586e-06, + "loss": 5.3183, + "step": 8648 + }, + { + "epoch": 0.8340405014464802, + "grad_norm": 2.4158382415771484, + "learning_rate": 3.3254212213845366e-06, + "loss": 4.7756, + "step": 8649 + }, + { + "epoch": 0.8341369334619093, + "grad_norm": 2.437070608139038, + "learning_rate": 3.3216479314156586e-06, + "loss": 4.7362, + "step": 8650 + }, + { + "epoch": 0.8342333654773385, + "grad_norm": 1.8299586772918701, + "learning_rate": 3.3178766310592466e-06, + "loss": 4.8895, + "step": 8651 + }, + { + "epoch": 0.8343297974927676, + "grad_norm": 1.398386836051941, + "learning_rate": 3.3141073206614303e-06, + "loss": 5.2079, + "step": 8652 + }, + { + "epoch": 0.8344262295081967, + "grad_norm": 1.4229735136032104, + "learning_rate": 3.3103400005681547e-06, + "loss": 5.524, + "step": 8653 + }, + { + "epoch": 0.8345226615236259, + "grad_norm": 1.831843376159668, + "learning_rate": 3.3065746711251842e-06, + "loss": 5.3228, + "step": 8654 + }, + { + "epoch": 0.834619093539055, + "grad_norm": 1.3718652725219727, + "learning_rate": 3.3028113326780795e-06, + "loss": 5.4536, + "step": 8655 + }, + { + "epoch": 0.8347155255544841, + "grad_norm": 1.3890557289123535, + "learning_rate": 3.2990499855722555e-06, + "loss": 5.595, + "step": 8656 + }, + { + "epoch": 0.8348119575699132, + "grad_norm": 1.4703624248504639, + "learning_rate": 3.295290630152903e-06, + "loss": 5.2344, + "step": 8657 + }, + { + "epoch": 0.8349083895853423, + "grad_norm": 1.6085593700408936, + "learning_rate": 3.2915332667650735e-06, + "loss": 5.1184, + "step": 8658 + }, + { + "epoch": 0.8350048216007715, + "grad_norm": 1.5400810241699219, + "learning_rate": 3.2877778957535964e-06, + "loss": 5.3141, + "step": 8659 + }, + { + "epoch": 0.8351012536162006, + "grad_norm": 1.466532826423645, + "learning_rate": 3.2840245174631412e-06, + "loss": 5.1792, + "step": 8660 + }, + { + "epoch": 0.8351976856316297, + "grad_norm": 1.4729695320129395, + "learning_rate": 3.2802731322381875e-06, + "loss": 5.2877, + "step": 8661 + }, + { + "epoch": 0.8352941176470589, + "grad_norm": 1.5740758180618286, + "learning_rate": 3.276523740423032e-06, + "loss": 5.1029, + "step": 8662 + }, + { + "epoch": 0.8353905496624879, + "grad_norm": 1.8953076601028442, + "learning_rate": 3.2727763423617913e-06, + "loss": 5.0641, + "step": 8663 + }, + { + "epoch": 0.835486981677917, + "grad_norm": 1.8474618196487427, + "learning_rate": 3.2690309383983958e-06, + "loss": 5.2109, + "step": 8664 + }, + { + "epoch": 0.8355834136933462, + "grad_norm": 1.5422040224075317, + "learning_rate": 3.265287528876601e-06, + "loss": 5.2277, + "step": 8665 + }, + { + "epoch": 0.8356798457087753, + "grad_norm": 1.5352449417114258, + "learning_rate": 3.2615461141399546e-06, + "loss": 5.3806, + "step": 8666 + }, + { + "epoch": 0.8357762777242045, + "grad_norm": 2.475133180618286, + "learning_rate": 3.257806694531859e-06, + "loss": 5.5289, + "step": 8667 + }, + { + "epoch": 0.8358727097396336, + "grad_norm": 2.1723008155822754, + "learning_rate": 3.2540692703955017e-06, + "loss": 5.5737, + "step": 8668 + }, + { + "epoch": 0.8359691417550627, + "grad_norm": 2.013154983520508, + "learning_rate": 3.250333842073899e-06, + "loss": 5.1969, + "step": 8669 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 1.673035740852356, + "learning_rate": 3.246600409909889e-06, + "loss": 5.2826, + "step": 8670 + }, + { + "epoch": 0.8361620057859209, + "grad_norm": 2.350820541381836, + "learning_rate": 3.2428689742461188e-06, + "loss": 5.3598, + "step": 8671 + }, + { + "epoch": 0.83625843780135, + "grad_norm": 2.3700082302093506, + "learning_rate": 3.2391395354250564e-06, + "loss": 5.5979, + "step": 8672 + }, + { + "epoch": 0.8363548698167792, + "grad_norm": 1.3452765941619873, + "learning_rate": 3.2354120937889893e-06, + "loss": 5.4411, + "step": 8673 + }, + { + "epoch": 0.8364513018322083, + "grad_norm": 1.3715236186981201, + "learning_rate": 3.231686649680002e-06, + "loss": 5.2787, + "step": 8674 + }, + { + "epoch": 0.8365477338476374, + "grad_norm": 1.7246484756469727, + "learning_rate": 3.2279632034400293e-06, + "loss": 5.2404, + "step": 8675 + }, + { + "epoch": 0.8366441658630666, + "grad_norm": 1.4646024703979492, + "learning_rate": 3.2242417554107952e-06, + "loss": 5.3683, + "step": 8676 + }, + { + "epoch": 0.8367405978784956, + "grad_norm": 1.8963366746902466, + "learning_rate": 3.2205223059338486e-06, + "loss": 5.5574, + "step": 8677 + }, + { + "epoch": 0.8368370298939248, + "grad_norm": 1.49545156955719, + "learning_rate": 3.216804855350558e-06, + "loss": 5.5587, + "step": 8678 + }, + { + "epoch": 0.8369334619093539, + "grad_norm": 2.0483152866363525, + "learning_rate": 3.2130894040021094e-06, + "loss": 5.2794, + "step": 8679 + }, + { + "epoch": 0.837029893924783, + "grad_norm": 1.4102226495742798, + "learning_rate": 3.2093759522294986e-06, + "loss": 5.4925, + "step": 8680 + }, + { + "epoch": 0.8371263259402122, + "grad_norm": 1.5365780591964722, + "learning_rate": 3.205664500373551e-06, + "loss": 5.4394, + "step": 8681 + }, + { + "epoch": 0.8372227579556413, + "grad_norm": 1.3808708190917969, + "learning_rate": 3.2019550487748767e-06, + "loss": 5.4123, + "step": 8682 + }, + { + "epoch": 0.8373191899710704, + "grad_norm": 1.4638391733169556, + "learning_rate": 3.1982475977739513e-06, + "loss": 5.3138, + "step": 8683 + }, + { + "epoch": 0.8374156219864995, + "grad_norm": 1.7748236656188965, + "learning_rate": 3.1945421477110217e-06, + "loss": 5.112, + "step": 8684 + }, + { + "epoch": 0.8375120540019286, + "grad_norm": 1.4994903802871704, + "learning_rate": 3.1908386989261767e-06, + "loss": 5.2655, + "step": 8685 + }, + { + "epoch": 0.8376084860173577, + "grad_norm": 1.8089131116867065, + "learning_rate": 3.1871372517593118e-06, + "loss": 5.0461, + "step": 8686 + }, + { + "epoch": 0.8377049180327869, + "grad_norm": 1.5508267879486084, + "learning_rate": 3.1834378065501437e-06, + "loss": 5.3367, + "step": 8687 + }, + { + "epoch": 0.837801350048216, + "grad_norm": 1.4903358221054077, + "learning_rate": 3.1797403636382035e-06, + "loss": 5.3731, + "step": 8688 + }, + { + "epoch": 0.8378977820636452, + "grad_norm": 2.2309017181396484, + "learning_rate": 3.1760449233628335e-06, + "loss": 5.7165, + "step": 8689 + }, + { + "epoch": 0.8379942140790743, + "grad_norm": 2.172210693359375, + "learning_rate": 3.172351486063202e-06, + "loss": 5.75, + "step": 8690 + }, + { + "epoch": 0.8380906460945033, + "grad_norm": 1.8810527324676514, + "learning_rate": 3.1686600520782842e-06, + "loss": 5.7513, + "step": 8691 + }, + { + "epoch": 0.8381870781099325, + "grad_norm": 1.7981775999069214, + "learning_rate": 3.164970621746882e-06, + "loss": 5.7851, + "step": 8692 + }, + { + "epoch": 0.8382835101253616, + "grad_norm": 1.1915472745895386, + "learning_rate": 3.1612831954075912e-06, + "loss": 5.4128, + "step": 8693 + }, + { + "epoch": 0.8383799421407907, + "grad_norm": 2.130270004272461, + "learning_rate": 3.1575977733988597e-06, + "loss": 5.0069, + "step": 8694 + }, + { + "epoch": 0.8384763741562199, + "grad_norm": 2.032287359237671, + "learning_rate": 3.1539143560589154e-06, + "loss": 5.2608, + "step": 8695 + }, + { + "epoch": 0.838572806171649, + "grad_norm": 1.8962472677230835, + "learning_rate": 3.1502329437258256e-06, + "loss": 5.3403, + "step": 8696 + }, + { + "epoch": 0.8386692381870781, + "grad_norm": 2.91919207572937, + "learning_rate": 3.146553536737462e-06, + "loss": 5.2859, + "step": 8697 + }, + { + "epoch": 0.8387656702025073, + "grad_norm": 2.785006523132324, + "learning_rate": 3.1428761354315217e-06, + "loss": 5.2857, + "step": 8698 + }, + { + "epoch": 0.8388621022179363, + "grad_norm": 2.147364377975464, + "learning_rate": 3.1392007401454983e-06, + "loss": 5.26, + "step": 8699 + }, + { + "epoch": 0.8389585342333655, + "grad_norm": 1.5658804178237915, + "learning_rate": 3.1355273512167355e-06, + "loss": 5.2978, + "step": 8700 + }, + { + "epoch": 0.8390549662487946, + "grad_norm": 1.9132970571517944, + "learning_rate": 3.1318559689823528e-06, + "loss": 5.3581, + "step": 8701 + }, + { + "epoch": 0.8391513982642237, + "grad_norm": 2.045973539352417, + "learning_rate": 3.1281865937793228e-06, + "loss": 5.1651, + "step": 8702 + }, + { + "epoch": 0.8392478302796529, + "grad_norm": 3.69364070892334, + "learning_rate": 3.1245192259444e-06, + "loss": 4.9722, + "step": 8703 + }, + { + "epoch": 0.839344262295082, + "grad_norm": 1.9175302982330322, + "learning_rate": 3.1208538658141828e-06, + "loss": 5.1417, + "step": 8704 + }, + { + "epoch": 0.839440694310511, + "grad_norm": 1.8313301801681519, + "learning_rate": 3.1171905137250655e-06, + "loss": 5.3366, + "step": 8705 + }, + { + "epoch": 0.8395371263259402, + "grad_norm": 1.9773309230804443, + "learning_rate": 3.1135291700132714e-06, + "loss": 5.4796, + "step": 8706 + }, + { + "epoch": 0.8396335583413693, + "grad_norm": 1.9758732318878174, + "learning_rate": 3.1098698350148307e-06, + "loss": 5.2026, + "step": 8707 + }, + { + "epoch": 0.8397299903567984, + "grad_norm": 2.226059913635254, + "learning_rate": 3.106212509065595e-06, + "loss": 5.0529, + "step": 8708 + }, + { + "epoch": 0.8398264223722276, + "grad_norm": 2.9162683486938477, + "learning_rate": 3.102557192501232e-06, + "loss": 5.3544, + "step": 8709 + }, + { + "epoch": 0.8399228543876567, + "grad_norm": 1.9449362754821777, + "learning_rate": 3.0989038856572117e-06, + "loss": 5.2829, + "step": 8710 + }, + { + "epoch": 0.8400192864030859, + "grad_norm": 1.4504179954528809, + "learning_rate": 3.095252588868844e-06, + "loss": 5.3549, + "step": 8711 + }, + { + "epoch": 0.840115718418515, + "grad_norm": 1.8380835056304932, + "learning_rate": 3.0916033024712244e-06, + "loss": 5.2595, + "step": 8712 + }, + { + "epoch": 0.840212150433944, + "grad_norm": 1.9862850904464722, + "learning_rate": 3.087956026799299e-06, + "loss": 5.466, + "step": 8713 + }, + { + "epoch": 0.8403085824493732, + "grad_norm": 1.4714542627334595, + "learning_rate": 3.084310762187795e-06, + "loss": 5.5673, + "step": 8714 + }, + { + "epoch": 0.8404050144648023, + "grad_norm": 1.2500660419464111, + "learning_rate": 3.0806675089712743e-06, + "loss": 5.431, + "step": 8715 + }, + { + "epoch": 0.8405014464802314, + "grad_norm": 2.234236717224121, + "learning_rate": 3.077026267484112e-06, + "loss": 5.1954, + "step": 8716 + }, + { + "epoch": 0.8405978784956606, + "grad_norm": 1.5007071495056152, + "learning_rate": 3.0733870380605017e-06, + "loss": 5.4699, + "step": 8717 + }, + { + "epoch": 0.8406943105110897, + "grad_norm": 1.502801537513733, + "learning_rate": 3.0697498210344317e-06, + "loss": 5.4362, + "step": 8718 + }, + { + "epoch": 0.8407907425265188, + "grad_norm": 1.473844289779663, + "learning_rate": 3.066114616739743e-06, + "loss": 5.4998, + "step": 8719 + }, + { + "epoch": 0.840887174541948, + "grad_norm": 1.969283938407898, + "learning_rate": 3.06248142551005e-06, + "loss": 5.5932, + "step": 8720 + }, + { + "epoch": 0.840983606557377, + "grad_norm": 1.4135093688964844, + "learning_rate": 3.0588502476788127e-06, + "loss": 5.1516, + "step": 8721 + }, + { + "epoch": 0.8410800385728062, + "grad_norm": 1.3901300430297852, + "learning_rate": 3.055221083579296e-06, + "loss": 5.2027, + "step": 8722 + }, + { + "epoch": 0.8411764705882353, + "grad_norm": 1.2442997694015503, + "learning_rate": 3.0515939335445774e-06, + "loss": 4.9434, + "step": 8723 + }, + { + "epoch": 0.8412729026036644, + "grad_norm": 1.276995062828064, + "learning_rate": 3.0479687979075523e-06, + "loss": 5.2781, + "step": 8724 + }, + { + "epoch": 0.8413693346190936, + "grad_norm": 1.4660111665725708, + "learning_rate": 3.044345677000937e-06, + "loss": 5.3427, + "step": 8725 + }, + { + "epoch": 0.8414657666345227, + "grad_norm": 1.9774223566055298, + "learning_rate": 3.040724571157244e-06, + "loss": 5.2237, + "step": 8726 + }, + { + "epoch": 0.8415621986499517, + "grad_norm": 2.3179216384887695, + "learning_rate": 3.0371054807088324e-06, + "loss": 5.1166, + "step": 8727 + }, + { + "epoch": 0.8416586306653809, + "grad_norm": 1.863420009613037, + "learning_rate": 3.033488405987844e-06, + "loss": 5.2027, + "step": 8728 + }, + { + "epoch": 0.84175506268081, + "grad_norm": 1.8324657678604126, + "learning_rate": 3.029873347326251e-06, + "loss": 5.1757, + "step": 8729 + }, + { + "epoch": 0.8418514946962391, + "grad_norm": 1.8389853239059448, + "learning_rate": 3.026260305055842e-06, + "loss": 5.0536, + "step": 8730 + }, + { + "epoch": 0.8419479267116683, + "grad_norm": 2.0523788928985596, + "learning_rate": 3.022649279508219e-06, + "loss": 5.1439, + "step": 8731 + }, + { + "epoch": 0.8420443587270974, + "grad_norm": 1.5196501016616821, + "learning_rate": 3.019040271014792e-06, + "loss": 5.2223, + "step": 8732 + }, + { + "epoch": 0.8421407907425266, + "grad_norm": 1.6929364204406738, + "learning_rate": 3.015433279906796e-06, + "loss": 5.4598, + "step": 8733 + }, + { + "epoch": 0.8422372227579557, + "grad_norm": 1.7514232397079468, + "learning_rate": 3.011828306515277e-06, + "loss": 5.4955, + "step": 8734 + }, + { + "epoch": 0.8423336547733847, + "grad_norm": 1.5505858659744263, + "learning_rate": 3.00822535117109e-06, + "loss": 5.3961, + "step": 8735 + }, + { + "epoch": 0.8424300867888139, + "grad_norm": 1.819931983947754, + "learning_rate": 3.004624414204921e-06, + "loss": 5.4517, + "step": 8736 + }, + { + "epoch": 0.842526518804243, + "grad_norm": 1.3740835189819336, + "learning_rate": 3.0010254959472378e-06, + "loss": 5.3799, + "step": 8737 + }, + { + "epoch": 0.8426229508196721, + "grad_norm": 1.3988529443740845, + "learning_rate": 2.9974285967283715e-06, + "loss": 5.1648, + "step": 8738 + }, + { + "epoch": 0.8427193828351013, + "grad_norm": 1.658254623413086, + "learning_rate": 2.993833716878422e-06, + "loss": 5.4029, + "step": 8739 + }, + { + "epoch": 0.8428158148505304, + "grad_norm": 1.5389713048934937, + "learning_rate": 2.990240856727328e-06, + "loss": 5.4084, + "step": 8740 + }, + { + "epoch": 0.8429122468659594, + "grad_norm": 2.0551414489746094, + "learning_rate": 2.9866500166048362e-06, + "loss": 5.1492, + "step": 8741 + }, + { + "epoch": 0.8430086788813886, + "grad_norm": 1.4755884408950806, + "learning_rate": 2.98306119684052e-06, + "loss": 5.2131, + "step": 8742 + }, + { + "epoch": 0.8431051108968177, + "grad_norm": 1.793928623199463, + "learning_rate": 2.979474397763737e-06, + "loss": 5.3226, + "step": 8743 + }, + { + "epoch": 0.8432015429122469, + "grad_norm": 1.6696982383728027, + "learning_rate": 2.9758896197037034e-06, + "loss": 5.2176, + "step": 8744 + }, + { + "epoch": 0.843297974927676, + "grad_norm": 1.4749228954315186, + "learning_rate": 2.972306862989402e-06, + "loss": 5.2863, + "step": 8745 + }, + { + "epoch": 0.8433944069431051, + "grad_norm": 2.333369493484497, + "learning_rate": 2.968726127949675e-06, + "loss": 5.3427, + "step": 8746 + }, + { + "epoch": 0.8434908389585343, + "grad_norm": 2.4179954528808594, + "learning_rate": 2.9651474149131414e-06, + "loss": 5.5985, + "step": 8747 + }, + { + "epoch": 0.8435872709739634, + "grad_norm": 3.1339452266693115, + "learning_rate": 2.96157072420826e-06, + "loss": 5.6478, + "step": 8748 + }, + { + "epoch": 0.8436837029893924, + "grad_norm": 1.6780545711517334, + "learning_rate": 2.9579960561632923e-06, + "loss": 5.6525, + "step": 8749 + }, + { + "epoch": 0.8437801350048216, + "grad_norm": 1.4332596063613892, + "learning_rate": 2.954423411106316e-06, + "loss": 5.3993, + "step": 8750 + }, + { + "epoch": 0.8438765670202507, + "grad_norm": 2.2706139087677, + "learning_rate": 2.9508527893652242e-06, + "loss": 5.2504, + "step": 8751 + }, + { + "epoch": 0.8439729990356798, + "grad_norm": 1.8883894681930542, + "learning_rate": 2.947284191267727e-06, + "loss": 5.3057, + "step": 8752 + }, + { + "epoch": 0.844069431051109, + "grad_norm": 1.3833422660827637, + "learning_rate": 2.9437176171413505e-06, + "loss": 5.3083, + "step": 8753 + }, + { + "epoch": 0.8441658630665381, + "grad_norm": 1.59494149684906, + "learning_rate": 2.9401530673134114e-06, + "loss": 5.0939, + "step": 8754 + }, + { + "epoch": 0.8442622950819673, + "grad_norm": 1.735300064086914, + "learning_rate": 2.936590542111087e-06, + "loss": 5.2132, + "step": 8755 + }, + { + "epoch": 0.8443587270973963, + "grad_norm": 1.3717191219329834, + "learning_rate": 2.933030041861312e-06, + "loss": 5.2446, + "step": 8756 + }, + { + "epoch": 0.8444551591128254, + "grad_norm": 1.5546331405639648, + "learning_rate": 2.929471566890893e-06, + "loss": 5.181, + "step": 8757 + }, + { + "epoch": 0.8445515911282546, + "grad_norm": 1.2530041933059692, + "learning_rate": 2.925915117526401e-06, + "loss": 5.0998, + "step": 8758 + }, + { + "epoch": 0.8446480231436837, + "grad_norm": 1.5519239902496338, + "learning_rate": 2.9223606940942533e-06, + "loss": 5.1864, + "step": 8759 + }, + { + "epoch": 0.8447444551591128, + "grad_norm": 1.4355556964874268, + "learning_rate": 2.9188082969206663e-06, + "loss": 5.0638, + "step": 8760 + }, + { + "epoch": 0.844840887174542, + "grad_norm": 1.8417457342147827, + "learning_rate": 2.9152579263316853e-06, + "loss": 5.2944, + "step": 8761 + }, + { + "epoch": 0.8449373191899711, + "grad_norm": 1.5674550533294678, + "learning_rate": 2.9117095826531355e-06, + "loss": 5.2023, + "step": 8762 + }, + { + "epoch": 0.8450337512054001, + "grad_norm": 1.6819422245025635, + "learning_rate": 2.908163266210706e-06, + "loss": 5.4059, + "step": 8763 + }, + { + "epoch": 0.8451301832208293, + "grad_norm": 1.4573054313659668, + "learning_rate": 2.904618977329854e-06, + "loss": 5.5032, + "step": 8764 + }, + { + "epoch": 0.8452266152362584, + "grad_norm": 1.4711501598358154, + "learning_rate": 2.9010767163358794e-06, + "loss": 5.5159, + "step": 8765 + }, + { + "epoch": 0.8453230472516876, + "grad_norm": 1.4529951810836792, + "learning_rate": 2.897536483553884e-06, + "loss": 5.2995, + "step": 8766 + }, + { + "epoch": 0.8454194792671167, + "grad_norm": 1.2530328035354614, + "learning_rate": 2.893998279308788e-06, + "loss": 5.2237, + "step": 8767 + }, + { + "epoch": 0.8455159112825458, + "grad_norm": 2.4222252368927, + "learning_rate": 2.8904621039253204e-06, + "loss": 5.3679, + "step": 8768 + }, + { + "epoch": 0.845612343297975, + "grad_norm": 1.6316444873809814, + "learning_rate": 2.8869279577280277e-06, + "loss": 5.3977, + "step": 8769 + }, + { + "epoch": 0.845708775313404, + "grad_norm": 1.8801872730255127, + "learning_rate": 2.88339584104127e-06, + "loss": 5.3022, + "step": 8770 + }, + { + "epoch": 0.8458052073288331, + "grad_norm": 1.603678822517395, + "learning_rate": 2.879865754189223e-06, + "loss": 5.3139, + "step": 8771 + }, + { + "epoch": 0.8459016393442623, + "grad_norm": 2.3136630058288574, + "learning_rate": 2.876337697495876e-06, + "loss": 5.311, + "step": 8772 + }, + { + "epoch": 0.8459980713596914, + "grad_norm": 2.1701560020446777, + "learning_rate": 2.8728116712850193e-06, + "loss": 5.1931, + "step": 8773 + }, + { + "epoch": 0.8460945033751205, + "grad_norm": 1.446325659751892, + "learning_rate": 2.8692876758802723e-06, + "loss": 5.288, + "step": 8774 + }, + { + "epoch": 0.8461909353905497, + "grad_norm": 1.4927654266357422, + "learning_rate": 2.8657657116050674e-06, + "loss": 5.5513, + "step": 8775 + }, + { + "epoch": 0.8462873674059788, + "grad_norm": 1.488927960395813, + "learning_rate": 2.862245778782638e-06, + "loss": 5.2466, + "step": 8776 + }, + { + "epoch": 0.846383799421408, + "grad_norm": 1.2688755989074707, + "learning_rate": 2.8587278777360475e-06, + "loss": 5.3043, + "step": 8777 + }, + { + "epoch": 0.846480231436837, + "grad_norm": 1.3956745862960815, + "learning_rate": 2.8552120087881605e-06, + "loss": 5.456, + "step": 8778 + }, + { + "epoch": 0.8465766634522661, + "grad_norm": 1.6277028322219849, + "learning_rate": 2.8516981722616577e-06, + "loss": 5.2491, + "step": 8779 + }, + { + "epoch": 0.8466730954676953, + "grad_norm": 2.3531925678253174, + "learning_rate": 2.8481863684790443e-06, + "loss": 5.1936, + "step": 8780 + }, + { + "epoch": 0.8467695274831244, + "grad_norm": 1.4150259494781494, + "learning_rate": 2.8446765977626106e-06, + "loss": 5.1462, + "step": 8781 + }, + { + "epoch": 0.8468659594985535, + "grad_norm": 1.731268048286438, + "learning_rate": 2.8411688604344984e-06, + "loss": 5.3884, + "step": 8782 + }, + { + "epoch": 0.8469623915139827, + "grad_norm": 1.5245100259780884, + "learning_rate": 2.8376631568166305e-06, + "loss": 5.3558, + "step": 8783 + }, + { + "epoch": 0.8470588235294118, + "grad_norm": 1.8925566673278809, + "learning_rate": 2.8341594872307614e-06, + "loss": 4.9366, + "step": 8784 + }, + { + "epoch": 0.8471552555448408, + "grad_norm": 2.127530813217163, + "learning_rate": 2.8306578519984527e-06, + "loss": 5.4171, + "step": 8785 + }, + { + "epoch": 0.84725168756027, + "grad_norm": 1.8322093486785889, + "learning_rate": 2.8271582514410866e-06, + "loss": 5.0671, + "step": 8786 + }, + { + "epoch": 0.8473481195756991, + "grad_norm": 1.2754517793655396, + "learning_rate": 2.8236606858798336e-06, + "loss": 5.4802, + "step": 8787 + }, + { + "epoch": 0.8474445515911283, + "grad_norm": 1.4688324928283691, + "learning_rate": 2.8201651556357207e-06, + "loss": 5.0855, + "step": 8788 + }, + { + "epoch": 0.8475409836065574, + "grad_norm": 1.4285101890563965, + "learning_rate": 2.816671661029538e-06, + "loss": 5.4131, + "step": 8789 + }, + { + "epoch": 0.8476374156219865, + "grad_norm": 1.501191258430481, + "learning_rate": 2.8131802023819377e-06, + "loss": 5.453, + "step": 8790 + }, + { + "epoch": 0.8477338476374157, + "grad_norm": 1.4265378713607788, + "learning_rate": 2.8096907800133467e-06, + "loss": 5.483, + "step": 8791 + }, + { + "epoch": 0.8478302796528447, + "grad_norm": 1.3497527837753296, + "learning_rate": 2.8062033942440253e-06, + "loss": 5.2952, + "step": 8792 + }, + { + "epoch": 0.8479267116682738, + "grad_norm": 2.1750271320343018, + "learning_rate": 2.8027180453940397e-06, + "loss": 5.2694, + "step": 8793 + }, + { + "epoch": 0.848023143683703, + "grad_norm": 1.4540727138519287, + "learning_rate": 2.7992347337832708e-06, + "loss": 5.2352, + "step": 8794 + }, + { + "epoch": 0.8481195756991321, + "grad_norm": 1.9524061679840088, + "learning_rate": 2.7957534597314123e-06, + "loss": 5.1058, + "step": 8795 + }, + { + "epoch": 0.8482160077145612, + "grad_norm": 1.4741486310958862, + "learning_rate": 2.7922742235579725e-06, + "loss": 5.5127, + "step": 8796 + }, + { + "epoch": 0.8483124397299904, + "grad_norm": 1.917881965637207, + "learning_rate": 2.7887970255822767e-06, + "loss": 5.2622, + "step": 8797 + }, + { + "epoch": 0.8484088717454195, + "grad_norm": 1.5568385124206543, + "learning_rate": 2.785321866123444e-06, + "loss": 5.1102, + "step": 8798 + }, + { + "epoch": 0.8485053037608487, + "grad_norm": 1.5007655620574951, + "learning_rate": 2.781848745500437e-06, + "loss": 5.3257, + "step": 8799 + }, + { + "epoch": 0.8486017357762777, + "grad_norm": 1.6139057874679565, + "learning_rate": 2.7783776640319948e-06, + "loss": 5.4334, + "step": 8800 + }, + { + "epoch": 0.8486981677917068, + "grad_norm": 1.6485302448272705, + "learning_rate": 2.774908622036712e-06, + "loss": 4.9353, + "step": 8801 + }, + { + "epoch": 0.848794599807136, + "grad_norm": 1.2393304109573364, + "learning_rate": 2.7714416198329567e-06, + "loss": 5.2096, + "step": 8802 + }, + { + "epoch": 0.8488910318225651, + "grad_norm": 1.779579997062683, + "learning_rate": 2.76797665773893e-06, + "loss": 5.3262, + "step": 8803 + }, + { + "epoch": 0.8489874638379942, + "grad_norm": 1.7109520435333252, + "learning_rate": 2.7645137360726435e-06, + "loss": 5.2292, + "step": 8804 + }, + { + "epoch": 0.8490838958534234, + "grad_norm": 1.5204576253890991, + "learning_rate": 2.7610528551519216e-06, + "loss": 5.3282, + "step": 8805 + }, + { + "epoch": 0.8491803278688524, + "grad_norm": 1.585100531578064, + "learning_rate": 2.7575940152943876e-06, + "loss": 5.2924, + "step": 8806 + }, + { + "epoch": 0.8492767598842815, + "grad_norm": 1.589712381362915, + "learning_rate": 2.7541372168175095e-06, + "loss": 5.445, + "step": 8807 + }, + { + "epoch": 0.8493731918997107, + "grad_norm": 1.540223240852356, + "learning_rate": 2.7506824600385337e-06, + "loss": 5.4922, + "step": 8808 + }, + { + "epoch": 0.8494696239151398, + "grad_norm": 1.5865046977996826, + "learning_rate": 2.7472297452745348e-06, + "loss": 5.1105, + "step": 8809 + }, + { + "epoch": 0.849566055930569, + "grad_norm": 1.809287190437317, + "learning_rate": 2.7437790728424033e-06, + "loss": 5.1305, + "step": 8810 + }, + { + "epoch": 0.8496624879459981, + "grad_norm": 1.260455846786499, + "learning_rate": 2.7403304430588335e-06, + "loss": 5.175, + "step": 8811 + }, + { + "epoch": 0.8497589199614272, + "grad_norm": 1.4781566858291626, + "learning_rate": 2.736883856240341e-06, + "loss": 5.4402, + "step": 8812 + }, + { + "epoch": 0.8498553519768564, + "grad_norm": 1.5423510074615479, + "learning_rate": 2.7334393127032426e-06, + "loss": 5.3778, + "step": 8813 + }, + { + "epoch": 0.8499517839922854, + "grad_norm": 1.5003883838653564, + "learning_rate": 2.7299968127636828e-06, + "loss": 5.4733, + "step": 8814 + }, + { + "epoch": 0.8500482160077145, + "grad_norm": 1.520817518234253, + "learning_rate": 2.726556356737603e-06, + "loss": 5.293, + "step": 8815 + }, + { + "epoch": 0.8501446480231437, + "grad_norm": 1.6644784212112427, + "learning_rate": 2.723117944940773e-06, + "loss": 5.3407, + "step": 8816 + }, + { + "epoch": 0.8502410800385728, + "grad_norm": 1.3875441551208496, + "learning_rate": 2.7196815776887487e-06, + "loss": 5.4575, + "step": 8817 + }, + { + "epoch": 0.8503375120540019, + "grad_norm": 1.1152499914169312, + "learning_rate": 2.716247255296936e-06, + "loss": 5.4828, + "step": 8818 + }, + { + "epoch": 0.8504339440694311, + "grad_norm": 1.0399008989334106, + "learning_rate": 2.712814978080519e-06, + "loss": 5.3371, + "step": 8819 + }, + { + "epoch": 0.8505303760848602, + "grad_norm": 1.5712729692459106, + "learning_rate": 2.7093847463545123e-06, + "loss": 5.0639, + "step": 8820 + }, + { + "epoch": 0.8506268081002893, + "grad_norm": 1.8377894163131714, + "learning_rate": 2.7059565604337393e-06, + "loss": 5.3422, + "step": 8821 + }, + { + "epoch": 0.8507232401157184, + "grad_norm": 2.263434410095215, + "learning_rate": 2.7025304206328346e-06, + "loss": 5.2595, + "step": 8822 + }, + { + "epoch": 0.8508196721311475, + "grad_norm": 2.203892469406128, + "learning_rate": 2.6991063272662406e-06, + "loss": 5.2269, + "step": 8823 + }, + { + "epoch": 0.8509161041465767, + "grad_norm": 1.6120684146881104, + "learning_rate": 2.695684280648228e-06, + "loss": 5.4123, + "step": 8824 + }, + { + "epoch": 0.8510125361620058, + "grad_norm": 1.479636549949646, + "learning_rate": 2.6922642810928487e-06, + "loss": 5.4534, + "step": 8825 + }, + { + "epoch": 0.8511089681774349, + "grad_norm": 1.3676066398620605, + "learning_rate": 2.6888463289140078e-06, + "loss": 5.3057, + "step": 8826 + }, + { + "epoch": 0.8512054001928641, + "grad_norm": 1.498213291168213, + "learning_rate": 2.6854304244253837e-06, + "loss": 5.436, + "step": 8827 + }, + { + "epoch": 0.8513018322082931, + "grad_norm": 1.6480896472930908, + "learning_rate": 2.682016567940493e-06, + "loss": 5.4226, + "step": 8828 + }, + { + "epoch": 0.8513982642237222, + "grad_norm": 1.8650373220443726, + "learning_rate": 2.6786047597726517e-06, + "loss": 5.3603, + "step": 8829 + }, + { + "epoch": 0.8514946962391514, + "grad_norm": 1.2686853408813477, + "learning_rate": 2.675195000234995e-06, + "loss": 5.2181, + "step": 8830 + }, + { + "epoch": 0.8515911282545805, + "grad_norm": 1.2854549884796143, + "learning_rate": 2.671787289640457e-06, + "loss": 5.2016, + "step": 8831 + }, + { + "epoch": 0.8516875602700097, + "grad_norm": 1.1643025875091553, + "learning_rate": 2.668381628301808e-06, + "loss": 5.2701, + "step": 8832 + }, + { + "epoch": 0.8517839922854388, + "grad_norm": 1.311723232269287, + "learning_rate": 2.6649780165315998e-06, + "loss": 5.3858, + "step": 8833 + }, + { + "epoch": 0.8518804243008679, + "grad_norm": 1.4466655254364014, + "learning_rate": 2.6615764546422284e-06, + "loss": 5.2528, + "step": 8834 + }, + { + "epoch": 0.851976856316297, + "grad_norm": 1.4998397827148438, + "learning_rate": 2.6581769429458696e-06, + "loss": 5.2262, + "step": 8835 + }, + { + "epoch": 0.8520732883317261, + "grad_norm": 1.350635051727295, + "learning_rate": 2.6547794817545323e-06, + "loss": 5.342, + "step": 8836 + }, + { + "epoch": 0.8521697203471552, + "grad_norm": 1.4418987035751343, + "learning_rate": 2.6513840713800337e-06, + "loss": 5.4265, + "step": 8837 + }, + { + "epoch": 0.8522661523625844, + "grad_norm": 2.1133058071136475, + "learning_rate": 2.6479907121339965e-06, + "loss": 5.333, + "step": 8838 + }, + { + "epoch": 0.8523625843780135, + "grad_norm": 1.2529598474502563, + "learning_rate": 2.644599404327863e-06, + "loss": 5.3092, + "step": 8839 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 1.4392814636230469, + "learning_rate": 2.6412101482728795e-06, + "loss": 5.3683, + "step": 8840 + }, + { + "epoch": 0.8525554484088718, + "grad_norm": 1.2509559392929077, + "learning_rate": 2.637822944280116e-06, + "loss": 5.3663, + "step": 8841 + }, + { + "epoch": 0.8526518804243008, + "grad_norm": 1.5231459140777588, + "learning_rate": 2.634437792660427e-06, + "loss": 5.1871, + "step": 8842 + }, + { + "epoch": 0.85274831243973, + "grad_norm": 1.7123758792877197, + "learning_rate": 2.631054693724522e-06, + "loss": 5.2122, + "step": 8843 + }, + { + "epoch": 0.8528447444551591, + "grad_norm": 1.2960550785064697, + "learning_rate": 2.6276736477828745e-06, + "loss": 5.0302, + "step": 8844 + }, + { + "epoch": 0.8529411764705882, + "grad_norm": 1.3192814588546753, + "learning_rate": 2.624294655145812e-06, + "loss": 5.3231, + "step": 8845 + }, + { + "epoch": 0.8530376084860174, + "grad_norm": 1.3728018999099731, + "learning_rate": 2.6209177161234445e-06, + "loss": 5.4508, + "step": 8846 + }, + { + "epoch": 0.8531340405014465, + "grad_norm": 1.2792702913284302, + "learning_rate": 2.6175428310257015e-06, + "loss": 5.2627, + "step": 8847 + }, + { + "epoch": 0.8532304725168756, + "grad_norm": 1.778122067451477, + "learning_rate": 2.6141700001623304e-06, + "loss": 5.2105, + "step": 8848 + }, + { + "epoch": 0.8533269045323048, + "grad_norm": 1.400598406791687, + "learning_rate": 2.6107992238428913e-06, + "loss": 5.2322, + "step": 8849 + }, + { + "epoch": 0.8534233365477338, + "grad_norm": 2.440168619155884, + "learning_rate": 2.6074305023767316e-06, + "loss": 5.117, + "step": 8850 + }, + { + "epoch": 0.8535197685631629, + "grad_norm": 1.5291141271591187, + "learning_rate": 2.604063836073048e-06, + "loss": 5.3212, + "step": 8851 + }, + { + "epoch": 0.8536162005785921, + "grad_norm": 1.6913338899612427, + "learning_rate": 2.6006992252408157e-06, + "loss": 5.5065, + "step": 8852 + }, + { + "epoch": 0.8537126325940212, + "grad_norm": 1.4595667123794556, + "learning_rate": 2.597336670188841e-06, + "loss": 5.3476, + "step": 8853 + }, + { + "epoch": 0.8538090646094504, + "grad_norm": 1.826342225074768, + "learning_rate": 2.5939761712257344e-06, + "loss": 5.2699, + "step": 8854 + }, + { + "epoch": 0.8539054966248795, + "grad_norm": 1.5909428596496582, + "learning_rate": 2.5906177286599163e-06, + "loss": 5.2932, + "step": 8855 + }, + { + "epoch": 0.8540019286403085, + "grad_norm": 1.4280884265899658, + "learning_rate": 2.587261342799621e-06, + "loss": 5.3092, + "step": 8856 + }, + { + "epoch": 0.8540983606557377, + "grad_norm": 1.2526113986968994, + "learning_rate": 2.5839070139528937e-06, + "loss": 5.3124, + "step": 8857 + }, + { + "epoch": 0.8541947926711668, + "grad_norm": 1.847706913948059, + "learning_rate": 2.5805547424275905e-06, + "loss": 5.2325, + "step": 8858 + }, + { + "epoch": 0.8542912246865959, + "grad_norm": 1.7151378393173218, + "learning_rate": 2.5772045285313823e-06, + "loss": 5.1897, + "step": 8859 + }, + { + "epoch": 0.8543876567020251, + "grad_norm": 1.395297884941101, + "learning_rate": 2.573856372571748e-06, + "loss": 5.1141, + "step": 8860 + }, + { + "epoch": 0.8544840887174542, + "grad_norm": 1.5256364345550537, + "learning_rate": 2.570510274855961e-06, + "loss": 5.3341, + "step": 8861 + }, + { + "epoch": 0.8545805207328833, + "grad_norm": 2.2418036460876465, + "learning_rate": 2.5671662356911486e-06, + "loss": 5.2934, + "step": 8862 + }, + { + "epoch": 0.8546769527483125, + "grad_norm": 1.345988154411316, + "learning_rate": 2.5638242553842005e-06, + "loss": 5.2458, + "step": 8863 + }, + { + "epoch": 0.8547733847637415, + "grad_norm": 1.5381110906600952, + "learning_rate": 2.56048433424185e-06, + "loss": 5.2064, + "step": 8864 + }, + { + "epoch": 0.8548698167791707, + "grad_norm": 2.150954008102417, + "learning_rate": 2.557146472570629e-06, + "loss": 5.214, + "step": 8865 + }, + { + "epoch": 0.8549662487945998, + "grad_norm": 1.9028948545455933, + "learning_rate": 2.5538106706768818e-06, + "loss": 5.0764, + "step": 8866 + }, + { + "epoch": 0.8550626808100289, + "grad_norm": 2.843379020690918, + "learning_rate": 2.550476928866763e-06, + "loss": 4.7304, + "step": 8867 + }, + { + "epoch": 0.8551591128254581, + "grad_norm": 1.6937530040740967, + "learning_rate": 2.5471452474462475e-06, + "loss": 5.0064, + "step": 8868 + }, + { + "epoch": 0.8552555448408872, + "grad_norm": 1.5208648443222046, + "learning_rate": 2.5438156267210968e-06, + "loss": 5.4442, + "step": 8869 + }, + { + "epoch": 0.8553519768563163, + "grad_norm": 1.4626522064208984, + "learning_rate": 2.5404880669969156e-06, + "loss": 5.4041, + "step": 8870 + }, + { + "epoch": 0.8554484088717454, + "grad_norm": 1.8962445259094238, + "learning_rate": 2.5371625685790933e-06, + "loss": 5.1654, + "step": 8871 + }, + { + "epoch": 0.8555448408871745, + "grad_norm": 2.3589024543762207, + "learning_rate": 2.5338391317728443e-06, + "loss": 5.3195, + "step": 8872 + }, + { + "epoch": 0.8556412729026036, + "grad_norm": 1.8364065885543823, + "learning_rate": 2.5305177568831883e-06, + "loss": 5.3762, + "step": 8873 + }, + { + "epoch": 0.8557377049180328, + "grad_norm": 1.6120588779449463, + "learning_rate": 2.5271984442149566e-06, + "loss": 5.4443, + "step": 8874 + }, + { + "epoch": 0.8558341369334619, + "grad_norm": 1.4324393272399902, + "learning_rate": 2.5238811940727913e-06, + "loss": 5.2527, + "step": 8875 + }, + { + "epoch": 0.8559305689488911, + "grad_norm": 1.4597344398498535, + "learning_rate": 2.5205660067611465e-06, + "loss": 5.3769, + "step": 8876 + }, + { + "epoch": 0.8560270009643202, + "grad_norm": 1.7703617811203003, + "learning_rate": 2.5172528825842867e-06, + "loss": 5.3316, + "step": 8877 + }, + { + "epoch": 0.8561234329797492, + "grad_norm": 2.080476999282837, + "learning_rate": 2.5139418218462856e-06, + "loss": 5.2693, + "step": 8878 + }, + { + "epoch": 0.8562198649951784, + "grad_norm": 1.6852048635482788, + "learning_rate": 2.5106328248510307e-06, + "loss": 5.2578, + "step": 8879 + }, + { + "epoch": 0.8563162970106075, + "grad_norm": 2.0577216148376465, + "learning_rate": 2.507325891902215e-06, + "loss": 5.4555, + "step": 8880 + }, + { + "epoch": 0.8564127290260366, + "grad_norm": 1.6537138223648071, + "learning_rate": 2.5040210233033405e-06, + "loss": 5.3204, + "step": 8881 + }, + { + "epoch": 0.8565091610414658, + "grad_norm": 1.9542057514190674, + "learning_rate": 2.500718219357731e-06, + "loss": 5.1965, + "step": 8882 + }, + { + "epoch": 0.8566055930568949, + "grad_norm": 1.8888587951660156, + "learning_rate": 2.497417480368511e-06, + "loss": 5.5484, + "step": 8883 + }, + { + "epoch": 0.856702025072324, + "grad_norm": 1.7487002611160278, + "learning_rate": 2.4941188066386177e-06, + "loss": 5.3081, + "step": 8884 + }, + { + "epoch": 0.8567984570877532, + "grad_norm": 1.1857284307479858, + "learning_rate": 2.490822198470805e-06, + "loss": 5.3439, + "step": 8885 + }, + { + "epoch": 0.8568948891031822, + "grad_norm": 1.5622692108154297, + "learning_rate": 2.487527656167618e-06, + "loss": 5.2603, + "step": 8886 + }, + { + "epoch": 0.8569913211186114, + "grad_norm": 1.6270771026611328, + "learning_rate": 2.484235180031444e-06, + "loss": 5.0462, + "step": 8887 + }, + { + "epoch": 0.8570877531340405, + "grad_norm": 1.5311273336410522, + "learning_rate": 2.480944770364441e-06, + "loss": 5.4862, + "step": 8888 + }, + { + "epoch": 0.8571841851494696, + "grad_norm": 2.0781943798065186, + "learning_rate": 2.477656427468622e-06, + "loss": 5.1769, + "step": 8889 + }, + { + "epoch": 0.8572806171648988, + "grad_norm": 3.274467706680298, + "learning_rate": 2.474370151645769e-06, + "loss": 4.9636, + "step": 8890 + }, + { + "epoch": 0.8573770491803279, + "grad_norm": 2.9725348949432373, + "learning_rate": 2.4710859431975013e-06, + "loss": 4.9118, + "step": 8891 + }, + { + "epoch": 0.857473481195757, + "grad_norm": 2.4250385761260986, + "learning_rate": 2.4678038024252363e-06, + "loss": 4.9383, + "step": 8892 + }, + { + "epoch": 0.8575699132111861, + "grad_norm": 1.9679328203201294, + "learning_rate": 2.464523729630211e-06, + "loss": 5.2059, + "step": 8893 + }, + { + "epoch": 0.8576663452266152, + "grad_norm": 1.7848235368728638, + "learning_rate": 2.4612457251134507e-06, + "loss": 5.419, + "step": 8894 + }, + { + "epoch": 0.8577627772420443, + "grad_norm": 1.9798721075057983, + "learning_rate": 2.4579697891758263e-06, + "loss": 5.3964, + "step": 8895 + }, + { + "epoch": 0.8578592092574735, + "grad_norm": 1.5757559537887573, + "learning_rate": 2.454695922117986e-06, + "loss": 5.0617, + "step": 8896 + }, + { + "epoch": 0.8579556412729026, + "grad_norm": 1.776081919670105, + "learning_rate": 2.4514241242404034e-06, + "loss": 5.0594, + "step": 8897 + }, + { + "epoch": 0.8580520732883318, + "grad_norm": 2.7397584915161133, + "learning_rate": 2.4481543958433636e-06, + "loss": 4.9036, + "step": 8898 + }, + { + "epoch": 0.8581485053037609, + "grad_norm": 2.857830286026001, + "learning_rate": 2.4448867372269563e-06, + "loss": 4.894, + "step": 8899 + }, + { + "epoch": 0.8582449373191899, + "grad_norm": 2.819514513015747, + "learning_rate": 2.441621148691084e-06, + "loss": 4.8747, + "step": 8900 + }, + { + "epoch": 0.8583413693346191, + "grad_norm": 2.1223902702331543, + "learning_rate": 2.4383576305354566e-06, + "loss": 4.8736, + "step": 8901 + }, + { + "epoch": 0.8584378013500482, + "grad_norm": 1.6183650493621826, + "learning_rate": 2.4350961830595985e-06, + "loss": 5.0853, + "step": 8902 + }, + { + "epoch": 0.8585342333654773, + "grad_norm": 1.7012778520584106, + "learning_rate": 2.43183680656284e-06, + "loss": 5.259, + "step": 8903 + }, + { + "epoch": 0.8586306653809065, + "grad_norm": 1.8032054901123047, + "learning_rate": 2.428579501344325e-06, + "loss": 5.152, + "step": 8904 + }, + { + "epoch": 0.8587270973963356, + "grad_norm": 1.8052140474319458, + "learning_rate": 2.425324267702994e-06, + "loss": 5.3463, + "step": 8905 + }, + { + "epoch": 0.8588235294117647, + "grad_norm": 2.6469550132751465, + "learning_rate": 2.4220711059376263e-06, + "loss": 5.0875, + "step": 8906 + }, + { + "epoch": 0.8589199614271938, + "grad_norm": 2.6507925987243652, + "learning_rate": 2.418820016346779e-06, + "loss": 5.3111, + "step": 8907 + }, + { + "epoch": 0.8590163934426229, + "grad_norm": 3.4785845279693604, + "learning_rate": 2.415570999228839e-06, + "loss": 5.2518, + "step": 8908 + }, + { + "epoch": 0.8591128254580521, + "grad_norm": 2.520369052886963, + "learning_rate": 2.4123240548819955e-06, + "loss": 5.334, + "step": 8909 + }, + { + "epoch": 0.8592092574734812, + "grad_norm": 1.941839337348938, + "learning_rate": 2.4090791836042487e-06, + "loss": 5.3349, + "step": 8910 + }, + { + "epoch": 0.8593056894889103, + "grad_norm": 1.4896513223648071, + "learning_rate": 2.405836385693408e-06, + "loss": 5.3139, + "step": 8911 + }, + { + "epoch": 0.8594021215043395, + "grad_norm": 1.6655473709106445, + "learning_rate": 2.4025956614471016e-06, + "loss": 5.5632, + "step": 8912 + }, + { + "epoch": 0.8594985535197686, + "grad_norm": 1.5661609172821045, + "learning_rate": 2.3993570111627424e-06, + "loss": 5.468, + "step": 8913 + }, + { + "epoch": 0.8595949855351976, + "grad_norm": 1.7263245582580566, + "learning_rate": 2.3961204351375886e-06, + "loss": 5.2949, + "step": 8914 + }, + { + "epoch": 0.8596914175506268, + "grad_norm": 2.1017239093780518, + "learning_rate": 2.392885933668676e-06, + "loss": 5.1603, + "step": 8915 + }, + { + "epoch": 0.8597878495660559, + "grad_norm": 1.8557233810424805, + "learning_rate": 2.3896535070528665e-06, + "loss": 5.1476, + "step": 8916 + }, + { + "epoch": 0.859884281581485, + "grad_norm": 2.398555278778076, + "learning_rate": 2.386423155586831e-06, + "loss": 5.3612, + "step": 8917 + }, + { + "epoch": 0.8599807135969142, + "grad_norm": 3.307469606399536, + "learning_rate": 2.3831948795670467e-06, + "loss": 5.1031, + "step": 8918 + }, + { + "epoch": 0.8600771456123433, + "grad_norm": 2.324124813079834, + "learning_rate": 2.3799686792897984e-06, + "loss": 5.2316, + "step": 8919 + }, + { + "epoch": 0.8601735776277725, + "grad_norm": 1.817804217338562, + "learning_rate": 2.376744555051183e-06, + "loss": 5.4728, + "step": 8920 + }, + { + "epoch": 0.8602700096432015, + "grad_norm": 1.4219073057174683, + "learning_rate": 2.373522507147108e-06, + "loss": 5.1066, + "step": 8921 + }, + { + "epoch": 0.8603664416586306, + "grad_norm": 1.7867058515548706, + "learning_rate": 2.3703025358732892e-06, + "loss": 5.2308, + "step": 8922 + }, + { + "epoch": 0.8604628736740598, + "grad_norm": 1.733210802078247, + "learning_rate": 2.367084641525255e-06, + "loss": 5.3577, + "step": 8923 + }, + { + "epoch": 0.8605593056894889, + "grad_norm": 1.7138370275497437, + "learning_rate": 2.363868824398327e-06, + "loss": 5.1057, + "step": 8924 + }, + { + "epoch": 0.860655737704918, + "grad_norm": 1.5722928047180176, + "learning_rate": 2.3606550847876663e-06, + "loss": 5.4905, + "step": 8925 + }, + { + "epoch": 0.8607521697203472, + "grad_norm": 1.8267968893051147, + "learning_rate": 2.3574434229882145e-06, + "loss": 5.4892, + "step": 8926 + }, + { + "epoch": 0.8608486017357763, + "grad_norm": 1.479461908340454, + "learning_rate": 2.3542338392947364e-06, + "loss": 5.4507, + "step": 8927 + }, + { + "epoch": 0.8609450337512053, + "grad_norm": 1.1992725133895874, + "learning_rate": 2.3510263340018017e-06, + "loss": 5.3064, + "step": 8928 + }, + { + "epoch": 0.8610414657666345, + "grad_norm": 1.2448617219924927, + "learning_rate": 2.3478209074038025e-06, + "loss": 5.3862, + "step": 8929 + }, + { + "epoch": 0.8611378977820636, + "grad_norm": 1.817922592163086, + "learning_rate": 2.3446175597949068e-06, + "loss": 5.4332, + "step": 8930 + }, + { + "epoch": 0.8612343297974928, + "grad_norm": 1.5839325189590454, + "learning_rate": 2.3414162914691394e-06, + "loss": 5.2587, + "step": 8931 + }, + { + "epoch": 0.8613307618129219, + "grad_norm": 1.6948248147964478, + "learning_rate": 2.338217102720286e-06, + "loss": 5.2955, + "step": 8932 + }, + { + "epoch": 0.861427193828351, + "grad_norm": 1.796281099319458, + "learning_rate": 2.335019993841983e-06, + "loss": 4.9679, + "step": 8933 + }, + { + "epoch": 0.8615236258437802, + "grad_norm": 1.358445167541504, + "learning_rate": 2.3318249651276463e-06, + "loss": 5.1933, + "step": 8934 + }, + { + "epoch": 0.8616200578592093, + "grad_norm": 1.616562008857727, + "learning_rate": 2.3286320168705133e-06, + "loss": 5.4545, + "step": 8935 + }, + { + "epoch": 0.8617164898746383, + "grad_norm": 1.2025768756866455, + "learning_rate": 2.3254411493636303e-06, + "loss": 5.3997, + "step": 8936 + }, + { + "epoch": 0.8618129218900675, + "grad_norm": 1.7830098867416382, + "learning_rate": 2.3222523628998543e-06, + "loss": 5.4899, + "step": 8937 + }, + { + "epoch": 0.8619093539054966, + "grad_norm": 1.4358497858047485, + "learning_rate": 2.319065657771838e-06, + "loss": 5.4274, + "step": 8938 + }, + { + "epoch": 0.8620057859209257, + "grad_norm": 1.7484349012374878, + "learning_rate": 2.315881034272066e-06, + "loss": 5.3433, + "step": 8939 + }, + { + "epoch": 0.8621022179363549, + "grad_norm": 2.0413503646850586, + "learning_rate": 2.3126984926928106e-06, + "loss": 5.3851, + "step": 8940 + }, + { + "epoch": 0.862198649951784, + "grad_norm": 1.667717456817627, + "learning_rate": 2.3095180333261633e-06, + "loss": 5.3936, + "step": 8941 + }, + { + "epoch": 0.8622950819672132, + "grad_norm": 1.6569379568099976, + "learning_rate": 2.306339656464024e-06, + "loss": 5.5517, + "step": 8942 + }, + { + "epoch": 0.8623915139826422, + "grad_norm": 1.2126528024673462, + "learning_rate": 2.3031633623980975e-06, + "loss": 5.4285, + "step": 8943 + }, + { + "epoch": 0.8624879459980713, + "grad_norm": 1.718490719795227, + "learning_rate": 2.299989151419907e-06, + "loss": 5.6567, + "step": 8944 + }, + { + "epoch": 0.8625843780135005, + "grad_norm": 1.2423325777053833, + "learning_rate": 2.2968170238207688e-06, + "loss": 5.4132, + "step": 8945 + }, + { + "epoch": 0.8626808100289296, + "grad_norm": 1.2373385429382324, + "learning_rate": 2.2936469798918226e-06, + "loss": 5.4035, + "step": 8946 + }, + { + "epoch": 0.8627772420443587, + "grad_norm": 1.2927653789520264, + "learning_rate": 2.2904790199240076e-06, + "loss": 5.4831, + "step": 8947 + }, + { + "epoch": 0.8628736740597879, + "grad_norm": 1.5885530710220337, + "learning_rate": 2.2873131442080837e-06, + "loss": 5.4242, + "step": 8948 + }, + { + "epoch": 0.862970106075217, + "grad_norm": 2.016500949859619, + "learning_rate": 2.2841493530345954e-06, + "loss": 5.2972, + "step": 8949 + }, + { + "epoch": 0.863066538090646, + "grad_norm": 1.9777928590774536, + "learning_rate": 2.28098764669393e-06, + "loss": 5.4314, + "step": 8950 + }, + { + "epoch": 0.8631629701060752, + "grad_norm": 1.6486750841140747, + "learning_rate": 2.277828025476253e-06, + "loss": 5.4086, + "step": 8951 + }, + { + "epoch": 0.8632594021215043, + "grad_norm": 1.663986325263977, + "learning_rate": 2.2746704896715514e-06, + "loss": 5.3156, + "step": 8952 + }, + { + "epoch": 0.8633558341369335, + "grad_norm": 1.4531464576721191, + "learning_rate": 2.271515039569622e-06, + "loss": 5.3371, + "step": 8953 + }, + { + "epoch": 0.8634522661523626, + "grad_norm": 1.444671869277954, + "learning_rate": 2.268361675460068e-06, + "loss": 5.2161, + "step": 8954 + }, + { + "epoch": 0.8635486981677917, + "grad_norm": 1.3160561323165894, + "learning_rate": 2.2652103976323e-06, + "loss": 5.2707, + "step": 8955 + }, + { + "epoch": 0.8636451301832209, + "grad_norm": 1.3164089918136597, + "learning_rate": 2.262061206375546e-06, + "loss": 5.342, + "step": 8956 + }, + { + "epoch": 0.86374156219865, + "grad_norm": 1.092313528060913, + "learning_rate": 2.2589141019788206e-06, + "loss": 5.3208, + "step": 8957 + }, + { + "epoch": 0.863837994214079, + "grad_norm": 2.1576802730560303, + "learning_rate": 2.255769084730977e-06, + "loss": 5.0025, + "step": 8958 + }, + { + "epoch": 0.8639344262295082, + "grad_norm": 1.5527323484420776, + "learning_rate": 2.2526261549206478e-06, + "loss": 5.0887, + "step": 8959 + }, + { + "epoch": 0.8640308582449373, + "grad_norm": 1.661899447441101, + "learning_rate": 2.249485312836294e-06, + "loss": 4.9556, + "step": 8960 + }, + { + "epoch": 0.8641272902603664, + "grad_norm": 2.0985050201416016, + "learning_rate": 2.246346558766177e-06, + "loss": 5.0251, + "step": 8961 + }, + { + "epoch": 0.8642237222757956, + "grad_norm": 1.6163936853408813, + "learning_rate": 2.243209892998366e-06, + "loss": 5.0851, + "step": 8962 + }, + { + "epoch": 0.8643201542912247, + "grad_norm": 2.953946113586426, + "learning_rate": 2.2400753158207443e-06, + "loss": 4.8054, + "step": 8963 + }, + { + "epoch": 0.8644165863066539, + "grad_norm": 1.6821002960205078, + "learning_rate": 2.2369428275209966e-06, + "loss": 4.702, + "step": 8964 + }, + { + "epoch": 0.8645130183220829, + "grad_norm": 1.980743646621704, + "learning_rate": 2.233812428386617e-06, + "loss": 5.0533, + "step": 8965 + }, + { + "epoch": 0.864609450337512, + "grad_norm": 1.9022349119186401, + "learning_rate": 2.230684118704915e-06, + "loss": 5.1002, + "step": 8966 + }, + { + "epoch": 0.8647058823529412, + "grad_norm": 2.6753735542297363, + "learning_rate": 2.2275578987630026e-06, + "loss": 5.3855, + "step": 8967 + }, + { + "epoch": 0.8648023143683703, + "grad_norm": 2.470048189163208, + "learning_rate": 2.224433768847789e-06, + "loss": 5.4463, + "step": 8968 + }, + { + "epoch": 0.8648987463837994, + "grad_norm": 2.0778346061706543, + "learning_rate": 2.2213117292460227e-06, + "loss": 5.1593, + "step": 8969 + }, + { + "epoch": 0.8649951783992286, + "grad_norm": 2.2732954025268555, + "learning_rate": 2.2181917802442246e-06, + "loss": 5.1424, + "step": 8970 + }, + { + "epoch": 0.8650916104146577, + "grad_norm": 2.308687210083008, + "learning_rate": 2.2150739221287437e-06, + "loss": 5.1418, + "step": 8971 + }, + { + "epoch": 0.8651880424300867, + "grad_norm": 1.9184025526046753, + "learning_rate": 2.2119581551857366e-06, + "loss": 4.9234, + "step": 8972 + }, + { + "epoch": 0.8652844744455159, + "grad_norm": 2.313812017440796, + "learning_rate": 2.2088444797011663e-06, + "loss": 5.6128, + "step": 8973 + }, + { + "epoch": 0.865380906460945, + "grad_norm": 1.4543827772140503, + "learning_rate": 2.205732895960788e-06, + "loss": 5.1762, + "step": 8974 + }, + { + "epoch": 0.8654773384763742, + "grad_norm": 1.9742202758789062, + "learning_rate": 2.202623404250201e-06, + "loss": 4.7624, + "step": 8975 + }, + { + "epoch": 0.8655737704918033, + "grad_norm": 1.6413131952285767, + "learning_rate": 2.199516004854768e-06, + "loss": 4.8845, + "step": 8976 + }, + { + "epoch": 0.8656702025072324, + "grad_norm": 2.0021092891693115, + "learning_rate": 2.1964106980597034e-06, + "loss": 4.7321, + "step": 8977 + }, + { + "epoch": 0.8657666345226616, + "grad_norm": 1.2720345258712769, + "learning_rate": 2.1933074841499925e-06, + "loss": 5.3699, + "step": 8978 + }, + { + "epoch": 0.8658630665380906, + "grad_norm": 1.2791117429733276, + "learning_rate": 2.190206363410449e-06, + "loss": 5.2247, + "step": 8979 + }, + { + "epoch": 0.8659594985535197, + "grad_norm": 1.8056052923202515, + "learning_rate": 2.1871073361256935e-06, + "loss": 4.8768, + "step": 8980 + }, + { + "epoch": 0.8660559305689489, + "grad_norm": 1.7680760622024536, + "learning_rate": 2.1840104025801473e-06, + "loss": 5.2594, + "step": 8981 + }, + { + "epoch": 0.866152362584378, + "grad_norm": 1.584013819694519, + "learning_rate": 2.1809155630580446e-06, + "loss": 5.1316, + "step": 8982 + }, + { + "epoch": 0.8662487945998071, + "grad_norm": 1.464479684829712, + "learning_rate": 2.1778228178434278e-06, + "loss": 5.0033, + "step": 8983 + }, + { + "epoch": 0.8663452266152363, + "grad_norm": 1.646968126296997, + "learning_rate": 2.1747321672201447e-06, + "loss": 5.3559, + "step": 8984 + }, + { + "epoch": 0.8664416586306654, + "grad_norm": 1.5744132995605469, + "learning_rate": 2.171643611471849e-06, + "loss": 5.2633, + "step": 8985 + }, + { + "epoch": 0.8665380906460945, + "grad_norm": 1.3985291719436646, + "learning_rate": 2.1685571508820028e-06, + "loss": 4.9988, + "step": 8986 + }, + { + "epoch": 0.8666345226615236, + "grad_norm": 1.531923532485962, + "learning_rate": 2.1654727857338826e-06, + "loss": 5.1871, + "step": 8987 + }, + { + "epoch": 0.8667309546769527, + "grad_norm": 1.4512090682983398, + "learning_rate": 2.1623905163105614e-06, + "loss": 5.1465, + "step": 8988 + }, + { + "epoch": 0.8668273866923819, + "grad_norm": 1.312412142753601, + "learning_rate": 2.1593103428949325e-06, + "loss": 4.9227, + "step": 8989 + }, + { + "epoch": 0.866923818707811, + "grad_norm": 1.4142298698425293, + "learning_rate": 2.1562322657696894e-06, + "loss": 4.9271, + "step": 8990 + }, + { + "epoch": 0.8670202507232401, + "grad_norm": 1.598565697669983, + "learning_rate": 2.153156285217331e-06, + "loss": 5.1542, + "step": 8991 + }, + { + "epoch": 0.8671166827386693, + "grad_norm": 1.4952389001846313, + "learning_rate": 2.150082401520173e-06, + "loss": 4.7889, + "step": 8992 + }, + { + "epoch": 0.8672131147540983, + "grad_norm": 2.6135082244873047, + "learning_rate": 2.1470106149603208e-06, + "loss": 4.7899, + "step": 8993 + }, + { + "epoch": 0.8673095467695274, + "grad_norm": 1.1449005603790283, + "learning_rate": 2.1439409258197153e-06, + "loss": 5.0207, + "step": 8994 + }, + { + "epoch": 0.8674059787849566, + "grad_norm": 1.937544822692871, + "learning_rate": 2.140873334380078e-06, + "loss": 5.0058, + "step": 8995 + }, + { + "epoch": 0.8675024108003857, + "grad_norm": 1.4265497922897339, + "learning_rate": 2.137807840922948e-06, + "loss": 4.8232, + "step": 8996 + }, + { + "epoch": 0.8675988428158149, + "grad_norm": 1.7079081535339355, + "learning_rate": 2.1347444457296776e-06, + "loss": 4.6761, + "step": 8997 + }, + { + "epoch": 0.867695274831244, + "grad_norm": 1.5631568431854248, + "learning_rate": 2.1316831490814177e-06, + "loss": 4.8442, + "step": 8998 + }, + { + "epoch": 0.8677917068466731, + "grad_norm": 1.5479756593704224, + "learning_rate": 2.1286239512591345e-06, + "loss": 4.9284, + "step": 8999 + }, + { + "epoch": 0.8678881388621023, + "grad_norm": 1.201627254486084, + "learning_rate": 2.125566852543598e-06, + "loss": 5.0127, + "step": 9000 + }, + { + "epoch": 0.8679845708775313, + "grad_norm": 1.298079013824463, + "learning_rate": 2.1225118532153726e-06, + "loss": 4.9617, + "step": 9001 + }, + { + "epoch": 0.8680810028929604, + "grad_norm": 1.7701750993728638, + "learning_rate": 2.1194589535548594e-06, + "loss": 4.8924, + "step": 9002 + }, + { + "epoch": 0.8681774349083896, + "grad_norm": 1.5674797296524048, + "learning_rate": 2.116408153842239e-06, + "loss": 4.9691, + "step": 9003 + }, + { + "epoch": 0.8682738669238187, + "grad_norm": 1.1749649047851562, + "learning_rate": 2.1133594543575136e-06, + "loss": 5.0949, + "step": 9004 + }, + { + "epoch": 0.8683702989392478, + "grad_norm": 1.215448260307312, + "learning_rate": 2.110312855380489e-06, + "loss": 5.2334, + "step": 9005 + }, + { + "epoch": 0.868466730954677, + "grad_norm": 1.5165867805480957, + "learning_rate": 2.107268357190775e-06, + "loss": 4.9507, + "step": 9006 + }, + { + "epoch": 0.868563162970106, + "grad_norm": 1.5362794399261475, + "learning_rate": 2.104225960067799e-06, + "loss": 4.9769, + "step": 9007 + }, + { + "epoch": 0.8686595949855352, + "grad_norm": 1.9548985958099365, + "learning_rate": 2.101185664290781e-06, + "loss": 4.9451, + "step": 9008 + }, + { + "epoch": 0.8687560270009643, + "grad_norm": 1.1427680253982544, + "learning_rate": 2.0981474701387617e-06, + "loss": 5.0011, + "step": 9009 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 1.158821940422058, + "learning_rate": 2.095111377890577e-06, + "loss": 5.0598, + "step": 9010 + }, + { + "epoch": 0.8689488910318226, + "grad_norm": 2.47770094871521, + "learning_rate": 2.092077387824884e-06, + "loss": 4.7233, + "step": 9011 + }, + { + "epoch": 0.8690453230472517, + "grad_norm": 1.392436146736145, + "learning_rate": 2.089045500220127e-06, + "loss": 4.7625, + "step": 9012 + }, + { + "epoch": 0.8691417550626808, + "grad_norm": 1.9775599241256714, + "learning_rate": 2.0860157153545807e-06, + "loss": 5.3848, + "step": 9013 + }, + { + "epoch": 0.86923818707811, + "grad_norm": 1.2611746788024902, + "learning_rate": 2.082988033506306e-06, + "loss": 5.1213, + "step": 9014 + }, + { + "epoch": 0.869334619093539, + "grad_norm": 1.2889389991760254, + "learning_rate": 2.0799624549531826e-06, + "loss": 4.7742, + "step": 9015 + }, + { + "epoch": 0.8694310511089681, + "grad_norm": 1.4459049701690674, + "learning_rate": 2.076938979972898e-06, + "loss": 4.8876, + "step": 9016 + }, + { + "epoch": 0.8695274831243973, + "grad_norm": 1.4970827102661133, + "learning_rate": 2.073917608842943e-06, + "loss": 4.9227, + "step": 9017 + }, + { + "epoch": 0.8696239151398264, + "grad_norm": 1.4928706884384155, + "learning_rate": 2.070898341840602e-06, + "loss": 4.7655, + "step": 9018 + }, + { + "epoch": 0.8697203471552556, + "grad_norm": 1.43646240234375, + "learning_rate": 2.0678811792430004e-06, + "loss": 4.5062, + "step": 9019 + }, + { + "epoch": 0.8698167791706847, + "grad_norm": 1.334377408027649, + "learning_rate": 2.0648661213270303e-06, + "loss": 4.7298, + "step": 9020 + }, + { + "epoch": 0.8699132111861138, + "grad_norm": 1.415796160697937, + "learning_rate": 2.0618531683694292e-06, + "loss": 4.9322, + "step": 9021 + }, + { + "epoch": 0.870009643201543, + "grad_norm": 1.6182160377502441, + "learning_rate": 2.058842320646706e-06, + "loss": 5.0847, + "step": 9022 + }, + { + "epoch": 0.870106075216972, + "grad_norm": 1.4269812107086182, + "learning_rate": 2.0558335784351982e-06, + "loss": 4.9231, + "step": 9023 + }, + { + "epoch": 0.8702025072324011, + "grad_norm": 1.8621985912322998, + "learning_rate": 2.0528269420110463e-06, + "loss": 5.2304, + "step": 9024 + }, + { + "epoch": 0.8702989392478303, + "grad_norm": 1.5570560693740845, + "learning_rate": 2.049822411650193e-06, + "loss": 5.2014, + "step": 9025 + }, + { + "epoch": 0.8703953712632594, + "grad_norm": 1.4386082887649536, + "learning_rate": 2.046819987628393e-06, + "loss": 4.8949, + "step": 9026 + }, + { + "epoch": 0.8704918032786885, + "grad_norm": 1.6708887815475464, + "learning_rate": 2.0438196702212033e-06, + "loss": 5.2695, + "step": 9027 + }, + { + "epoch": 0.8705882352941177, + "grad_norm": 1.3958178758621216, + "learning_rate": 2.040821459703993e-06, + "loss": 5.4044, + "step": 9028 + }, + { + "epoch": 0.8706846673095467, + "grad_norm": 1.9869838953018188, + "learning_rate": 2.0378253563519247e-06, + "loss": 5.381, + "step": 9029 + }, + { + "epoch": 0.8707810993249759, + "grad_norm": 1.598752737045288, + "learning_rate": 2.03483136043999e-06, + "loss": 5.1376, + "step": 9030 + }, + { + "epoch": 0.870877531340405, + "grad_norm": 1.8221887350082397, + "learning_rate": 2.031839472242958e-06, + "loss": 4.6903, + "step": 9031 + }, + { + "epoch": 0.8709739633558341, + "grad_norm": 1.7239811420440674, + "learning_rate": 2.028849692035442e-06, + "loss": 5.1144, + "step": 9032 + }, + { + "epoch": 0.8710703953712633, + "grad_norm": 1.378615140914917, + "learning_rate": 2.0258620200918203e-06, + "loss": 5.1514, + "step": 9033 + }, + { + "epoch": 0.8711668273866924, + "grad_norm": 1.574353814125061, + "learning_rate": 2.0228764566863096e-06, + "loss": 4.9786, + "step": 9034 + }, + { + "epoch": 0.8712632594021215, + "grad_norm": 1.297423243522644, + "learning_rate": 2.019893002092915e-06, + "loss": 4.9377, + "step": 9035 + }, + { + "epoch": 0.8713596914175507, + "grad_norm": 1.676241397857666, + "learning_rate": 2.01691165658546e-06, + "loss": 5.0377, + "step": 9036 + }, + { + "epoch": 0.8714561234329797, + "grad_norm": 2.4597113132476807, + "learning_rate": 2.0139324204375588e-06, + "loss": 4.9966, + "step": 9037 + }, + { + "epoch": 0.8715525554484088, + "grad_norm": 1.4877861738204956, + "learning_rate": 2.010955293922659e-06, + "loss": 5.0244, + "step": 9038 + }, + { + "epoch": 0.871648987463838, + "grad_norm": 1.4290616512298584, + "learning_rate": 2.007980277313981e-06, + "loss": 5.3221, + "step": 9039 + }, + { + "epoch": 0.8717454194792671, + "grad_norm": 1.2618407011032104, + "learning_rate": 2.0050073708845736e-06, + "loss": 4.9303, + "step": 9040 + }, + { + "epoch": 0.8718418514946963, + "grad_norm": 1.274341344833374, + "learning_rate": 2.00203657490729e-06, + "loss": 5.1845, + "step": 9041 + }, + { + "epoch": 0.8719382835101254, + "grad_norm": 1.5069444179534912, + "learning_rate": 1.9990678896547822e-06, + "loss": 4.9286, + "step": 9042 + }, + { + "epoch": 0.8720347155255544, + "grad_norm": 2.3564820289611816, + "learning_rate": 1.996101315399515e-06, + "loss": 4.9755, + "step": 9043 + }, + { + "epoch": 0.8721311475409836, + "grad_norm": 2.4069337844848633, + "learning_rate": 1.9931368524137573e-06, + "loss": 4.8736, + "step": 9044 + }, + { + "epoch": 0.8722275795564127, + "grad_norm": 1.8507570028305054, + "learning_rate": 1.9901745009695773e-06, + "loss": 4.8523, + "step": 9045 + }, + { + "epoch": 0.8723240115718418, + "grad_norm": 1.660759687423706, + "learning_rate": 1.9872142613388684e-06, + "loss": 4.9004, + "step": 9046 + }, + { + "epoch": 0.872420443587271, + "grad_norm": 2.0530951023101807, + "learning_rate": 1.984256133793305e-06, + "loss": 5.0583, + "step": 9047 + }, + { + "epoch": 0.8725168756027001, + "grad_norm": 1.788384199142456, + "learning_rate": 1.9813001186043874e-06, + "loss": 4.7322, + "step": 9048 + }, + { + "epoch": 0.8726133076181292, + "grad_norm": 1.6417851448059082, + "learning_rate": 1.978346216043414e-06, + "loss": 4.7763, + "step": 9049 + }, + { + "epoch": 0.8727097396335584, + "grad_norm": 1.5798804759979248, + "learning_rate": 1.9753944263814885e-06, + "loss": 4.7215, + "step": 9050 + }, + { + "epoch": 0.8728061716489874, + "grad_norm": 1.9188660383224487, + "learning_rate": 1.972444749889524e-06, + "loss": 5.0782, + "step": 9051 + }, + { + "epoch": 0.8729026036644166, + "grad_norm": 1.4638385772705078, + "learning_rate": 1.969497186838237e-06, + "loss": 4.9935, + "step": 9052 + }, + { + "epoch": 0.8729990356798457, + "grad_norm": 1.81497323513031, + "learning_rate": 1.966551737498154e-06, + "loss": 4.8609, + "step": 9053 + }, + { + "epoch": 0.8730954676952748, + "grad_norm": 1.8591644763946533, + "learning_rate": 1.9636084021396024e-06, + "loss": 5.1341, + "step": 9054 + }, + { + "epoch": 0.873191899710704, + "grad_norm": 1.7914597988128662, + "learning_rate": 1.9606671810327243e-06, + "loss": 4.9315, + "step": 9055 + }, + { + "epoch": 0.8732883317261331, + "grad_norm": 1.470792293548584, + "learning_rate": 1.957728074447443e-06, + "loss": 4.9755, + "step": 9056 + }, + { + "epoch": 0.8733847637415622, + "grad_norm": 1.3992798328399658, + "learning_rate": 1.9547910826535316e-06, + "loss": 4.8957, + "step": 9057 + }, + { + "epoch": 0.8734811957569913, + "grad_norm": 1.9718914031982422, + "learning_rate": 1.9518562059205242e-06, + "loss": 5.0346, + "step": 9058 + }, + { + "epoch": 0.8735776277724204, + "grad_norm": 1.628426432609558, + "learning_rate": 1.948923444517786e-06, + "loss": 5.1628, + "step": 9059 + }, + { + "epoch": 0.8736740597878495, + "grad_norm": 1.5583940744400024, + "learning_rate": 1.945992798714483e-06, + "loss": 5.2376, + "step": 9060 + }, + { + "epoch": 0.8737704918032787, + "grad_norm": 1.3942890167236328, + "learning_rate": 1.943064268779593e-06, + "loss": 5.2489, + "step": 9061 + }, + { + "epoch": 0.8738669238187078, + "grad_norm": 1.4801989793777466, + "learning_rate": 1.9401378549818744e-06, + "loss": 4.879, + "step": 9062 + }, + { + "epoch": 0.873963355834137, + "grad_norm": 1.3287039995193481, + "learning_rate": 1.9372135575899316e-06, + "loss": 5.091, + "step": 9063 + }, + { + "epoch": 0.8740597878495661, + "grad_norm": 1.8011207580566406, + "learning_rate": 1.9342913768721326e-06, + "loss": 4.7502, + "step": 9064 + }, + { + "epoch": 0.8741562198649951, + "grad_norm": 1.6116900444030762, + "learning_rate": 1.9313713130966916e-06, + "loss": 5.1971, + "step": 9065 + }, + { + "epoch": 0.8742526518804243, + "grad_norm": 1.8434886932373047, + "learning_rate": 1.9284533665315935e-06, + "loss": 4.9595, + "step": 9066 + }, + { + "epoch": 0.8743490838958534, + "grad_norm": 1.5633876323699951, + "learning_rate": 1.9255375374446492e-06, + "loss": 5.1234, + "step": 9067 + }, + { + "epoch": 0.8744455159112825, + "grad_norm": 2.215188980102539, + "learning_rate": 1.92262382610347e-06, + "loss": 5.1959, + "step": 9068 + }, + { + "epoch": 0.8745419479267117, + "grad_norm": 1.489586353302002, + "learning_rate": 1.9197122327754726e-06, + "loss": 4.8933, + "step": 9069 + }, + { + "epoch": 0.8746383799421408, + "grad_norm": 1.4991575479507446, + "learning_rate": 1.9168027577278784e-06, + "loss": 5.1133, + "step": 9070 + }, + { + "epoch": 0.8747348119575699, + "grad_norm": 1.5347263813018799, + "learning_rate": 1.9138954012277165e-06, + "loss": 4.9236, + "step": 9071 + }, + { + "epoch": 0.874831243972999, + "grad_norm": 1.348811149597168, + "learning_rate": 1.910990163541826e-06, + "loss": 5.0719, + "step": 9072 + }, + { + "epoch": 0.8749276759884281, + "grad_norm": 1.2400962114334106, + "learning_rate": 1.9080870449368295e-06, + "loss": 5.1042, + "step": 9073 + }, + { + "epoch": 0.8750241080038573, + "grad_norm": 1.9861935377120972, + "learning_rate": 1.9051860456791943e-06, + "loss": 4.6638, + "step": 9074 + }, + { + "epoch": 0.8751205400192864, + "grad_norm": 1.874833583831787, + "learning_rate": 1.9022871660351466e-06, + "loss": 4.7375, + "step": 9075 + }, + { + "epoch": 0.8752169720347155, + "grad_norm": 1.4495443105697632, + "learning_rate": 1.8993904062707652e-06, + "loss": 4.9727, + "step": 9076 + }, + { + "epoch": 0.8753134040501447, + "grad_norm": 1.6052286624908447, + "learning_rate": 1.896495766651893e-06, + "loss": 5.4239, + "step": 9077 + }, + { + "epoch": 0.8754098360655738, + "grad_norm": 1.618124008178711, + "learning_rate": 1.893603247444206e-06, + "loss": 5.1333, + "step": 9078 + }, + { + "epoch": 0.8755062680810028, + "grad_norm": 1.4103294610977173, + "learning_rate": 1.890712848913173e-06, + "loss": 5.1679, + "step": 9079 + }, + { + "epoch": 0.875602700096432, + "grad_norm": 1.5201468467712402, + "learning_rate": 1.8878245713240761e-06, + "loss": 4.9314, + "step": 9080 + }, + { + "epoch": 0.8756991321118611, + "grad_norm": 1.4240190982818604, + "learning_rate": 1.8849384149419864e-06, + "loss": 4.9673, + "step": 9081 + }, + { + "epoch": 0.8757955641272903, + "grad_norm": 1.456866979598999, + "learning_rate": 1.8820543800318057e-06, + "loss": 4.8622, + "step": 9082 + }, + { + "epoch": 0.8758919961427194, + "grad_norm": 1.61824631690979, + "learning_rate": 1.8791724668582117e-06, + "loss": 4.8941, + "step": 9083 + }, + { + "epoch": 0.8759884281581485, + "grad_norm": 1.3925046920776367, + "learning_rate": 1.8762926756857201e-06, + "loss": 4.8572, + "step": 9084 + }, + { + "epoch": 0.8760848601735777, + "grad_norm": 1.7208852767944336, + "learning_rate": 1.8734150067786227e-06, + "loss": 5.0589, + "step": 9085 + }, + { + "epoch": 0.8761812921890068, + "grad_norm": 1.0893921852111816, + "learning_rate": 1.870539460401033e-06, + "loss": 4.939, + "step": 9086 + }, + { + "epoch": 0.8762777242044358, + "grad_norm": 1.3246512413024902, + "learning_rate": 1.867666036816862e-06, + "loss": 5.2056, + "step": 9087 + }, + { + "epoch": 0.876374156219865, + "grad_norm": 1.4325480461120605, + "learning_rate": 1.8647947362898321e-06, + "loss": 4.9979, + "step": 9088 + }, + { + "epoch": 0.8764705882352941, + "grad_norm": 1.2220444679260254, + "learning_rate": 1.8619255590834634e-06, + "loss": 5.183, + "step": 9089 + }, + { + "epoch": 0.8765670202507232, + "grad_norm": 1.5137678384780884, + "learning_rate": 1.859058505461095e-06, + "loss": 4.9192, + "step": 9090 + }, + { + "epoch": 0.8766634522661524, + "grad_norm": 3.06164288520813, + "learning_rate": 1.8561935756858523e-06, + "loss": 4.7076, + "step": 9091 + }, + { + "epoch": 0.8767598842815815, + "grad_norm": 2.0815436840057373, + "learning_rate": 1.8533307700206754e-06, + "loss": 4.8083, + "step": 9092 + }, + { + "epoch": 0.8768563162970107, + "grad_norm": 1.5789579153060913, + "learning_rate": 1.8504700887283122e-06, + "loss": 4.9283, + "step": 9093 + }, + { + "epoch": 0.8769527483124397, + "grad_norm": 1.4344104528427124, + "learning_rate": 1.8476115320713138e-06, + "loss": 4.8328, + "step": 9094 + }, + { + "epoch": 0.8770491803278688, + "grad_norm": 2.0195717811584473, + "learning_rate": 1.8447551003120318e-06, + "loss": 5.0838, + "step": 9095 + }, + { + "epoch": 0.877145612343298, + "grad_norm": 1.4138697385787964, + "learning_rate": 1.8419007937126255e-06, + "loss": 5.4468, + "step": 9096 + }, + { + "epoch": 0.8772420443587271, + "grad_norm": 1.6025781631469727, + "learning_rate": 1.8390486125350631e-06, + "loss": 5.4928, + "step": 9097 + }, + { + "epoch": 0.8773384763741562, + "grad_norm": 1.4320337772369385, + "learning_rate": 1.8361985570411133e-06, + "loss": 5.4121, + "step": 9098 + }, + { + "epoch": 0.8774349083895854, + "grad_norm": 1.2222802639007568, + "learning_rate": 1.8333506274923528e-06, + "loss": 5.4745, + "step": 9099 + }, + { + "epoch": 0.8775313404050145, + "grad_norm": 1.2560739517211914, + "learning_rate": 1.8305048241501504e-06, + "loss": 5.3502, + "step": 9100 + }, + { + "epoch": 0.8776277724204435, + "grad_norm": 1.274191975593567, + "learning_rate": 1.8276611472757055e-06, + "loss": 5.4314, + "step": 9101 + }, + { + "epoch": 0.8777242044358727, + "grad_norm": 1.5207003355026245, + "learning_rate": 1.8248195971299953e-06, + "loss": 5.0734, + "step": 9102 + }, + { + "epoch": 0.8778206364513018, + "grad_norm": 1.3455429077148438, + "learning_rate": 1.8219801739738197e-06, + "loss": 5.3659, + "step": 9103 + }, + { + "epoch": 0.877917068466731, + "grad_norm": 1.2442554235458374, + "learning_rate": 1.8191428780677783e-06, + "loss": 5.2416, + "step": 9104 + }, + { + "epoch": 0.8780135004821601, + "grad_norm": 1.6384869813919067, + "learning_rate": 1.8163077096722687e-06, + "loss": 5.2011, + "step": 9105 + }, + { + "epoch": 0.8781099324975892, + "grad_norm": 2.1124353408813477, + "learning_rate": 1.8134746690475046e-06, + "loss": 5.0498, + "step": 9106 + }, + { + "epoch": 0.8782063645130184, + "grad_norm": 1.278843879699707, + "learning_rate": 1.8106437564535033e-06, + "loss": 5.1836, + "step": 9107 + }, + { + "epoch": 0.8783027965284474, + "grad_norm": 2.1491010189056396, + "learning_rate": 1.8078149721500654e-06, + "loss": 5.116, + "step": 9108 + }, + { + "epoch": 0.8783992285438765, + "grad_norm": 1.4244072437286377, + "learning_rate": 1.8049883163968356e-06, + "loss": 5.1784, + "step": 9109 + }, + { + "epoch": 0.8784956605593057, + "grad_norm": 2.4392669200897217, + "learning_rate": 1.8021637894532233e-06, + "loss": 5.5465, + "step": 9110 + }, + { + "epoch": 0.8785920925747348, + "grad_norm": 1.3750642538070679, + "learning_rate": 1.7993413915784684e-06, + "loss": 5.3536, + "step": 9111 + }, + { + "epoch": 0.8786885245901639, + "grad_norm": 1.4009324312210083, + "learning_rate": 1.7965211230316025e-06, + "loss": 5.3104, + "step": 9112 + }, + { + "epoch": 0.8787849566055931, + "grad_norm": 1.8719900846481323, + "learning_rate": 1.7937029840714715e-06, + "loss": 5.4515, + "step": 9113 + }, + { + "epoch": 0.8788813886210222, + "grad_norm": 1.7264145612716675, + "learning_rate": 1.790886974956718e-06, + "loss": 5.4812, + "step": 9114 + }, + { + "epoch": 0.8789778206364514, + "grad_norm": 1.7237805128097534, + "learning_rate": 1.7880730959457915e-06, + "loss": 5.6129, + "step": 9115 + }, + { + "epoch": 0.8790742526518804, + "grad_norm": 1.463427186012268, + "learning_rate": 1.785261347296946e-06, + "loss": 5.3401, + "step": 9116 + }, + { + "epoch": 0.8791706846673095, + "grad_norm": 1.1471024751663208, + "learning_rate": 1.7824517292682418e-06, + "loss": 5.362, + "step": 9117 + }, + { + "epoch": 0.8792671166827387, + "grad_norm": 1.6610872745513916, + "learning_rate": 1.779644242117548e-06, + "loss": 5.2094, + "step": 9118 + }, + { + "epoch": 0.8793635486981678, + "grad_norm": 1.6860238313674927, + "learning_rate": 1.7768388861025165e-06, + "loss": 5.2243, + "step": 9119 + }, + { + "epoch": 0.8794599807135969, + "grad_norm": 1.4044877290725708, + "learning_rate": 1.7740356614806359e-06, + "loss": 5.3494, + "step": 9120 + }, + { + "epoch": 0.8795564127290261, + "grad_norm": 1.3481440544128418, + "learning_rate": 1.7712345685091724e-06, + "loss": 5.341, + "step": 9121 + }, + { + "epoch": 0.8796528447444552, + "grad_norm": 1.4084315299987793, + "learning_rate": 1.7684356074452123e-06, + "loss": 4.9756, + "step": 9122 + }, + { + "epoch": 0.8797492767598842, + "grad_norm": 2.1908984184265137, + "learning_rate": 1.7656387785456363e-06, + "loss": 5.0581, + "step": 9123 + }, + { + "epoch": 0.8798457087753134, + "grad_norm": 1.3282809257507324, + "learning_rate": 1.7628440820671417e-06, + "loss": 5.5053, + "step": 9124 + }, + { + "epoch": 0.8799421407907425, + "grad_norm": 1.3089568614959717, + "learning_rate": 1.760051518266209e-06, + "loss": 5.3504, + "step": 9125 + }, + { + "epoch": 0.8800385728061717, + "grad_norm": 1.3987568616867065, + "learning_rate": 1.7572610873991508e-06, + "loss": 5.4054, + "step": 9126 + }, + { + "epoch": 0.8801350048216008, + "grad_norm": 1.4598439931869507, + "learning_rate": 1.754472789722056e-06, + "loss": 5.3812, + "step": 9127 + }, + { + "epoch": 0.8802314368370299, + "grad_norm": 1.3012641668319702, + "learning_rate": 1.7516866254908448e-06, + "loss": 5.4824, + "step": 9128 + }, + { + "epoch": 0.8803278688524591, + "grad_norm": 2.568263530731201, + "learning_rate": 1.7489025949612187e-06, + "loss": 4.9836, + "step": 9129 + }, + { + "epoch": 0.8804243008678881, + "grad_norm": 2.2910940647125244, + "learning_rate": 1.7461206983886952e-06, + "loss": 4.9987, + "step": 9130 + }, + { + "epoch": 0.8805207328833172, + "grad_norm": 2.4146170616149902, + "learning_rate": 1.7433409360285924e-06, + "loss": 5.0556, + "step": 9131 + }, + { + "epoch": 0.8806171648987464, + "grad_norm": 2.1179325580596924, + "learning_rate": 1.740563308136034e-06, + "loss": 5.0156, + "step": 9132 + }, + { + "epoch": 0.8807135969141755, + "grad_norm": 1.827635407447815, + "learning_rate": 1.7377878149659466e-06, + "loss": 4.9768, + "step": 9133 + }, + { + "epoch": 0.8808100289296046, + "grad_norm": 1.798624873161316, + "learning_rate": 1.7350144567730624e-06, + "loss": 4.994, + "step": 9134 + }, + { + "epoch": 0.8809064609450338, + "grad_norm": 1.8188886642456055, + "learning_rate": 1.7322432338119227e-06, + "loss": 5.0282, + "step": 9135 + }, + { + "epoch": 0.8810028929604629, + "grad_norm": 1.3600789308547974, + "learning_rate": 1.7294741463368514e-06, + "loss": 5.2021, + "step": 9136 + }, + { + "epoch": 0.881099324975892, + "grad_norm": 1.8127939701080322, + "learning_rate": 1.726707194602012e-06, + "loss": 5.2528, + "step": 9137 + }, + { + "epoch": 0.8811957569913211, + "grad_norm": 1.772892713546753, + "learning_rate": 1.7239423788613318e-06, + "loss": 5.3623, + "step": 9138 + }, + { + "epoch": 0.8812921890067502, + "grad_norm": 1.9302682876586914, + "learning_rate": 1.7211796993685776e-06, + "loss": 5.4002, + "step": 9139 + }, + { + "epoch": 0.8813886210221794, + "grad_norm": 1.967355489730835, + "learning_rate": 1.718419156377299e-06, + "loss": 5.4224, + "step": 9140 + }, + { + "epoch": 0.8814850530376085, + "grad_norm": 2.1028618812561035, + "learning_rate": 1.7156607501408517e-06, + "loss": 5.4121, + "step": 9141 + }, + { + "epoch": 0.8815814850530376, + "grad_norm": 2.048143148422241, + "learning_rate": 1.7129044809124056e-06, + "loss": 5.3121, + "step": 9142 + }, + { + "epoch": 0.8816779170684668, + "grad_norm": 1.4334262609481812, + "learning_rate": 1.7101503489449278e-06, + "loss": 5.3076, + "step": 9143 + }, + { + "epoch": 0.8817743490838958, + "grad_norm": 2.9754717350006104, + "learning_rate": 1.707398354491177e-06, + "loss": 4.8238, + "step": 9144 + }, + { + "epoch": 0.8818707810993249, + "grad_norm": 5.882772922515869, + "learning_rate": 1.704648497803743e-06, + "loss": 4.4837, + "step": 9145 + }, + { + "epoch": 0.8819672131147541, + "grad_norm": 1.1892898082733154, + "learning_rate": 1.7019007791349962e-06, + "loss": 5.6288, + "step": 9146 + }, + { + "epoch": 0.8820636451301832, + "grad_norm": 1.2367759943008423, + "learning_rate": 1.6991551987371208e-06, + "loss": 5.3774, + "step": 9147 + }, + { + "epoch": 0.8821600771456124, + "grad_norm": 2.0320262908935547, + "learning_rate": 1.6964117568621013e-06, + "loss": 5.501, + "step": 9148 + }, + { + "epoch": 0.8822565091610415, + "grad_norm": 1.6440043449401855, + "learning_rate": 1.6936704537617309e-06, + "loss": 5.5132, + "step": 9149 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 1.7692053318023682, + "learning_rate": 1.6909312896875973e-06, + "loss": 5.1658, + "step": 9150 + }, + { + "epoch": 0.8824493731918998, + "grad_norm": 2.049912452697754, + "learning_rate": 1.6881942648911076e-06, + "loss": 5.4381, + "step": 9151 + }, + { + "epoch": 0.8825458052073288, + "grad_norm": 1.520208477973938, + "learning_rate": 1.6854593796234496e-06, + "loss": 5.1382, + "step": 9152 + }, + { + "epoch": 0.8826422372227579, + "grad_norm": 1.3775347471237183, + "learning_rate": 1.6827266341356397e-06, + "loss": 5.2748, + "step": 9153 + }, + { + "epoch": 0.8827386692381871, + "grad_norm": 1.2726291418075562, + "learning_rate": 1.6799960286784766e-06, + "loss": 5.2015, + "step": 9154 + }, + { + "epoch": 0.8828351012536162, + "grad_norm": 1.4660074710845947, + "learning_rate": 1.677267563502577e-06, + "loss": 5.4448, + "step": 9155 + }, + { + "epoch": 0.8829315332690453, + "grad_norm": 1.4070161581039429, + "learning_rate": 1.6745412388583543e-06, + "loss": 5.5049, + "step": 9156 + }, + { + "epoch": 0.8830279652844745, + "grad_norm": 1.6093506813049316, + "learning_rate": 1.6718170549960248e-06, + "loss": 5.091, + "step": 9157 + }, + { + "epoch": 0.8831243972999036, + "grad_norm": 1.8348803520202637, + "learning_rate": 1.669095012165617e-06, + "loss": 4.8074, + "step": 9158 + }, + { + "epoch": 0.8832208293153327, + "grad_norm": 1.7874172925949097, + "learning_rate": 1.6663751106169523e-06, + "loss": 5.0655, + "step": 9159 + }, + { + "epoch": 0.8833172613307618, + "grad_norm": 1.7355448007583618, + "learning_rate": 1.6636573505996622e-06, + "loss": 5.0539, + "step": 9160 + }, + { + "epoch": 0.8834136933461909, + "grad_norm": 1.170723557472229, + "learning_rate": 1.6609417323631777e-06, + "loss": 4.8772, + "step": 9161 + }, + { + "epoch": 0.8835101253616201, + "grad_norm": 1.337350606918335, + "learning_rate": 1.658228256156738e-06, + "loss": 4.8255, + "step": 9162 + }, + { + "epoch": 0.8836065573770492, + "grad_norm": 1.5632250308990479, + "learning_rate": 1.6555169222293721e-06, + "loss": 4.636, + "step": 9163 + }, + { + "epoch": 0.8837029893924783, + "grad_norm": 1.9230200052261353, + "learning_rate": 1.652807730829939e-06, + "loss": 4.5525, + "step": 9164 + }, + { + "epoch": 0.8837994214079075, + "grad_norm": 1.8062875270843506, + "learning_rate": 1.6501006822070731e-06, + "loss": 4.4837, + "step": 9165 + }, + { + "epoch": 0.8838958534233365, + "grad_norm": 1.579635739326477, + "learning_rate": 1.6473957766092285e-06, + "loss": 4.4843, + "step": 9166 + }, + { + "epoch": 0.8839922854387656, + "grad_norm": 1.5773556232452393, + "learning_rate": 1.6446930142846539e-06, + "loss": 4.5126, + "step": 9167 + }, + { + "epoch": 0.8840887174541948, + "grad_norm": 1.4469659328460693, + "learning_rate": 1.6419923954814147e-06, + "loss": 4.524, + "step": 9168 + }, + { + "epoch": 0.8841851494696239, + "grad_norm": 1.9137629270553589, + "learning_rate": 1.6392939204473574e-06, + "loss": 4.9229, + "step": 9169 + }, + { + "epoch": 0.8842815814850531, + "grad_norm": 1.971616506576538, + "learning_rate": 1.6365975894301582e-06, + "loss": 5.065, + "step": 9170 + }, + { + "epoch": 0.8843780135004822, + "grad_norm": 2.013206720352173, + "learning_rate": 1.6339034026772697e-06, + "loss": 4.8032, + "step": 9171 + }, + { + "epoch": 0.8844744455159113, + "grad_norm": 1.717349886894226, + "learning_rate": 1.6312113604359747e-06, + "loss": 5.1084, + "step": 9172 + }, + { + "epoch": 0.8845708775313404, + "grad_norm": 1.4575800895690918, + "learning_rate": 1.6285214629533363e-06, + "loss": 5.2844, + "step": 9173 + }, + { + "epoch": 0.8846673095467695, + "grad_norm": 1.3951936960220337, + "learning_rate": 1.6258337104762323e-06, + "loss": 5.3469, + "step": 9174 + }, + { + "epoch": 0.8847637415621986, + "grad_norm": 2.407231330871582, + "learning_rate": 1.6231481032513402e-06, + "loss": 5.3421, + "step": 9175 + }, + { + "epoch": 0.8848601735776278, + "grad_norm": 1.6651372909545898, + "learning_rate": 1.6204646415251434e-06, + "loss": 5.0331, + "step": 9176 + }, + { + "epoch": 0.8849566055930569, + "grad_norm": 1.9917504787445068, + "learning_rate": 1.6177833255439256e-06, + "loss": 4.8929, + "step": 9177 + }, + { + "epoch": 0.885053037608486, + "grad_norm": 1.3635129928588867, + "learning_rate": 1.615104155553776e-06, + "loss": 4.9253, + "step": 9178 + }, + { + "epoch": 0.8851494696239152, + "grad_norm": 1.8527823686599731, + "learning_rate": 1.6124271318005896e-06, + "loss": 4.6834, + "step": 9179 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 1.944334626197815, + "learning_rate": 1.6097522545300447e-06, + "loss": 4.9979, + "step": 9180 + }, + { + "epoch": 0.8853423336547734, + "grad_norm": 1.394767165184021, + "learning_rate": 1.6070795239876618e-06, + "loss": 4.6802, + "step": 9181 + }, + { + "epoch": 0.8854387656702025, + "grad_norm": 1.2713584899902344, + "learning_rate": 1.6044089404187168e-06, + "loss": 5.115, + "step": 9182 + }, + { + "epoch": 0.8855351976856316, + "grad_norm": 1.5891274213790894, + "learning_rate": 1.6017405040683331e-06, + "loss": 5.1584, + "step": 9183 + }, + { + "epoch": 0.8856316297010608, + "grad_norm": 1.4713083505630493, + "learning_rate": 1.5990742151814036e-06, + "loss": 4.7289, + "step": 9184 + }, + { + "epoch": 0.8857280617164899, + "grad_norm": 1.9813276529312134, + "learning_rate": 1.596410074002641e-06, + "loss": 4.9166, + "step": 9185 + }, + { + "epoch": 0.885824493731919, + "grad_norm": 1.4293808937072754, + "learning_rate": 1.5937480807765554e-06, + "loss": 5.1185, + "step": 9186 + }, + { + "epoch": 0.8859209257473482, + "grad_norm": 1.521905541419983, + "learning_rate": 1.5910882357474677e-06, + "loss": 5.0238, + "step": 9187 + }, + { + "epoch": 0.8860173577627772, + "grad_norm": 1.2592649459838867, + "learning_rate": 1.58843053915948e-06, + "loss": 5.0388, + "step": 9188 + }, + { + "epoch": 0.8861137897782063, + "grad_norm": 1.6050934791564941, + "learning_rate": 1.5857749912565334e-06, + "loss": 4.9163, + "step": 9189 + }, + { + "epoch": 0.8862102217936355, + "grad_norm": 1.2939199209213257, + "learning_rate": 1.5831215922823327e-06, + "loss": 4.9418, + "step": 9190 + }, + { + "epoch": 0.8863066538090646, + "grad_norm": 1.9247229099273682, + "learning_rate": 1.580470342480414e-06, + "loss": 4.8239, + "step": 9191 + }, + { + "epoch": 0.8864030858244938, + "grad_norm": 1.4425148963928223, + "learning_rate": 1.5778212420940991e-06, + "loss": 4.9003, + "step": 9192 + }, + { + "epoch": 0.8864995178399229, + "grad_norm": 1.533152461051941, + "learning_rate": 1.5751742913665268e-06, + "loss": 5.0476, + "step": 9193 + }, + { + "epoch": 0.886595949855352, + "grad_norm": 1.5947763919830322, + "learning_rate": 1.5725294905406224e-06, + "loss": 4.9802, + "step": 9194 + }, + { + "epoch": 0.8866923818707811, + "grad_norm": 1.3212416172027588, + "learning_rate": 1.5698868398591332e-06, + "loss": 5.0873, + "step": 9195 + }, + { + "epoch": 0.8867888138862102, + "grad_norm": 1.6595340967178345, + "learning_rate": 1.5672463395645848e-06, + "loss": 4.9727, + "step": 9196 + }, + { + "epoch": 0.8868852459016393, + "grad_norm": 1.5212584733963013, + "learning_rate": 1.5646079898993333e-06, + "loss": 5.001, + "step": 9197 + }, + { + "epoch": 0.8869816779170685, + "grad_norm": 1.3879481554031372, + "learning_rate": 1.5619717911055127e-06, + "loss": 4.9561, + "step": 9198 + }, + { + "epoch": 0.8870781099324976, + "grad_norm": 1.5365809202194214, + "learning_rate": 1.559337743425071e-06, + "loss": 4.9715, + "step": 9199 + }, + { + "epoch": 0.8871745419479267, + "grad_norm": 1.4383729696273804, + "learning_rate": 1.5567058470997652e-06, + "loss": 4.6358, + "step": 9200 + }, + { + "epoch": 0.8872709739633559, + "grad_norm": 1.5097007751464844, + "learning_rate": 1.554076102371138e-06, + "loss": 5.1146, + "step": 9201 + }, + { + "epoch": 0.8873674059787849, + "grad_norm": 1.5191760063171387, + "learning_rate": 1.551448509480552e-06, + "loss": 4.8457, + "step": 9202 + }, + { + "epoch": 0.8874638379942141, + "grad_norm": 2.007131814956665, + "learning_rate": 1.5488230686691612e-06, + "loss": 4.8839, + "step": 9203 + }, + { + "epoch": 0.8875602700096432, + "grad_norm": 1.4355831146240234, + "learning_rate": 1.5461997801779233e-06, + "loss": 4.897, + "step": 9204 + }, + { + "epoch": 0.8876567020250723, + "grad_norm": 1.8317337036132812, + "learning_rate": 1.5435786442476036e-06, + "loss": 4.787, + "step": 9205 + }, + { + "epoch": 0.8877531340405015, + "grad_norm": 1.4162834882736206, + "learning_rate": 1.540959661118771e-06, + "loss": 4.9306, + "step": 9206 + }, + { + "epoch": 0.8878495660559306, + "grad_norm": 1.6223608255386353, + "learning_rate": 1.5383428310317777e-06, + "loss": 4.9745, + "step": 9207 + }, + { + "epoch": 0.8879459980713597, + "grad_norm": 1.3842893838882446, + "learning_rate": 1.535728154226812e-06, + "loss": 5.0607, + "step": 9208 + }, + { + "epoch": 0.8880424300867888, + "grad_norm": 1.4412975311279297, + "learning_rate": 1.5331156309438322e-06, + "loss": 4.9932, + "step": 9209 + }, + { + "epoch": 0.8881388621022179, + "grad_norm": 1.3540858030319214, + "learning_rate": 1.5305052614226156e-06, + "loss": 5.1111, + "step": 9210 + }, + { + "epoch": 0.888235294117647, + "grad_norm": 1.326709508895874, + "learning_rate": 1.5278970459027404e-06, + "loss": 5.2842, + "step": 9211 + }, + { + "epoch": 0.8883317261330762, + "grad_norm": 1.4769985675811768, + "learning_rate": 1.5252909846235896e-06, + "loss": 4.7418, + "step": 9212 + }, + { + "epoch": 0.8884281581485053, + "grad_norm": 1.2360271215438843, + "learning_rate": 1.5226870778243306e-06, + "loss": 4.9033, + "step": 9213 + }, + { + "epoch": 0.8885245901639345, + "grad_norm": 1.7106349468231201, + "learning_rate": 1.5200853257439667e-06, + "loss": 4.6749, + "step": 9214 + }, + { + "epoch": 0.8886210221793636, + "grad_norm": 1.3055815696716309, + "learning_rate": 1.5174857286212624e-06, + "loss": 5.2365, + "step": 9215 + }, + { + "epoch": 0.8887174541947926, + "grad_norm": 1.6915334463119507, + "learning_rate": 1.514888286694821e-06, + "loss": 5.3311, + "step": 9216 + }, + { + "epoch": 0.8888138862102218, + "grad_norm": 1.2021640539169312, + "learning_rate": 1.5122930002030272e-06, + "loss": 4.9776, + "step": 9217 + }, + { + "epoch": 0.8889103182256509, + "grad_norm": 1.346504807472229, + "learning_rate": 1.5096998693840708e-06, + "loss": 5.0094, + "step": 9218 + }, + { + "epoch": 0.88900675024108, + "grad_norm": 1.279015302658081, + "learning_rate": 1.5071088944759504e-06, + "loss": 5.1552, + "step": 9219 + }, + { + "epoch": 0.8891031822565092, + "grad_norm": 1.3668110370635986, + "learning_rate": 1.504520075716459e-06, + "loss": 5.4442, + "step": 9220 + }, + { + "epoch": 0.8891996142719383, + "grad_norm": 1.4488227367401123, + "learning_rate": 1.5019334133431984e-06, + "loss": 5.1832, + "step": 9221 + }, + { + "epoch": 0.8892960462873674, + "grad_norm": 1.5226621627807617, + "learning_rate": 1.4993489075935645e-06, + "loss": 4.9159, + "step": 9222 + }, + { + "epoch": 0.8893924783027966, + "grad_norm": 1.3010127544403076, + "learning_rate": 1.4967665587047708e-06, + "loss": 5.2283, + "step": 9223 + }, + { + "epoch": 0.8894889103182256, + "grad_norm": 1.631151795387268, + "learning_rate": 1.494186366913805e-06, + "loss": 4.7932, + "step": 9224 + }, + { + "epoch": 0.8895853423336548, + "grad_norm": 1.7223759889602661, + "learning_rate": 1.491608332457492e-06, + "loss": 5.1092, + "step": 9225 + }, + { + "epoch": 0.8896817743490839, + "grad_norm": 2.162414073944092, + "learning_rate": 1.4890324555724227e-06, + "loss": 5.5674, + "step": 9226 + }, + { + "epoch": 0.889778206364513, + "grad_norm": 1.2956202030181885, + "learning_rate": 1.4864587364950278e-06, + "loss": 5.0514, + "step": 9227 + }, + { + "epoch": 0.8898746383799422, + "grad_norm": 1.2546031475067139, + "learning_rate": 1.483887175461504e-06, + "loss": 5.0557, + "step": 9228 + }, + { + "epoch": 0.8899710703953713, + "grad_norm": 1.4834693670272827, + "learning_rate": 1.4813177727078714e-06, + "loss": 4.8753, + "step": 9229 + }, + { + "epoch": 0.8900675024108003, + "grad_norm": 1.6773521900177002, + "learning_rate": 1.4787505284699466e-06, + "loss": 5.0024, + "step": 9230 + }, + { + "epoch": 0.8901639344262295, + "grad_norm": 1.727249026298523, + "learning_rate": 1.4761854429833522e-06, + "loss": 4.8109, + "step": 9231 + }, + { + "epoch": 0.8902603664416586, + "grad_norm": 1.285367488861084, + "learning_rate": 1.4736225164834999e-06, + "loss": 4.7409, + "step": 9232 + }, + { + "epoch": 0.8903567984570877, + "grad_norm": 1.6537675857543945, + "learning_rate": 1.471061749205621e-06, + "loss": 4.7589, + "step": 9233 + }, + { + "epoch": 0.8904532304725169, + "grad_norm": 1.1308218240737915, + "learning_rate": 1.4685031413847332e-06, + "loss": 5.0684, + "step": 9234 + }, + { + "epoch": 0.890549662487946, + "grad_norm": 1.401899814605713, + "learning_rate": 1.4659466932556626e-06, + "loss": 4.8428, + "step": 9235 + }, + { + "epoch": 0.8906460945033752, + "grad_norm": 1.3554280996322632, + "learning_rate": 1.4633924050530407e-06, + "loss": 5.1611, + "step": 9236 + }, + { + "epoch": 0.8907425265188043, + "grad_norm": 1.1029282808303833, + "learning_rate": 1.4608402770112966e-06, + "loss": 4.9581, + "step": 9237 + }, + { + "epoch": 0.8908389585342333, + "grad_norm": 2.1393091678619385, + "learning_rate": 1.4582903093646572e-06, + "loss": 4.9595, + "step": 9238 + }, + { + "epoch": 0.8909353905496625, + "grad_norm": 1.6599512100219727, + "learning_rate": 1.4557425023471572e-06, + "loss": 4.9301, + "step": 9239 + }, + { + "epoch": 0.8910318225650916, + "grad_norm": 1.6023058891296387, + "learning_rate": 1.4531968561926345e-06, + "loss": 4.4584, + "step": 9240 + }, + { + "epoch": 0.8911282545805207, + "grad_norm": 1.8612892627716064, + "learning_rate": 1.450653371134722e-06, + "loss": 5.0117, + "step": 9241 + }, + { + "epoch": 0.8912246865959499, + "grad_norm": 1.4879013299942017, + "learning_rate": 1.4481120474068633e-06, + "loss": 4.814, + "step": 9242 + }, + { + "epoch": 0.891321118611379, + "grad_norm": 1.6280845403671265, + "learning_rate": 1.4455728852422857e-06, + "loss": 5.0331, + "step": 9243 + }, + { + "epoch": 0.891417550626808, + "grad_norm": 1.4146661758422852, + "learning_rate": 1.4430358848740443e-06, + "loss": 4.9518, + "step": 9244 + }, + { + "epoch": 0.8915139826422372, + "grad_norm": 1.4401744604110718, + "learning_rate": 1.44050104653497e-06, + "loss": 4.8092, + "step": 9245 + }, + { + "epoch": 0.8916104146576663, + "grad_norm": 1.8695107698440552, + "learning_rate": 1.437968370457715e-06, + "loss": 4.8275, + "step": 9246 + }, + { + "epoch": 0.8917068466730955, + "grad_norm": 2.197395086288452, + "learning_rate": 1.4354378568747218e-06, + "loss": 4.7456, + "step": 9247 + }, + { + "epoch": 0.8918032786885246, + "grad_norm": 1.4011704921722412, + "learning_rate": 1.43290950601824e-06, + "loss": 5.0589, + "step": 9248 + }, + { + "epoch": 0.8918997107039537, + "grad_norm": 1.2970359325408936, + "learning_rate": 1.430383318120318e-06, + "loss": 4.7175, + "step": 9249 + }, + { + "epoch": 0.8919961427193829, + "grad_norm": 1.3417026996612549, + "learning_rate": 1.4278592934128089e-06, + "loss": 4.7601, + "step": 9250 + }, + { + "epoch": 0.892092574734812, + "grad_norm": 1.3115689754486084, + "learning_rate": 1.4253374321273522e-06, + "loss": 4.7783, + "step": 9251 + }, + { + "epoch": 0.892189006750241, + "grad_norm": 1.4193495512008667, + "learning_rate": 1.4228177344954185e-06, + "loss": 5.1117, + "step": 9252 + }, + { + "epoch": 0.8922854387656702, + "grad_norm": 1.6856706142425537, + "learning_rate": 1.4203002007482508e-06, + "loss": 4.8538, + "step": 9253 + }, + { + "epoch": 0.8923818707810993, + "grad_norm": 1.698244571685791, + "learning_rate": 1.4177848311169085e-06, + "loss": 4.9646, + "step": 9254 + }, + { + "epoch": 0.8924783027965284, + "grad_norm": 2.1552696228027344, + "learning_rate": 1.4152716258322485e-06, + "loss": 4.7865, + "step": 9255 + }, + { + "epoch": 0.8925747348119576, + "grad_norm": 1.490374207496643, + "learning_rate": 1.4127605851249364e-06, + "loss": 4.8072, + "step": 9256 + }, + { + "epoch": 0.8926711668273867, + "grad_norm": 1.3874090909957886, + "learning_rate": 1.4102517092254187e-06, + "loss": 4.9251, + "step": 9257 + }, + { + "epoch": 0.8927675988428159, + "grad_norm": 1.4836112260818481, + "learning_rate": 1.4077449983639718e-06, + "loss": 4.4659, + "step": 9258 + }, + { + "epoch": 0.892864030858245, + "grad_norm": 1.7210006713867188, + "learning_rate": 1.4052404527706425e-06, + "loss": 5.1852, + "step": 9259 + }, + { + "epoch": 0.892960462873674, + "grad_norm": 1.5548393726348877, + "learning_rate": 1.4027380726753137e-06, + "loss": 5.3039, + "step": 9260 + }, + { + "epoch": 0.8930568948891032, + "grad_norm": 1.7521330118179321, + "learning_rate": 1.4002378583076376e-06, + "loss": 4.7456, + "step": 9261 + }, + { + "epoch": 0.8931533269045323, + "grad_norm": 1.4730353355407715, + "learning_rate": 1.3977398098970833e-06, + "loss": 4.7316, + "step": 9262 + }, + { + "epoch": 0.8932497589199614, + "grad_norm": 1.4046434164047241, + "learning_rate": 1.3952439276729207e-06, + "loss": 4.9933, + "step": 9263 + }, + { + "epoch": 0.8933461909353906, + "grad_norm": 1.1407660245895386, + "learning_rate": 1.392750211864216e-06, + "loss": 5.02, + "step": 9264 + }, + { + "epoch": 0.8934426229508197, + "grad_norm": 1.2102749347686768, + "learning_rate": 1.3902586626998422e-06, + "loss": 4.8446, + "step": 9265 + }, + { + "epoch": 0.8935390549662487, + "grad_norm": 1.2060352563858032, + "learning_rate": 1.3877692804084685e-06, + "loss": 4.9127, + "step": 9266 + }, + { + "epoch": 0.8936354869816779, + "grad_norm": 1.1450203657150269, + "learning_rate": 1.385282065218571e-06, + "loss": 4.8876, + "step": 9267 + }, + { + "epoch": 0.893731918997107, + "grad_norm": 1.2657732963562012, + "learning_rate": 1.3827970173584171e-06, + "loss": 5.1419, + "step": 9268 + }, + { + "epoch": 0.8938283510125362, + "grad_norm": 1.5937968492507935, + "learning_rate": 1.3803141370560879e-06, + "loss": 5.0467, + "step": 9269 + }, + { + "epoch": 0.8939247830279653, + "grad_norm": 1.6478550434112549, + "learning_rate": 1.3778334245394514e-06, + "loss": 4.8855, + "step": 9270 + }, + { + "epoch": 0.8940212150433944, + "grad_norm": 1.2483876943588257, + "learning_rate": 1.3753548800361975e-06, + "loss": 5.3803, + "step": 9271 + }, + { + "epoch": 0.8941176470588236, + "grad_norm": 1.9143887758255005, + "learning_rate": 1.372878503773789e-06, + "loss": 5.2169, + "step": 9272 + }, + { + "epoch": 0.8942140790742527, + "grad_norm": 1.8902132511138916, + "learning_rate": 1.3704042959795132e-06, + "loss": 5.0718, + "step": 9273 + }, + { + "epoch": 0.8943105110896817, + "grad_norm": 1.431678295135498, + "learning_rate": 1.367932256880447e-06, + "loss": 5.1909, + "step": 9274 + }, + { + "epoch": 0.8944069431051109, + "grad_norm": 1.2915270328521729, + "learning_rate": 1.3654623867034755e-06, + "loss": 4.7098, + "step": 9275 + }, + { + "epoch": 0.89450337512054, + "grad_norm": 1.50789475440979, + "learning_rate": 1.3629946856752674e-06, + "loss": 5.0027, + "step": 9276 + }, + { + "epoch": 0.8945998071359691, + "grad_norm": 1.3592230081558228, + "learning_rate": 1.3605291540223246e-06, + "loss": 4.892, + "step": 9277 + }, + { + "epoch": 0.8946962391513983, + "grad_norm": 1.7421557903289795, + "learning_rate": 1.3580657919709167e-06, + "loss": 4.8876, + "step": 9278 + }, + { + "epoch": 0.8947926711668274, + "grad_norm": 1.406378984451294, + "learning_rate": 1.3556045997471318e-06, + "loss": 5.2675, + "step": 9279 + }, + { + "epoch": 0.8948891031822566, + "grad_norm": 1.1467876434326172, + "learning_rate": 1.353145577576856e-06, + "loss": 5.1894, + "step": 9280 + }, + { + "epoch": 0.8949855351976856, + "grad_norm": 1.3485666513442993, + "learning_rate": 1.3506887256857726e-06, + "loss": 4.8571, + "step": 9281 + }, + { + "epoch": 0.8950819672131147, + "grad_norm": 1.3530867099761963, + "learning_rate": 1.3482340442993708e-06, + "loss": 4.9052, + "step": 9282 + }, + { + "epoch": 0.8951783992285439, + "grad_norm": 1.4229955673217773, + "learning_rate": 1.3457815336429369e-06, + "loss": 4.8746, + "step": 9283 + }, + { + "epoch": 0.895274831243973, + "grad_norm": 1.3832625150680542, + "learning_rate": 1.3433311939415632e-06, + "loss": 4.853, + "step": 9284 + }, + { + "epoch": 0.8953712632594021, + "grad_norm": 1.3558626174926758, + "learning_rate": 1.3408830254201337e-06, + "loss": 4.8407, + "step": 9285 + }, + { + "epoch": 0.8954676952748313, + "grad_norm": 1.3574631214141846, + "learning_rate": 1.3384370283033437e-06, + "loss": 4.8669, + "step": 9286 + }, + { + "epoch": 0.8955641272902604, + "grad_norm": 1.09125816822052, + "learning_rate": 1.3359932028156746e-06, + "loss": 4.9418, + "step": 9287 + }, + { + "epoch": 0.8956605593056894, + "grad_norm": 1.1193045377731323, + "learning_rate": 1.3335515491814331e-06, + "loss": 4.9415, + "step": 9288 + }, + { + "epoch": 0.8957569913211186, + "grad_norm": 1.3822134733200073, + "learning_rate": 1.3311120676246958e-06, + "loss": 5.1183, + "step": 9289 + }, + { + "epoch": 0.8958534233365477, + "grad_norm": 1.419032335281372, + "learning_rate": 1.328674758369361e-06, + "loss": 5.0902, + "step": 9290 + }, + { + "epoch": 0.8959498553519769, + "grad_norm": 1.2831734418869019, + "learning_rate": 1.326239621639122e-06, + "loss": 4.7627, + "step": 9291 + }, + { + "epoch": 0.896046287367406, + "grad_norm": 1.449323058128357, + "learning_rate": 1.3238066576574725e-06, + "loss": 4.873, + "step": 9292 + }, + { + "epoch": 0.8961427193828351, + "grad_norm": 1.5573112964630127, + "learning_rate": 1.3213758666477088e-06, + "loss": 4.782, + "step": 9293 + }, + { + "epoch": 0.8962391513982643, + "grad_norm": 1.7834290266036987, + "learning_rate": 1.3189472488329274e-06, + "loss": 4.814, + "step": 9294 + }, + { + "epoch": 0.8963355834136933, + "grad_norm": 1.1234439611434937, + "learning_rate": 1.316520804436014e-06, + "loss": 5.0361, + "step": 9295 + }, + { + "epoch": 0.8964320154291224, + "grad_norm": 1.7786400318145752, + "learning_rate": 1.3140965336796762e-06, + "loss": 4.8303, + "step": 9296 + }, + { + "epoch": 0.8965284474445516, + "grad_norm": 1.643121361732483, + "learning_rate": 1.3116744367864058e-06, + "loss": 4.5922, + "step": 9297 + }, + { + "epoch": 0.8966248794599807, + "grad_norm": 1.966553807258606, + "learning_rate": 1.3092545139784973e-06, + "loss": 4.9684, + "step": 9298 + }, + { + "epoch": 0.8967213114754098, + "grad_norm": 1.326323390007019, + "learning_rate": 1.3068367654780533e-06, + "loss": 5.0688, + "step": 9299 + }, + { + "epoch": 0.896817743490839, + "grad_norm": 1.2844029664993286, + "learning_rate": 1.3044211915069715e-06, + "loss": 4.711, + "step": 9300 + }, + { + "epoch": 0.8969141755062681, + "grad_norm": 1.5803872346878052, + "learning_rate": 1.3020077922869412e-06, + "loss": 5.2078, + "step": 9301 + }, + { + "epoch": 0.8970106075216973, + "grad_norm": 2.274977684020996, + "learning_rate": 1.2995965680394767e-06, + "loss": 5.167, + "step": 9302 + }, + { + "epoch": 0.8971070395371263, + "grad_norm": 1.8979005813598633, + "learning_rate": 1.2971875189858624e-06, + "loss": 5.1902, + "step": 9303 + }, + { + "epoch": 0.8972034715525554, + "grad_norm": 1.4629007577896118, + "learning_rate": 1.2947806453472071e-06, + "loss": 4.9938, + "step": 9304 + }, + { + "epoch": 0.8972999035679846, + "grad_norm": 1.888244867324829, + "learning_rate": 1.292375947344407e-06, + "loss": 4.8598, + "step": 9305 + }, + { + "epoch": 0.8973963355834137, + "grad_norm": 1.6010957956314087, + "learning_rate": 1.2899734251981627e-06, + "loss": 5.1333, + "step": 9306 + }, + { + "epoch": 0.8974927675988428, + "grad_norm": 1.6141763925552368, + "learning_rate": 1.2875730791289758e-06, + "loss": 5.0125, + "step": 9307 + }, + { + "epoch": 0.897589199614272, + "grad_norm": 1.6787148714065552, + "learning_rate": 1.2851749093571452e-06, + "loss": 4.992, + "step": 9308 + }, + { + "epoch": 0.897685631629701, + "grad_norm": 1.2783987522125244, + "learning_rate": 1.2827789161027754e-06, + "loss": 5.145, + "step": 9309 + }, + { + "epoch": 0.8977820636451301, + "grad_norm": 1.4056483507156372, + "learning_rate": 1.2803850995857653e-06, + "loss": 5.4177, + "step": 9310 + }, + { + "epoch": 0.8978784956605593, + "grad_norm": 1.4792335033416748, + "learning_rate": 1.277993460025817e-06, + "loss": 5.0026, + "step": 9311 + }, + { + "epoch": 0.8979749276759884, + "grad_norm": 1.2825958728790283, + "learning_rate": 1.2756039976424272e-06, + "loss": 4.7114, + "step": 9312 + }, + { + "epoch": 0.8980713596914176, + "grad_norm": 1.605168104171753, + "learning_rate": 1.2732167126549094e-06, + "loss": 4.8731, + "step": 9313 + }, + { + "epoch": 0.8981677917068467, + "grad_norm": 1.4261029958724976, + "learning_rate": 1.270831605282352e-06, + "loss": 5.0728, + "step": 9314 + }, + { + "epoch": 0.8982642237222758, + "grad_norm": 1.3311222791671753, + "learning_rate": 1.2684486757436692e-06, + "loss": 4.7946, + "step": 9315 + }, + { + "epoch": 0.898360655737705, + "grad_norm": 1.6266632080078125, + "learning_rate": 1.2660679242575579e-06, + "loss": 4.9926, + "step": 9316 + }, + { + "epoch": 0.898457087753134, + "grad_norm": 1.4032164812088013, + "learning_rate": 1.2636893510425186e-06, + "loss": 5.0867, + "step": 9317 + }, + { + "epoch": 0.8985535197685631, + "grad_norm": 1.4177254438400269, + "learning_rate": 1.261312956316857e-06, + "loss": 4.875, + "step": 9318 + }, + { + "epoch": 0.8986499517839923, + "grad_norm": 1.4936739206314087, + "learning_rate": 1.2589387402986764e-06, + "loss": 4.9147, + "step": 9319 + }, + { + "epoch": 0.8987463837994214, + "grad_norm": 2.362788438796997, + "learning_rate": 1.256566703205872e-06, + "loss": 4.8137, + "step": 9320 + }, + { + "epoch": 0.8988428158148505, + "grad_norm": 2.3133795261383057, + "learning_rate": 1.2541968452561587e-06, + "loss": 4.7287, + "step": 9321 + }, + { + "epoch": 0.8989392478302797, + "grad_norm": 1.6991230249404907, + "learning_rate": 1.251829166667029e-06, + "loss": 4.8968, + "step": 9322 + }, + { + "epoch": 0.8990356798457088, + "grad_norm": 1.5038782358169556, + "learning_rate": 1.2494636676557897e-06, + "loss": 4.969, + "step": 9323 + }, + { + "epoch": 0.899132111861138, + "grad_norm": 2.2908034324645996, + "learning_rate": 1.2471003484395422e-06, + "loss": 5.215, + "step": 9324 + }, + { + "epoch": 0.899228543876567, + "grad_norm": 1.3732918500900269, + "learning_rate": 1.2447392092351905e-06, + "loss": 5.0634, + "step": 9325 + }, + { + "epoch": 0.8993249758919961, + "grad_norm": 1.5985301733016968, + "learning_rate": 1.2423802502594368e-06, + "loss": 4.9434, + "step": 9326 + }, + { + "epoch": 0.8994214079074253, + "grad_norm": 1.2282360792160034, + "learning_rate": 1.2400234717287824e-06, + "loss": 5.0767, + "step": 9327 + }, + { + "epoch": 0.8995178399228544, + "grad_norm": 1.4497220516204834, + "learning_rate": 1.2376688738595293e-06, + "loss": 4.6699, + "step": 9328 + }, + { + "epoch": 0.8996142719382835, + "grad_norm": 1.2238926887512207, + "learning_rate": 1.23531645686778e-06, + "loss": 5.1258, + "step": 9329 + }, + { + "epoch": 0.8997107039537127, + "grad_norm": 1.1317799091339111, + "learning_rate": 1.2329662209694393e-06, + "loss": 5.1133, + "step": 9330 + }, + { + "epoch": 0.8998071359691417, + "grad_norm": 1.1579923629760742, + "learning_rate": 1.2306181663802013e-06, + "loss": 5.1246, + "step": 9331 + }, + { + "epoch": 0.8999035679845708, + "grad_norm": 1.3207650184631348, + "learning_rate": 1.2282722933155798e-06, + "loss": 4.7459, + "step": 9332 + }, + { + "epoch": 0.9, + "grad_norm": 1.5939009189605713, + "learning_rate": 1.2259286019908639e-06, + "loss": 5.3315, + "step": 9333 + }, + { + "epoch": 0.9000964320154291, + "grad_norm": 1.2375608682632446, + "learning_rate": 1.2235870926211619e-06, + "loss": 5.421, + "step": 9334 + }, + { + "epoch": 0.9001928640308583, + "grad_norm": 1.4534292221069336, + "learning_rate": 1.2212477654213712e-06, + "loss": 5.2315, + "step": 9335 + }, + { + "epoch": 0.9002892960462874, + "grad_norm": 1.252114176750183, + "learning_rate": 1.2189106206061923e-06, + "loss": 4.9778, + "step": 9336 + }, + { + "epoch": 0.9003857280617165, + "grad_norm": 1.2596338987350464, + "learning_rate": 1.216575658390129e-06, + "loss": 4.8839, + "step": 9337 + }, + { + "epoch": 0.9004821600771457, + "grad_norm": 1.3021783828735352, + "learning_rate": 1.2142428789874817e-06, + "loss": 5.2382, + "step": 9338 + }, + { + "epoch": 0.9005785920925747, + "grad_norm": 1.2910877466201782, + "learning_rate": 1.211912282612343e-06, + "loss": 5.0118, + "step": 9339 + }, + { + "epoch": 0.9006750241080038, + "grad_norm": 2.3663854598999023, + "learning_rate": 1.2095838694786198e-06, + "loss": 5.1552, + "step": 9340 + }, + { + "epoch": 0.900771456123433, + "grad_norm": 1.1507065296173096, + "learning_rate": 1.207257639800008e-06, + "loss": 5.0914, + "step": 9341 + }, + { + "epoch": 0.9008678881388621, + "grad_norm": 1.231925129890442, + "learning_rate": 1.204933593790003e-06, + "loss": 5.3191, + "step": 9342 + }, + { + "epoch": 0.9009643201542912, + "grad_norm": 1.3341739177703857, + "learning_rate": 1.2026117316619067e-06, + "loss": 5.5303, + "step": 9343 + }, + { + "epoch": 0.9010607521697204, + "grad_norm": 1.4181703329086304, + "learning_rate": 1.2002920536288153e-06, + "loss": 5.5194, + "step": 9344 + }, + { + "epoch": 0.9011571841851495, + "grad_norm": 1.5462771654129028, + "learning_rate": 1.197974559903628e-06, + "loss": 4.8877, + "step": 9345 + }, + { + "epoch": 0.9012536162005786, + "grad_norm": 1.6458094120025635, + "learning_rate": 1.1956592506990383e-06, + "loss": 4.9228, + "step": 9346 + }, + { + "epoch": 0.9013500482160077, + "grad_norm": 1.1546496152877808, + "learning_rate": 1.1933461262275458e-06, + "loss": 5.1104, + "step": 9347 + }, + { + "epoch": 0.9014464802314368, + "grad_norm": 1.7576338052749634, + "learning_rate": 1.1910351867014418e-06, + "loss": 5.4646, + "step": 9348 + }, + { + "epoch": 0.901542912246866, + "grad_norm": 1.5691896677017212, + "learning_rate": 1.1887264323328318e-06, + "loss": 4.6868, + "step": 9349 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 1.2485802173614502, + "learning_rate": 1.186419863333596e-06, + "loss": 5.4279, + "step": 9350 + }, + { + "epoch": 0.9017357762777242, + "grad_norm": 1.0792243480682373, + "learning_rate": 1.1841154799154374e-06, + "loss": 5.1965, + "step": 9351 + }, + { + "epoch": 0.9018322082931534, + "grad_norm": 1.2555902004241943, + "learning_rate": 1.181813282289848e-06, + "loss": 4.9034, + "step": 9352 + }, + { + "epoch": 0.9019286403085824, + "grad_norm": 1.3324475288391113, + "learning_rate": 1.1795132706681173e-06, + "loss": 5.0438, + "step": 9353 + }, + { + "epoch": 0.9020250723240115, + "grad_norm": 1.6272205114364624, + "learning_rate": 1.1772154452613426e-06, + "loss": 4.9366, + "step": 9354 + }, + { + "epoch": 0.9021215043394407, + "grad_norm": 2.1759533882141113, + "learning_rate": 1.1749198062804168e-06, + "loss": 4.822, + "step": 9355 + }, + { + "epoch": 0.9022179363548698, + "grad_norm": 1.5232763290405273, + "learning_rate": 1.1726263539360182e-06, + "loss": 4.8781, + "step": 9356 + }, + { + "epoch": 0.902314368370299, + "grad_norm": 1.2192771434783936, + "learning_rate": 1.1703350884386566e-06, + "loss": 4.9805, + "step": 9357 + }, + { + "epoch": 0.9024108003857281, + "grad_norm": 1.4568099975585938, + "learning_rate": 1.1680460099986023e-06, + "loss": 4.6081, + "step": 9358 + }, + { + "epoch": 0.9025072324011572, + "grad_norm": 1.7605290412902832, + "learning_rate": 1.1657591188259599e-06, + "loss": 4.8092, + "step": 9359 + }, + { + "epoch": 0.9026036644165863, + "grad_norm": 1.432968258857727, + "learning_rate": 1.1634744151306055e-06, + "loss": 4.7419, + "step": 9360 + }, + { + "epoch": 0.9027000964320154, + "grad_norm": 2.058073043823242, + "learning_rate": 1.1611918991222325e-06, + "loss": 5.1082, + "step": 9361 + }, + { + "epoch": 0.9027965284474445, + "grad_norm": 1.572981834411621, + "learning_rate": 1.1589115710103266e-06, + "loss": 5.0042, + "step": 9362 + }, + { + "epoch": 0.9028929604628737, + "grad_norm": 1.6013078689575195, + "learning_rate": 1.1566334310041753e-06, + "loss": 4.7704, + "step": 9363 + }, + { + "epoch": 0.9029893924783028, + "grad_norm": 1.5948152542114258, + "learning_rate": 1.1543574793128564e-06, + "loss": 5.5242, + "step": 9364 + }, + { + "epoch": 0.9030858244937319, + "grad_norm": 1.5289711952209473, + "learning_rate": 1.1520837161452636e-06, + "loss": 5.1028, + "step": 9365 + }, + { + "epoch": 0.9031822565091611, + "grad_norm": 1.4164339303970337, + "learning_rate": 1.1498121417100716e-06, + "loss": 5.1606, + "step": 9366 + }, + { + "epoch": 0.9032786885245901, + "grad_norm": 1.1662178039550781, + "learning_rate": 1.1475427562157693e-06, + "loss": 4.9921, + "step": 9367 + }, + { + "epoch": 0.9033751205400193, + "grad_norm": 1.378557562828064, + "learning_rate": 1.1452755598706315e-06, + "loss": 5.2385, + "step": 9368 + }, + { + "epoch": 0.9034715525554484, + "grad_norm": 1.390957236289978, + "learning_rate": 1.143010552882745e-06, + "loss": 5.132, + "step": 9369 + }, + { + "epoch": 0.9035679845708775, + "grad_norm": 1.5550858974456787, + "learning_rate": 1.1407477354599878e-06, + "loss": 4.7799, + "step": 9370 + }, + { + "epoch": 0.9036644165863067, + "grad_norm": 1.528220295906067, + "learning_rate": 1.138487107810035e-06, + "loss": 5.2857, + "step": 9371 + }, + { + "epoch": 0.9037608486017358, + "grad_norm": 1.5629205703735352, + "learning_rate": 1.1362286701403684e-06, + "loss": 5.1216, + "step": 9372 + }, + { + "epoch": 0.9038572806171649, + "grad_norm": 1.3859835863113403, + "learning_rate": 1.133972422658261e-06, + "loss": 5.0217, + "step": 9373 + }, + { + "epoch": 0.903953712632594, + "grad_norm": 1.5214773416519165, + "learning_rate": 1.131718365570797e-06, + "loss": 5.0457, + "step": 9374 + }, + { + "epoch": 0.9040501446480231, + "grad_norm": 1.8536806106567383, + "learning_rate": 1.129466499084833e-06, + "loss": 4.8844, + "step": 9375 + }, + { + "epoch": 0.9041465766634522, + "grad_norm": 1.3307251930236816, + "learning_rate": 1.127216823407065e-06, + "loss": 4.9322, + "step": 9376 + }, + { + "epoch": 0.9042430086788814, + "grad_norm": 2.0037457942962646, + "learning_rate": 1.1249693387439475e-06, + "loss": 4.9758, + "step": 9377 + }, + { + "epoch": 0.9043394406943105, + "grad_norm": 1.186913251876831, + "learning_rate": 1.1227240453017597e-06, + "loss": 4.8036, + "step": 9378 + }, + { + "epoch": 0.9044358727097397, + "grad_norm": 1.151732087135315, + "learning_rate": 1.1204809432865725e-06, + "loss": 5.3496, + "step": 9379 + }, + { + "epoch": 0.9045323047251688, + "grad_norm": 1.3095277547836304, + "learning_rate": 1.1182400329042525e-06, + "loss": 5.3093, + "step": 9380 + }, + { + "epoch": 0.9046287367405978, + "grad_norm": 1.3302781581878662, + "learning_rate": 1.116001314360468e-06, + "loss": 4.9714, + "step": 9381 + }, + { + "epoch": 0.904725168756027, + "grad_norm": 1.2602410316467285, + "learning_rate": 1.113764787860691e-06, + "loss": 4.9109, + "step": 9382 + }, + { + "epoch": 0.9048216007714561, + "grad_norm": 1.1502735614776611, + "learning_rate": 1.1115304536101738e-06, + "loss": 4.8269, + "step": 9383 + }, + { + "epoch": 0.9049180327868852, + "grad_norm": 1.308412790298462, + "learning_rate": 1.109298311814e-06, + "loss": 5.2517, + "step": 9384 + }, + { + "epoch": 0.9050144648023144, + "grad_norm": 1.3635971546173096, + "learning_rate": 1.1070683626770162e-06, + "loss": 5.1176, + "step": 9385 + }, + { + "epoch": 0.9051108968177435, + "grad_norm": 1.2486929893493652, + "learning_rate": 1.1048406064038896e-06, + "loss": 5.0571, + "step": 9386 + }, + { + "epoch": 0.9052073288331726, + "grad_norm": 1.5159001350402832, + "learning_rate": 1.1026150431990846e-06, + "loss": 4.6759, + "step": 9387 + }, + { + "epoch": 0.9053037608486018, + "grad_norm": 1.2443387508392334, + "learning_rate": 1.1003916732668567e-06, + "loss": 4.8933, + "step": 9388 + }, + { + "epoch": 0.9054001928640308, + "grad_norm": 1.1083340644836426, + "learning_rate": 1.0981704968112655e-06, + "loss": 4.9861, + "step": 9389 + }, + { + "epoch": 0.90549662487946, + "grad_norm": 1.8336375951766968, + "learning_rate": 1.0959515140361698e-06, + "loss": 5.0253, + "step": 9390 + }, + { + "epoch": 0.9055930568948891, + "grad_norm": 1.3989577293395996, + "learning_rate": 1.0937347251452207e-06, + "loss": 4.8516, + "step": 9391 + }, + { + "epoch": 0.9056894889103182, + "grad_norm": 2.018498182296753, + "learning_rate": 1.0915201303418748e-06, + "loss": 5.3027, + "step": 9392 + }, + { + "epoch": 0.9057859209257474, + "grad_norm": 1.2629802227020264, + "learning_rate": 1.0893077298293891e-06, + "loss": 5.0228, + "step": 9393 + }, + { + "epoch": 0.9058823529411765, + "grad_norm": 1.8867415189743042, + "learning_rate": 1.0870975238108038e-06, + "loss": 5.1909, + "step": 9394 + }, + { + "epoch": 0.9059787849566056, + "grad_norm": 1.3067058324813843, + "learning_rate": 1.0848895124889818e-06, + "loss": 4.6923, + "step": 9395 + }, + { + "epoch": 0.9060752169720347, + "grad_norm": 1.6754573583602905, + "learning_rate": 1.0826836960665609e-06, + "loss": 4.485, + "step": 9396 + }, + { + "epoch": 0.9061716489874638, + "grad_norm": 2.26078200340271, + "learning_rate": 1.080480074745996e-06, + "loss": 5.0689, + "step": 9397 + }, + { + "epoch": 0.9062680810028929, + "grad_norm": 1.5070548057556152, + "learning_rate": 1.0782786487295276e-06, + "loss": 4.9053, + "step": 9398 + }, + { + "epoch": 0.9063645130183221, + "grad_norm": 1.1123216152191162, + "learning_rate": 1.0760794182192058e-06, + "loss": 4.9831, + "step": 9399 + }, + { + "epoch": 0.9064609450337512, + "grad_norm": 1.3938753604888916, + "learning_rate": 1.0738823834168632e-06, + "loss": 5.0729, + "step": 9400 + }, + { + "epoch": 0.9065573770491804, + "grad_norm": 1.504392385482788, + "learning_rate": 1.0716875445241554e-06, + "loss": 5.0874, + "step": 9401 + }, + { + "epoch": 0.9066538090646095, + "grad_norm": 1.3863041400909424, + "learning_rate": 1.0694949017425043e-06, + "loss": 4.7707, + "step": 9402 + }, + { + "epoch": 0.9067502410800385, + "grad_norm": 1.2487834692001343, + "learning_rate": 1.067304455273166e-06, + "loss": 4.9051, + "step": 9403 + }, + { + "epoch": 0.9068466730954677, + "grad_norm": 1.5236520767211914, + "learning_rate": 1.0651162053171626e-06, + "loss": 5.146, + "step": 9404 + }, + { + "epoch": 0.9069431051108968, + "grad_norm": 1.2013943195343018, + "learning_rate": 1.0629301520753365e-06, + "loss": 4.8655, + "step": 9405 + }, + { + "epoch": 0.9070395371263259, + "grad_norm": 1.1712003946304321, + "learning_rate": 1.0607462957483216e-06, + "loss": 4.9707, + "step": 9406 + }, + { + "epoch": 0.9071359691417551, + "grad_norm": 1.2594482898712158, + "learning_rate": 1.0585646365365464e-06, + "loss": 5.0729, + "step": 9407 + }, + { + "epoch": 0.9072324011571842, + "grad_norm": 1.6446044445037842, + "learning_rate": 1.056385174640237e-06, + "loss": 4.8326, + "step": 9408 + }, + { + "epoch": 0.9073288331726133, + "grad_norm": 1.1920901536941528, + "learning_rate": 1.0542079102594332e-06, + "loss": 5.0121, + "step": 9409 + }, + { + "epoch": 0.9074252651880425, + "grad_norm": 1.947411060333252, + "learning_rate": 1.0520328435939531e-06, + "loss": 5.1272, + "step": 9410 + }, + { + "epoch": 0.9075216972034715, + "grad_norm": 1.191989779472351, + "learning_rate": 1.0498599748434229e-06, + "loss": 5.1037, + "step": 9411 + }, + { + "epoch": 0.9076181292189007, + "grad_norm": 1.250115156173706, + "learning_rate": 1.0476893042072638e-06, + "loss": 4.9474, + "step": 9412 + }, + { + "epoch": 0.9077145612343298, + "grad_norm": 2.453993320465088, + "learning_rate": 1.0455208318847027e-06, + "loss": 4.982, + "step": 9413 + }, + { + "epoch": 0.9078109932497589, + "grad_norm": 1.2101140022277832, + "learning_rate": 1.043354558074755e-06, + "loss": 5.0436, + "step": 9414 + }, + { + "epoch": 0.9079074252651881, + "grad_norm": 1.306239366531372, + "learning_rate": 1.0411904829762399e-06, + "loss": 4.8598, + "step": 9415 + }, + { + "epoch": 0.9080038572806172, + "grad_norm": 2.010816812515259, + "learning_rate": 1.039028606787773e-06, + "loss": 4.7013, + "step": 9416 + }, + { + "epoch": 0.9081002892960462, + "grad_norm": 1.2831192016601562, + "learning_rate": 1.036868929707771e-06, + "loss": 4.9118, + "step": 9417 + }, + { + "epoch": 0.9081967213114754, + "grad_norm": 1.5060203075408936, + "learning_rate": 1.0347114519344475e-06, + "loss": 4.9471, + "step": 9418 + }, + { + "epoch": 0.9082931533269045, + "grad_norm": 1.2892889976501465, + "learning_rate": 1.0325561736658052e-06, + "loss": 5.3132, + "step": 9419 + }, + { + "epoch": 0.9083895853423336, + "grad_norm": 1.8258072137832642, + "learning_rate": 1.0304030950996634e-06, + "loss": 5.117, + "step": 9420 + }, + { + "epoch": 0.9084860173577628, + "grad_norm": 2.744342088699341, + "learning_rate": 1.02825221643362e-06, + "loss": 5.4949, + "step": 9421 + }, + { + "epoch": 0.9085824493731919, + "grad_norm": 2.399951696395874, + "learning_rate": 1.0261035378650862e-06, + "loss": 5.4657, + "step": 9422 + }, + { + "epoch": 0.9086788813886211, + "grad_norm": 1.6543097496032715, + "learning_rate": 1.0239570595912602e-06, + "loss": 4.903, + "step": 9423 + }, + { + "epoch": 0.9087753134040502, + "grad_norm": 1.4403131008148193, + "learning_rate": 1.0218127818091477e-06, + "loss": 5.2998, + "step": 9424 + }, + { + "epoch": 0.9088717454194792, + "grad_norm": 1.375948190689087, + "learning_rate": 1.0196707047155446e-06, + "loss": 5.1134, + "step": 9425 + }, + { + "epoch": 0.9089681774349084, + "grad_norm": 1.528684139251709, + "learning_rate": 1.0175308285070517e-06, + "loss": 4.8152, + "step": 9426 + }, + { + "epoch": 0.9090646094503375, + "grad_norm": 1.8966940641403198, + "learning_rate": 1.0153931533800564e-06, + "loss": 4.8837, + "step": 9427 + }, + { + "epoch": 0.9091610414657666, + "grad_norm": 2.3610680103302, + "learning_rate": 1.013257679530763e-06, + "loss": 4.8457, + "step": 9428 + }, + { + "epoch": 0.9092574734811958, + "grad_norm": 2.3926610946655273, + "learning_rate": 1.0111244071551534e-06, + "loss": 4.8197, + "step": 9429 + }, + { + "epoch": 0.9093539054966249, + "grad_norm": 1.460407018661499, + "learning_rate": 1.0089933364490185e-06, + "loss": 4.8061, + "step": 9430 + }, + { + "epoch": 0.909450337512054, + "grad_norm": 1.1525193452835083, + "learning_rate": 1.0068644676079513e-06, + "loss": 4.9631, + "step": 9431 + }, + { + "epoch": 0.9095467695274831, + "grad_norm": 1.0835967063903809, + "learning_rate": 1.0047378008273295e-06, + "loss": 4.8337, + "step": 9432 + }, + { + "epoch": 0.9096432015429122, + "grad_norm": 1.9978106021881104, + "learning_rate": 1.002613336302341e-06, + "loss": 4.8595, + "step": 9433 + }, + { + "epoch": 0.9097396335583414, + "grad_norm": 1.9606380462646484, + "learning_rate": 1.0004910742279633e-06, + "loss": 4.7997, + "step": 9434 + }, + { + "epoch": 0.9098360655737705, + "grad_norm": 1.0746546983718872, + "learning_rate": 9.98371014798974e-07, + "loss": 4.9411, + "step": 9435 + }, + { + "epoch": 0.9099324975891996, + "grad_norm": 1.3168883323669434, + "learning_rate": 9.96253158209956e-07, + "loss": 5.0536, + "step": 9436 + }, + { + "epoch": 0.9100289296046288, + "grad_norm": 1.182298183441162, + "learning_rate": 9.941375046552791e-07, + "loss": 4.8689, + "step": 9437 + }, + { + "epoch": 0.9101253616200579, + "grad_norm": 1.3579990863800049, + "learning_rate": 9.920240543291104e-07, + "loss": 5.2462, + "step": 9438 + }, + { + "epoch": 0.9102217936354869, + "grad_norm": 1.6223244667053223, + "learning_rate": 9.899128074254333e-07, + "loss": 5.2705, + "step": 9439 + }, + { + "epoch": 0.9103182256509161, + "grad_norm": 2.1202785968780518, + "learning_rate": 9.878037641380011e-07, + "loss": 5.5692, + "step": 9440 + }, + { + "epoch": 0.9104146576663452, + "grad_norm": 1.6555277109146118, + "learning_rate": 9.85696924660387e-07, + "loss": 4.7617, + "step": 9441 + }, + { + "epoch": 0.9105110896817743, + "grad_norm": 1.2717816829681396, + "learning_rate": 9.835922891859528e-07, + "loss": 4.8132, + "step": 9442 + }, + { + "epoch": 0.9106075216972035, + "grad_norm": 1.6401565074920654, + "learning_rate": 9.814898579078607e-07, + "loss": 5.0012, + "step": 9443 + }, + { + "epoch": 0.9107039537126326, + "grad_norm": 1.9078588485717773, + "learning_rate": 9.793896310190592e-07, + "loss": 4.7038, + "step": 9444 + }, + { + "epoch": 0.9108003857280618, + "grad_norm": 1.9597986936569214, + "learning_rate": 9.772916087123219e-07, + "loss": 4.6967, + "step": 9445 + }, + { + "epoch": 0.9108968177434908, + "grad_norm": 1.4029109477996826, + "learning_rate": 9.75195791180189e-07, + "loss": 5.3566, + "step": 9446 + }, + { + "epoch": 0.9109932497589199, + "grad_norm": 1.239281415939331, + "learning_rate": 9.73102178615018e-07, + "loss": 5.0161, + "step": 9447 + }, + { + "epoch": 0.9110896817743491, + "grad_norm": 1.4550964832305908, + "learning_rate": 9.710107712089578e-07, + "loss": 5.0687, + "step": 9448 + }, + { + "epoch": 0.9111861137897782, + "grad_norm": 1.57069730758667, + "learning_rate": 9.689215691539522e-07, + "loss": 4.978, + "step": 9449 + }, + { + "epoch": 0.9112825458052073, + "grad_norm": 1.2587311267852783, + "learning_rate": 9.668345726417478e-07, + "loss": 5.0524, + "step": 9450 + }, + { + "epoch": 0.9113789778206365, + "grad_norm": 1.2492027282714844, + "learning_rate": 9.647497818638884e-07, + "loss": 5.2171, + "step": 9451 + }, + { + "epoch": 0.9114754098360656, + "grad_norm": 1.56025230884552, + "learning_rate": 9.626671970117073e-07, + "loss": 4.9442, + "step": 9452 + }, + { + "epoch": 0.9115718418514946, + "grad_norm": 1.438751459121704, + "learning_rate": 9.60586818276349e-07, + "loss": 4.8042, + "step": 9453 + }, + { + "epoch": 0.9116682738669238, + "grad_norm": 1.6078579425811768, + "learning_rate": 9.585086458487464e-07, + "loss": 4.8033, + "step": 9454 + }, + { + "epoch": 0.9117647058823529, + "grad_norm": 1.2144399881362915, + "learning_rate": 9.564326799196276e-07, + "loss": 4.9581, + "step": 9455 + }, + { + "epoch": 0.9118611378977821, + "grad_norm": 1.4637975692749023, + "learning_rate": 9.54358920679524e-07, + "loss": 4.7184, + "step": 9456 + }, + { + "epoch": 0.9119575699132112, + "grad_norm": 1.187421441078186, + "learning_rate": 9.522873683187633e-07, + "loss": 4.7729, + "step": 9457 + }, + { + "epoch": 0.9120540019286403, + "grad_norm": 1.361188530921936, + "learning_rate": 9.502180230274716e-07, + "loss": 5.1074, + "step": 9458 + }, + { + "epoch": 0.9121504339440695, + "grad_norm": 1.919714093208313, + "learning_rate": 9.48150884995569e-07, + "loss": 5.173, + "step": 9459 + }, + { + "epoch": 0.9122468659594986, + "grad_norm": 2.2194814682006836, + "learning_rate": 9.460859544127731e-07, + "loss": 5.2155, + "step": 9460 + }, + { + "epoch": 0.9123432979749276, + "grad_norm": 1.5262560844421387, + "learning_rate": 9.440232314686049e-07, + "loss": 5.3064, + "step": 9461 + }, + { + "epoch": 0.9124397299903568, + "grad_norm": 1.5049433708190918, + "learning_rate": 9.419627163523792e-07, + "loss": 5.2833, + "step": 9462 + }, + { + "epoch": 0.9125361620057859, + "grad_norm": 1.445968747138977, + "learning_rate": 9.399044092531977e-07, + "loss": 5.2658, + "step": 9463 + }, + { + "epoch": 0.912632594021215, + "grad_norm": 1.2124087810516357, + "learning_rate": 9.378483103599817e-07, + "loss": 5.0249, + "step": 9464 + }, + { + "epoch": 0.9127290260366442, + "grad_norm": 1.4352644681930542, + "learning_rate": 9.357944198614299e-07, + "loss": 5.5445, + "step": 9465 + }, + { + "epoch": 0.9128254580520733, + "grad_norm": 1.4461787939071655, + "learning_rate": 9.337427379460473e-07, + "loss": 5.4384, + "step": 9466 + }, + { + "epoch": 0.9129218900675025, + "grad_norm": 1.2726516723632812, + "learning_rate": 9.316932648021386e-07, + "loss": 5.2694, + "step": 9467 + }, + { + "epoch": 0.9130183220829315, + "grad_norm": 1.1357694864273071, + "learning_rate": 9.296460006177954e-07, + "loss": 5.1938, + "step": 9468 + }, + { + "epoch": 0.9131147540983606, + "grad_norm": 1.4200830459594727, + "learning_rate": 9.276009455809171e-07, + "loss": 5.1086, + "step": 9469 + }, + { + "epoch": 0.9132111861137898, + "grad_norm": 1.2288333177566528, + "learning_rate": 9.255580998792007e-07, + "loss": 5.1374, + "step": 9470 + }, + { + "epoch": 0.9133076181292189, + "grad_norm": 2.0148873329162598, + "learning_rate": 9.235174637001215e-07, + "loss": 5.0446, + "step": 9471 + }, + { + "epoch": 0.913404050144648, + "grad_norm": 1.361180067062378, + "learning_rate": 9.21479037230985e-07, + "loss": 5.1377, + "step": 9472 + }, + { + "epoch": 0.9135004821600772, + "grad_norm": 1.513119101524353, + "learning_rate": 9.19442820658864e-07, + "loss": 5.321, + "step": 9473 + }, + { + "epoch": 0.9135969141755063, + "grad_norm": 1.358230471611023, + "learning_rate": 9.174088141706422e-07, + "loss": 5.2413, + "step": 9474 + }, + { + "epoch": 0.9136933461909353, + "grad_norm": 3.0236258506774902, + "learning_rate": 9.153770179529981e-07, + "loss": 5.2886, + "step": 9475 + }, + { + "epoch": 0.9137897782063645, + "grad_norm": 1.4005029201507568, + "learning_rate": 9.133474321924074e-07, + "loss": 5.357, + "step": 9476 + }, + { + "epoch": 0.9138862102217936, + "grad_norm": 1.417428970336914, + "learning_rate": 9.113200570751434e-07, + "loss": 5.3778, + "step": 9477 + }, + { + "epoch": 0.9139826422372228, + "grad_norm": 1.5636577606201172, + "learning_rate": 9.092948927872769e-07, + "loss": 5.38, + "step": 9478 + }, + { + "epoch": 0.9140790742526519, + "grad_norm": 1.5150421857833862, + "learning_rate": 9.072719395146728e-07, + "loss": 5.1539, + "step": 9479 + }, + { + "epoch": 0.914175506268081, + "grad_norm": 1.6083428859710693, + "learning_rate": 9.052511974429994e-07, + "loss": 5.2546, + "step": 9480 + }, + { + "epoch": 0.9142719382835102, + "grad_norm": 1.466148853302002, + "learning_rate": 9.032326667577168e-07, + "loss": 5.347, + "step": 9481 + }, + { + "epoch": 0.9143683702989392, + "grad_norm": 2.007742166519165, + "learning_rate": 9.012163476440739e-07, + "loss": 5.2049, + "step": 9482 + }, + { + "epoch": 0.9144648023143683, + "grad_norm": 1.514154076576233, + "learning_rate": 8.992022402871419e-07, + "loss": 5.2495, + "step": 9483 + }, + { + "epoch": 0.9145612343297975, + "grad_norm": 1.0964233875274658, + "learning_rate": 8.971903448717623e-07, + "loss": 5.5074, + "step": 9484 + }, + { + "epoch": 0.9146576663452266, + "grad_norm": 1.4979835748672485, + "learning_rate": 8.95180661582587e-07, + "loss": 5.0614, + "step": 9485 + }, + { + "epoch": 0.9147540983606557, + "grad_norm": 1.181242823600769, + "learning_rate": 8.931731906040631e-07, + "loss": 5.281, + "step": 9486 + }, + { + "epoch": 0.9148505303760849, + "grad_norm": 1.5152631998062134, + "learning_rate": 8.911679321204347e-07, + "loss": 5.2306, + "step": 9487 + }, + { + "epoch": 0.914946962391514, + "grad_norm": 1.4129780530929565, + "learning_rate": 8.891648863157353e-07, + "loss": 5.1994, + "step": 9488 + }, + { + "epoch": 0.9150433944069432, + "grad_norm": 1.3875346183776855, + "learning_rate": 8.871640533738124e-07, + "loss": 5.342, + "step": 9489 + }, + { + "epoch": 0.9151398264223722, + "grad_norm": 2.293142080307007, + "learning_rate": 8.851654334782883e-07, + "loss": 5.4868, + "step": 9490 + }, + { + "epoch": 0.9152362584378013, + "grad_norm": 1.6843620538711548, + "learning_rate": 8.831690268126052e-07, + "loss": 5.0852, + "step": 9491 + }, + { + "epoch": 0.9153326904532305, + "grad_norm": 1.7650736570358276, + "learning_rate": 8.811748335599862e-07, + "loss": 5.3766, + "step": 9492 + }, + { + "epoch": 0.9154291224686596, + "grad_norm": 1.3668419122695923, + "learning_rate": 8.791828539034513e-07, + "loss": 5.2717, + "step": 9493 + }, + { + "epoch": 0.9155255544840887, + "grad_norm": 1.3553602695465088, + "learning_rate": 8.771930880258295e-07, + "loss": 5.3207, + "step": 9494 + }, + { + "epoch": 0.9156219864995179, + "grad_norm": 1.5840458869934082, + "learning_rate": 8.752055361097355e-07, + "loss": 5.1647, + "step": 9495 + }, + { + "epoch": 0.915718418514947, + "grad_norm": 1.565151333808899, + "learning_rate": 8.732201983375821e-07, + "loss": 5.053, + "step": 9496 + }, + { + "epoch": 0.915814850530376, + "grad_norm": 1.3129600286483765, + "learning_rate": 8.712370748915871e-07, + "loss": 5.3078, + "step": 9497 + }, + { + "epoch": 0.9159112825458052, + "grad_norm": 1.31063973903656, + "learning_rate": 8.692561659537552e-07, + "loss": 5.533, + "step": 9498 + }, + { + "epoch": 0.9160077145612343, + "grad_norm": 2.1029603481292725, + "learning_rate": 8.672774717058879e-07, + "loss": 5.4085, + "step": 9499 + }, + { + "epoch": 0.9161041465766635, + "grad_norm": 2.090866804122925, + "learning_rate": 8.653009923295985e-07, + "loss": 5.0231, + "step": 9500 + }, + { + "epoch": 0.9162005785920926, + "grad_norm": 1.7783583402633667, + "learning_rate": 8.633267280062723e-07, + "loss": 5.4058, + "step": 9501 + }, + { + "epoch": 0.9162970106075217, + "grad_norm": 1.2073822021484375, + "learning_rate": 8.613546789171173e-07, + "loss": 5.3291, + "step": 9502 + }, + { + "epoch": 0.9163934426229509, + "grad_norm": 1.765831708908081, + "learning_rate": 8.593848452431163e-07, + "loss": 5.3391, + "step": 9503 + }, + { + "epoch": 0.9164898746383799, + "grad_norm": 1.299017310142517, + "learning_rate": 8.574172271650638e-07, + "loss": 5.33, + "step": 9504 + }, + { + "epoch": 0.916586306653809, + "grad_norm": 1.2638050317764282, + "learning_rate": 8.554518248635429e-07, + "loss": 5.0001, + "step": 9505 + }, + { + "epoch": 0.9166827386692382, + "grad_norm": 1.0932031869888306, + "learning_rate": 8.5348863851894e-07, + "loss": 5.1864, + "step": 9506 + }, + { + "epoch": 0.9167791706846673, + "grad_norm": 1.315165638923645, + "learning_rate": 8.51527668311422e-07, + "loss": 5.2744, + "step": 9507 + }, + { + "epoch": 0.9168756027000964, + "grad_norm": 2.3989856243133545, + "learning_rate": 8.495689144209812e-07, + "loss": 5.244, + "step": 9508 + }, + { + "epoch": 0.9169720347155256, + "grad_norm": 1.4600982666015625, + "learning_rate": 8.476123770273764e-07, + "loss": 5.1366, + "step": 9509 + }, + { + "epoch": 0.9170684667309547, + "grad_norm": 1.4453085660934448, + "learning_rate": 8.456580563101807e-07, + "loss": 5.1697, + "step": 9510 + }, + { + "epoch": 0.9171648987463838, + "grad_norm": 1.130059838294983, + "learning_rate": 8.437059524487617e-07, + "loss": 5.3531, + "step": 9511 + }, + { + "epoch": 0.9172613307618129, + "grad_norm": 1.0820258855819702, + "learning_rate": 8.417560656222762e-07, + "loss": 5.3078, + "step": 9512 + }, + { + "epoch": 0.917357762777242, + "grad_norm": 1.5151493549346924, + "learning_rate": 8.398083960096864e-07, + "loss": 5.2904, + "step": 9513 + }, + { + "epoch": 0.9174541947926712, + "grad_norm": 1.3133749961853027, + "learning_rate": 8.378629437897495e-07, + "loss": 5.6274, + "step": 9514 + }, + { + "epoch": 0.9175506268081003, + "grad_norm": 1.071842908859253, + "learning_rate": 8.359197091410059e-07, + "loss": 5.2766, + "step": 9515 + }, + { + "epoch": 0.9176470588235294, + "grad_norm": 2.4015719890594482, + "learning_rate": 8.339786922418158e-07, + "loss": 5.1192, + "step": 9516 + }, + { + "epoch": 0.9177434908389586, + "grad_norm": 1.3450422286987305, + "learning_rate": 8.320398932703144e-07, + "loss": 5.5077, + "step": 9517 + }, + { + "epoch": 0.9178399228543876, + "grad_norm": 1.503069519996643, + "learning_rate": 8.301033124044482e-07, + "loss": 5.041, + "step": 9518 + }, + { + "epoch": 0.9179363548698167, + "grad_norm": 1.1484239101409912, + "learning_rate": 8.281689498219503e-07, + "loss": 5.2876, + "step": 9519 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 1.635089635848999, + "learning_rate": 8.262368057003533e-07, + "loss": 5.4702, + "step": 9520 + }, + { + "epoch": 0.918129218900675, + "grad_norm": 1.2992643117904663, + "learning_rate": 8.243068802169906e-07, + "loss": 5.4621, + "step": 9521 + }, + { + "epoch": 0.9182256509161042, + "grad_norm": 1.2698355913162231, + "learning_rate": 8.2237917354899e-07, + "loss": 5.2873, + "step": 9522 + }, + { + "epoch": 0.9183220829315333, + "grad_norm": 1.799552321434021, + "learning_rate": 8.204536858732681e-07, + "loss": 5.2667, + "step": 9523 + }, + { + "epoch": 0.9184185149469624, + "grad_norm": 2.329568386077881, + "learning_rate": 8.185304173665448e-07, + "loss": 5.297, + "step": 9524 + }, + { + "epoch": 0.9185149469623916, + "grad_norm": 1.4802980422973633, + "learning_rate": 8.166093682053427e-07, + "loss": 5.3762, + "step": 9525 + }, + { + "epoch": 0.9186113789778206, + "grad_norm": 1.4037561416625977, + "learning_rate": 8.146905385659625e-07, + "loss": 5.4287, + "step": 9526 + }, + { + "epoch": 0.9187078109932497, + "grad_norm": 1.0877048969268799, + "learning_rate": 8.127739286245217e-07, + "loss": 5.4829, + "step": 9527 + }, + { + "epoch": 0.9188042430086789, + "grad_norm": 1.4215543270111084, + "learning_rate": 8.108595385569185e-07, + "loss": 5.5739, + "step": 9528 + }, + { + "epoch": 0.918900675024108, + "grad_norm": 1.1285585165023804, + "learning_rate": 8.089473685388565e-07, + "loss": 5.4562, + "step": 9529 + }, + { + "epoch": 0.9189971070395371, + "grad_norm": 2.819661855697632, + "learning_rate": 8.070374187458262e-07, + "loss": 5.4196, + "step": 9530 + }, + { + "epoch": 0.9190935390549663, + "grad_norm": 2.905761241912842, + "learning_rate": 8.051296893531318e-07, + "loss": 5.4682, + "step": 9531 + }, + { + "epoch": 0.9191899710703954, + "grad_norm": 1.6261086463928223, + "learning_rate": 8.032241805358471e-07, + "loss": 5.3087, + "step": 9532 + }, + { + "epoch": 0.9192864030858245, + "grad_norm": 1.5906676054000854, + "learning_rate": 8.013208924688736e-07, + "loss": 5.2239, + "step": 9533 + }, + { + "epoch": 0.9193828351012536, + "grad_norm": 1.1797351837158203, + "learning_rate": 7.994198253268776e-07, + "loss": 5.2834, + "step": 9534 + }, + { + "epoch": 0.9194792671166827, + "grad_norm": 1.283790946006775, + "learning_rate": 7.975209792843497e-07, + "loss": 5.3045, + "step": 9535 + }, + { + "epoch": 0.9195756991321119, + "grad_norm": 1.460800290107727, + "learning_rate": 7.956243545155561e-07, + "loss": 5.5078, + "step": 9536 + }, + { + "epoch": 0.919672131147541, + "grad_norm": 1.1989047527313232, + "learning_rate": 7.937299511945661e-07, + "loss": 5.455, + "step": 9537 + }, + { + "epoch": 0.9197685631629701, + "grad_norm": 1.2472271919250488, + "learning_rate": 7.918377694952517e-07, + "loss": 5.4606, + "step": 9538 + }, + { + "epoch": 0.9198649951783993, + "grad_norm": 1.2226046323776245, + "learning_rate": 7.899478095912682e-07, + "loss": 5.13, + "step": 9539 + }, + { + "epoch": 0.9199614271938283, + "grad_norm": 1.3110651969909668, + "learning_rate": 7.880600716560799e-07, + "loss": 5.3187, + "step": 9540 + }, + { + "epoch": 0.9200578592092574, + "grad_norm": 1.9691565036773682, + "learning_rate": 7.861745558629368e-07, + "loss": 5.0236, + "step": 9541 + }, + { + "epoch": 0.9201542912246866, + "grad_norm": 0.9840309619903564, + "learning_rate": 7.842912623848925e-07, + "loss": 5.3146, + "step": 9542 + }, + { + "epoch": 0.9202507232401157, + "grad_norm": 1.3791292905807495, + "learning_rate": 7.824101913947862e-07, + "loss": 5.2548, + "step": 9543 + }, + { + "epoch": 0.9203471552555449, + "grad_norm": 1.234338402748108, + "learning_rate": 7.805313430652717e-07, + "loss": 5.1388, + "step": 9544 + }, + { + "epoch": 0.920443587270974, + "grad_norm": 2.058500051498413, + "learning_rate": 7.786547175687747e-07, + "loss": 5.5252, + "step": 9545 + }, + { + "epoch": 0.920540019286403, + "grad_norm": 1.0630451440811157, + "learning_rate": 7.767803150775438e-07, + "loss": 5.0994, + "step": 9546 + }, + { + "epoch": 0.9206364513018322, + "grad_norm": 1.3680994510650635, + "learning_rate": 7.749081357635968e-07, + "loss": 5.5087, + "step": 9547 + }, + { + "epoch": 0.9207328833172613, + "grad_norm": 1.1200231313705444, + "learning_rate": 7.730381797987657e-07, + "loss": 5.3127, + "step": 9548 + }, + { + "epoch": 0.9208293153326904, + "grad_norm": 1.3059974908828735, + "learning_rate": 7.711704473546716e-07, + "loss": 5.3261, + "step": 9549 + }, + { + "epoch": 0.9209257473481196, + "grad_norm": 2.41706919670105, + "learning_rate": 7.693049386027356e-07, + "loss": 5.6611, + "step": 9550 + }, + { + "epoch": 0.9210221793635487, + "grad_norm": 2.400761842727661, + "learning_rate": 7.674416537141654e-07, + "loss": 5.5619, + "step": 9551 + }, + { + "epoch": 0.9211186113789778, + "grad_norm": 1.3180418014526367, + "learning_rate": 7.655805928599768e-07, + "loss": 5.4761, + "step": 9552 + }, + { + "epoch": 0.921215043394407, + "grad_norm": 1.2514814138412476, + "learning_rate": 7.63721756210975e-07, + "loss": 5.3907, + "step": 9553 + }, + { + "epoch": 0.921311475409836, + "grad_norm": 2.478695869445801, + "learning_rate": 7.618651439377567e-07, + "loss": 5.6264, + "step": 9554 + }, + { + "epoch": 0.9214079074252652, + "grad_norm": 1.4932106733322144, + "learning_rate": 7.600107562107245e-07, + "loss": 5.5037, + "step": 9555 + }, + { + "epoch": 0.9215043394406943, + "grad_norm": 1.7173165082931519, + "learning_rate": 7.581585932000729e-07, + "loss": 5.387, + "step": 9556 + }, + { + "epoch": 0.9216007714561234, + "grad_norm": 1.5988471508026123, + "learning_rate": 7.563086550757853e-07, + "loss": 5.2809, + "step": 9557 + }, + { + "epoch": 0.9216972034715526, + "grad_norm": 1.3674415349960327, + "learning_rate": 7.54460942007651e-07, + "loss": 5.3818, + "step": 9558 + }, + { + "epoch": 0.9217936354869817, + "grad_norm": 1.176294207572937, + "learning_rate": 7.526154541652508e-07, + "loss": 5.025, + "step": 9559 + }, + { + "epoch": 0.9218900675024108, + "grad_norm": 1.6597774028778076, + "learning_rate": 7.507721917179633e-07, + "loss": 5.2434, + "step": 9560 + }, + { + "epoch": 0.92198649951784, + "grad_norm": 1.5121930837631226, + "learning_rate": 7.489311548349559e-07, + "loss": 5.3525, + "step": 9561 + }, + { + "epoch": 0.922082931533269, + "grad_norm": 1.6575151681900024, + "learning_rate": 7.47092343685199e-07, + "loss": 5.1039, + "step": 9562 + }, + { + "epoch": 0.9221793635486981, + "grad_norm": 1.4529414176940918, + "learning_rate": 7.452557584374547e-07, + "loss": 5.3537, + "step": 9563 + }, + { + "epoch": 0.9222757955641273, + "grad_norm": 1.3335374593734741, + "learning_rate": 7.434213992602857e-07, + "loss": 5.4763, + "step": 9564 + }, + { + "epoch": 0.9223722275795564, + "grad_norm": 1.1428513526916504, + "learning_rate": 7.415892663220458e-07, + "loss": 5.344, + "step": 9565 + }, + { + "epoch": 0.9224686595949856, + "grad_norm": 1.5185375213623047, + "learning_rate": 7.397593597908841e-07, + "loss": 5.2204, + "step": 9566 + }, + { + "epoch": 0.9225650916104147, + "grad_norm": 1.5371172428131104, + "learning_rate": 7.379316798347497e-07, + "loss": 5.0286, + "step": 9567 + }, + { + "epoch": 0.9226615236258437, + "grad_norm": 1.1939537525177002, + "learning_rate": 7.361062266213859e-07, + "loss": 5.5101, + "step": 9568 + }, + { + "epoch": 0.9227579556412729, + "grad_norm": 1.3420183658599854, + "learning_rate": 7.342830003183282e-07, + "loss": 5.4212, + "step": 9569 + }, + { + "epoch": 0.922854387656702, + "grad_norm": 1.204100251197815, + "learning_rate": 7.324620010929068e-07, + "loss": 5.2646, + "step": 9570 + }, + { + "epoch": 0.9229508196721311, + "grad_norm": 1.2045685052871704, + "learning_rate": 7.3064322911226e-07, + "loss": 5.2987, + "step": 9571 + }, + { + "epoch": 0.9230472516875603, + "grad_norm": 1.7708675861358643, + "learning_rate": 7.288266845433017e-07, + "loss": 5.3439, + "step": 9572 + }, + { + "epoch": 0.9231436837029894, + "grad_norm": 1.2901525497436523, + "learning_rate": 7.270123675527596e-07, + "loss": 5.291, + "step": 9573 + }, + { + "epoch": 0.9232401157184185, + "grad_norm": 3.349365472793579, + "learning_rate": 7.252002783071476e-07, + "loss": 4.9028, + "step": 9574 + }, + { + "epoch": 0.9233365477338477, + "grad_norm": 3.313599109649658, + "learning_rate": 7.233904169727773e-07, + "loss": 4.8704, + "step": 9575 + }, + { + "epoch": 0.9234329797492767, + "grad_norm": 3.264676809310913, + "learning_rate": 7.215827837157519e-07, + "loss": 4.8607, + "step": 9576 + }, + { + "epoch": 0.9235294117647059, + "grad_norm": 1.434802770614624, + "learning_rate": 7.197773787019801e-07, + "loss": 5.3455, + "step": 9577 + }, + { + "epoch": 0.923625843780135, + "grad_norm": 1.1764955520629883, + "learning_rate": 7.179742020971519e-07, + "loss": 5.3548, + "step": 9578 + }, + { + "epoch": 0.9237222757955641, + "grad_norm": 1.928062081336975, + "learning_rate": 7.16173254066771e-07, + "loss": 5.3981, + "step": 9579 + }, + { + "epoch": 0.9238187078109933, + "grad_norm": 1.6658469438552856, + "learning_rate": 7.143745347761189e-07, + "loss": 5.2325, + "step": 9580 + }, + { + "epoch": 0.9239151398264224, + "grad_norm": 1.549422025680542, + "learning_rate": 7.125780443902775e-07, + "loss": 5.3217, + "step": 9581 + }, + { + "epoch": 0.9240115718418515, + "grad_norm": 1.426416277885437, + "learning_rate": 7.107837830741343e-07, + "loss": 5.3145, + "step": 9582 + }, + { + "epoch": 0.9241080038572806, + "grad_norm": 1.526237964630127, + "learning_rate": 7.089917509923577e-07, + "loss": 5.3774, + "step": 9583 + }, + { + "epoch": 0.9242044358727097, + "grad_norm": 2.7180862426757812, + "learning_rate": 7.072019483094216e-07, + "loss": 5.4153, + "step": 9584 + }, + { + "epoch": 0.9243008678881388, + "grad_norm": 2.031606435775757, + "learning_rate": 7.054143751895919e-07, + "loss": 5.4047, + "step": 9585 + }, + { + "epoch": 0.924397299903568, + "grad_norm": 1.9767435789108276, + "learning_rate": 7.03629031796929e-07, + "loss": 5.284, + "step": 9586 + }, + { + "epoch": 0.9244937319189971, + "grad_norm": 1.1660971641540527, + "learning_rate": 7.01845918295288e-07, + "loss": 5.313, + "step": 9587 + }, + { + "epoch": 0.9245901639344263, + "grad_norm": 1.4904214143753052, + "learning_rate": 7.00065034848324e-07, + "loss": 5.1899, + "step": 9588 + }, + { + "epoch": 0.9246865959498554, + "grad_norm": 1.2965515851974487, + "learning_rate": 6.982863816194785e-07, + "loss": 5.0949, + "step": 9589 + }, + { + "epoch": 0.9247830279652844, + "grad_norm": 2.165149211883545, + "learning_rate": 6.965099587720069e-07, + "loss": 5.0994, + "step": 9590 + }, + { + "epoch": 0.9248794599807136, + "grad_norm": 2.4815940856933594, + "learning_rate": 6.947357664689319e-07, + "loss": 5.1072, + "step": 9591 + }, + { + "epoch": 0.9249758919961427, + "grad_norm": 2.334987163543701, + "learning_rate": 6.929638048730952e-07, + "loss": 5.1511, + "step": 9592 + }, + { + "epoch": 0.9250723240115718, + "grad_norm": 2.2193491458892822, + "learning_rate": 6.911940741471224e-07, + "loss": 5.0742, + "step": 9593 + }, + { + "epoch": 0.925168756027001, + "grad_norm": 1.4809024333953857, + "learning_rate": 6.894265744534417e-07, + "loss": 5.1192, + "step": 9594 + }, + { + "epoch": 0.9252651880424301, + "grad_norm": 1.7548702955245972, + "learning_rate": 6.876613059542625e-07, + "loss": 5.0383, + "step": 9595 + }, + { + "epoch": 0.9253616200578592, + "grad_norm": 1.4958558082580566, + "learning_rate": 6.858982688116106e-07, + "loss": 5.5044, + "step": 9596 + }, + { + "epoch": 0.9254580520732884, + "grad_norm": 1.9475674629211426, + "learning_rate": 6.841374631872899e-07, + "loss": 5.7042, + "step": 9597 + }, + { + "epoch": 0.9255544840887174, + "grad_norm": 1.4518109560012817, + "learning_rate": 6.823788892429017e-07, + "loss": 5.491, + "step": 9598 + }, + { + "epoch": 0.9256509161041466, + "grad_norm": 1.371726393699646, + "learning_rate": 6.806225471398503e-07, + "loss": 5.2602, + "step": 9599 + }, + { + "epoch": 0.9257473481195757, + "grad_norm": 1.62505304813385, + "learning_rate": 6.788684370393261e-07, + "loss": 5.2618, + "step": 9600 + }, + { + "epoch": 0.9258437801350048, + "grad_norm": 1.3587156534194946, + "learning_rate": 6.771165591023254e-07, + "loss": 5.5422, + "step": 9601 + }, + { + "epoch": 0.925940212150434, + "grad_norm": 1.4461596012115479, + "learning_rate": 6.753669134896306e-07, + "loss": 5.2817, + "step": 9602 + }, + { + "epoch": 0.9260366441658631, + "grad_norm": 1.601845145225525, + "learning_rate": 6.736195003618218e-07, + "loss": 5.4111, + "step": 9603 + }, + { + "epoch": 0.9261330761812921, + "grad_norm": 1.3570204973220825, + "learning_rate": 6.718743198792732e-07, + "loss": 5.2201, + "step": 9604 + }, + { + "epoch": 0.9262295081967213, + "grad_norm": 1.4883999824523926, + "learning_rate": 6.701313722021568e-07, + "loss": 5.1922, + "step": 9605 + }, + { + "epoch": 0.9263259402121504, + "grad_norm": 1.1183950901031494, + "learning_rate": 6.683906574904364e-07, + "loss": 5.3172, + "step": 9606 + }, + { + "epoch": 0.9264223722275795, + "grad_norm": 2.241623878479004, + "learning_rate": 6.666521759038786e-07, + "loss": 5.3263, + "step": 9607 + }, + { + "epoch": 0.9265188042430087, + "grad_norm": 1.7655338048934937, + "learning_rate": 6.649159276020306e-07, + "loss": 5.4721, + "step": 9608 + }, + { + "epoch": 0.9266152362584378, + "grad_norm": 1.155097484588623, + "learning_rate": 6.631819127442485e-07, + "loss": 5.3861, + "step": 9609 + }, + { + "epoch": 0.926711668273867, + "grad_norm": 1.3974686861038208, + "learning_rate": 6.61450131489677e-07, + "loss": 5.3109, + "step": 9610 + }, + { + "epoch": 0.926808100289296, + "grad_norm": 1.4994356632232666, + "learning_rate": 6.597205839972559e-07, + "loss": 5.4088, + "step": 9611 + }, + { + "epoch": 0.9269045323047251, + "grad_norm": 1.3426833152770996, + "learning_rate": 6.579932704257191e-07, + "loss": 5.4115, + "step": 9612 + }, + { + "epoch": 0.9270009643201543, + "grad_norm": 1.4352906942367554, + "learning_rate": 6.562681909336066e-07, + "loss": 5.3165, + "step": 9613 + }, + { + "epoch": 0.9270973963355834, + "grad_norm": 1.308326005935669, + "learning_rate": 6.545453456792305e-07, + "loss": 5.2636, + "step": 9614 + }, + { + "epoch": 0.9271938283510125, + "grad_norm": 1.3679488897323608, + "learning_rate": 6.528247348207228e-07, + "loss": 5.3628, + "step": 9615 + }, + { + "epoch": 0.9272902603664417, + "grad_norm": 1.0344715118408203, + "learning_rate": 6.511063585159904e-07, + "loss": 5.2893, + "step": 9616 + }, + { + "epoch": 0.9273866923818708, + "grad_norm": 1.2355118989944458, + "learning_rate": 6.493902169227517e-07, + "loss": 5.3798, + "step": 9617 + }, + { + "epoch": 0.9274831243972999, + "grad_norm": 1.1134966611862183, + "learning_rate": 6.476763101985057e-07, + "loss": 5.4872, + "step": 9618 + }, + { + "epoch": 0.927579556412729, + "grad_norm": 1.5705887079238892, + "learning_rate": 6.459646385005569e-07, + "loss": 5.0077, + "step": 9619 + }, + { + "epoch": 0.9276759884281581, + "grad_norm": 1.092681884765625, + "learning_rate": 6.442552019859937e-07, + "loss": 5.0939, + "step": 9620 + }, + { + "epoch": 0.9277724204435873, + "grad_norm": 1.2167854309082031, + "learning_rate": 6.425480008117185e-07, + "loss": 5.5084, + "step": 9621 + }, + { + "epoch": 0.9278688524590164, + "grad_norm": 2.205430746078491, + "learning_rate": 6.408430351344002e-07, + "loss": 5.6283, + "step": 9622 + }, + { + "epoch": 0.9279652844744455, + "grad_norm": 1.5503418445587158, + "learning_rate": 6.39140305110536e-07, + "loss": 5.4732, + "step": 9623 + }, + { + "epoch": 0.9280617164898747, + "grad_norm": 1.1679236888885498, + "learning_rate": 6.374398108963842e-07, + "loss": 5.4962, + "step": 9624 + }, + { + "epoch": 0.9281581485053038, + "grad_norm": 1.6063385009765625, + "learning_rate": 6.357415526480254e-07, + "loss": 5.3267, + "step": 9625 + }, + { + "epoch": 0.9282545805207328, + "grad_norm": 1.4155001640319824, + "learning_rate": 6.340455305213155e-07, + "loss": 5.0944, + "step": 9626 + }, + { + "epoch": 0.928351012536162, + "grad_norm": 1.2834877967834473, + "learning_rate": 6.323517446719219e-07, + "loss": 5.1621, + "step": 9627 + }, + { + "epoch": 0.9284474445515911, + "grad_norm": 1.3392624855041504, + "learning_rate": 6.306601952552892e-07, + "loss": 5.4657, + "step": 9628 + }, + { + "epoch": 0.9285438765670202, + "grad_norm": 1.3792588710784912, + "learning_rate": 6.289708824266715e-07, + "loss": 5.4404, + "step": 9629 + }, + { + "epoch": 0.9286403085824494, + "grad_norm": 1.050121545791626, + "learning_rate": 6.272838063411141e-07, + "loss": 5.3967, + "step": 9630 + }, + { + "epoch": 0.9287367405978785, + "grad_norm": 1.272210955619812, + "learning_rate": 6.255989671534429e-07, + "loss": 5.3915, + "step": 9631 + }, + { + "epoch": 0.9288331726133077, + "grad_norm": 1.245459794998169, + "learning_rate": 6.239163650183066e-07, + "loss": 5.5318, + "step": 9632 + }, + { + "epoch": 0.9289296046287367, + "grad_norm": 1.7265673875808716, + "learning_rate": 6.222360000901178e-07, + "loss": 5.3598, + "step": 9633 + }, + { + "epoch": 0.9290260366441658, + "grad_norm": 1.2314324378967285, + "learning_rate": 6.205578725231087e-07, + "loss": 5.5014, + "step": 9634 + }, + { + "epoch": 0.929122468659595, + "grad_norm": 1.8051453828811646, + "learning_rate": 6.188819824712922e-07, + "loss": 5.1336, + "step": 9635 + }, + { + "epoch": 0.9292189006750241, + "grad_norm": 1.0664281845092773, + "learning_rate": 6.172083300884757e-07, + "loss": 5.321, + "step": 9636 + }, + { + "epoch": 0.9293153326904532, + "grad_norm": 1.731440544128418, + "learning_rate": 6.155369155282725e-07, + "loss": 5.2513, + "step": 9637 + }, + { + "epoch": 0.9294117647058824, + "grad_norm": 1.9227180480957031, + "learning_rate": 6.138677389440767e-07, + "loss": 5.2816, + "step": 9638 + }, + { + "epoch": 0.9295081967213115, + "grad_norm": 1.2579998970031738, + "learning_rate": 6.122008004890851e-07, + "loss": 5.2963, + "step": 9639 + }, + { + "epoch": 0.9296046287367405, + "grad_norm": 1.164323091506958, + "learning_rate": 6.105361003162891e-07, + "loss": 5.383, + "step": 9640 + }, + { + "epoch": 0.9297010607521697, + "grad_norm": 1.617182731628418, + "learning_rate": 6.088736385784694e-07, + "loss": 5.2184, + "step": 9641 + }, + { + "epoch": 0.9297974927675988, + "grad_norm": 1.7178207635879517, + "learning_rate": 6.072134154282066e-07, + "loss": 5.1282, + "step": 9642 + }, + { + "epoch": 0.929893924783028, + "grad_norm": 1.707876205444336, + "learning_rate": 6.055554310178735e-07, + "loss": 5.1472, + "step": 9643 + }, + { + "epoch": 0.9299903567984571, + "grad_norm": 1.2512867450714111, + "learning_rate": 6.038996854996398e-07, + "loss": 5.2661, + "step": 9644 + }, + { + "epoch": 0.9300867888138862, + "grad_norm": 2.2141025066375732, + "learning_rate": 6.022461790254646e-07, + "loss": 5.5744, + "step": 9645 + }, + { + "epoch": 0.9301832208293154, + "grad_norm": 1.1784958839416504, + "learning_rate": 6.005949117471072e-07, + "loss": 5.3286, + "step": 9646 + }, + { + "epoch": 0.9302796528447445, + "grad_norm": 2.013833522796631, + "learning_rate": 5.989458838161183e-07, + "loss": 5.1992, + "step": 9647 + }, + { + "epoch": 0.9303760848601735, + "grad_norm": 1.2633525133132935, + "learning_rate": 5.972990953838437e-07, + "loss": 5.5881, + "step": 9648 + }, + { + "epoch": 0.9304725168756027, + "grad_norm": 1.1650636196136475, + "learning_rate": 5.956545466014235e-07, + "loss": 5.3452, + "step": 9649 + }, + { + "epoch": 0.9305689488910318, + "grad_norm": 1.4777002334594727, + "learning_rate": 5.940122376197899e-07, + "loss": 5.3384, + "step": 9650 + }, + { + "epoch": 0.9306653809064609, + "grad_norm": 1.4455037117004395, + "learning_rate": 5.923721685896777e-07, + "loss": 5.2093, + "step": 9651 + }, + { + "epoch": 0.9307618129218901, + "grad_norm": 1.4110724925994873, + "learning_rate": 5.907343396616055e-07, + "loss": 5.2281, + "step": 9652 + }, + { + "epoch": 0.9308582449373192, + "grad_norm": 1.3665674924850464, + "learning_rate": 5.89098750985892e-07, + "loss": 5.129, + "step": 9653 + }, + { + "epoch": 0.9309546769527484, + "grad_norm": 1.6179991960525513, + "learning_rate": 5.874654027126503e-07, + "loss": 5.32, + "step": 9654 + }, + { + "epoch": 0.9310511089681774, + "grad_norm": 1.4277573823928833, + "learning_rate": 5.858342949917855e-07, + "loss": 5.4142, + "step": 9655 + }, + { + "epoch": 0.9311475409836065, + "grad_norm": 2.3780267238616943, + "learning_rate": 5.84205427973003e-07, + "loss": 5.6303, + "step": 9656 + }, + { + "epoch": 0.9312439729990357, + "grad_norm": 1.524432897567749, + "learning_rate": 5.825788018057971e-07, + "loss": 5.3839, + "step": 9657 + }, + { + "epoch": 0.9313404050144648, + "grad_norm": 2.371364116668701, + "learning_rate": 5.809544166394482e-07, + "loss": 5.5881, + "step": 9658 + }, + { + "epoch": 0.9314368370298939, + "grad_norm": 1.716518521308899, + "learning_rate": 5.793322726230565e-07, + "loss": 5.1291, + "step": 9659 + }, + { + "epoch": 0.9315332690453231, + "grad_norm": 2.4307031631469727, + "learning_rate": 5.777123699054864e-07, + "loss": 5.4359, + "step": 9660 + }, + { + "epoch": 0.9316297010607522, + "grad_norm": 2.4892613887786865, + "learning_rate": 5.760947086354191e-07, + "loss": 5.4363, + "step": 9661 + }, + { + "epoch": 0.9317261330761812, + "grad_norm": 1.2801059484481812, + "learning_rate": 5.74479288961316e-07, + "loss": 5.5566, + "step": 9662 + }, + { + "epoch": 0.9318225650916104, + "grad_norm": 1.1974493265151978, + "learning_rate": 5.72866111031442e-07, + "loss": 5.1153, + "step": 9663 + }, + { + "epoch": 0.9319189971070395, + "grad_norm": 1.375253438949585, + "learning_rate": 5.71255174993851e-07, + "loss": 5.118, + "step": 9664 + }, + { + "epoch": 0.9320154291224687, + "grad_norm": 1.5029903650283813, + "learning_rate": 5.696464809963941e-07, + "loss": 5.2847, + "step": 9665 + }, + { + "epoch": 0.9321118611378978, + "grad_norm": 1.6108289957046509, + "learning_rate": 5.680400291867116e-07, + "loss": 5.2754, + "step": 9666 + }, + { + "epoch": 0.9322082931533269, + "grad_norm": 1.4689278602600098, + "learning_rate": 5.664358197122494e-07, + "loss": 5.3035, + "step": 9667 + }, + { + "epoch": 0.9323047251687561, + "grad_norm": 1.2624635696411133, + "learning_rate": 5.648338527202312e-07, + "loss": 5.4586, + "step": 9668 + }, + { + "epoch": 0.9324011571841851, + "grad_norm": 1.3906042575836182, + "learning_rate": 5.63234128357687e-07, + "loss": 5.209, + "step": 9669 + }, + { + "epoch": 0.9324975891996142, + "grad_norm": 1.4918862581253052, + "learning_rate": 5.616366467714379e-07, + "loss": 5.4636, + "step": 9670 + }, + { + "epoch": 0.9325940212150434, + "grad_norm": 1.66942298412323, + "learning_rate": 5.600414081081001e-07, + "loss": 5.1948, + "step": 9671 + }, + { + "epoch": 0.9326904532304725, + "grad_norm": 1.3897818326950073, + "learning_rate": 5.584484125140815e-07, + "loss": 5.2043, + "step": 9672 + }, + { + "epoch": 0.9327868852459016, + "grad_norm": 1.2986732721328735, + "learning_rate": 5.568576601355819e-07, + "loss": 5.3122, + "step": 9673 + }, + { + "epoch": 0.9328833172613308, + "grad_norm": 1.2070475816726685, + "learning_rate": 5.552691511186064e-07, + "loss": 5.113, + "step": 9674 + }, + { + "epoch": 0.9329797492767599, + "grad_norm": 1.1387488842010498, + "learning_rate": 5.53682885608936e-07, + "loss": 5.2262, + "step": 9675 + }, + { + "epoch": 0.933076181292189, + "grad_norm": 1.7349095344543457, + "learning_rate": 5.520988637521679e-07, + "loss": 5.2902, + "step": 9676 + }, + { + "epoch": 0.9331726133076181, + "grad_norm": 1.1703871488571167, + "learning_rate": 5.505170856936693e-07, + "loss": 5.424, + "step": 9677 + }, + { + "epoch": 0.9332690453230472, + "grad_norm": 1.9515610933303833, + "learning_rate": 5.489375515786238e-07, + "loss": 5.3408, + "step": 9678 + }, + { + "epoch": 0.9333654773384764, + "grad_norm": 1.3099937438964844, + "learning_rate": 5.473602615519962e-07, + "loss": 5.4816, + "step": 9679 + }, + { + "epoch": 0.9334619093539055, + "grad_norm": 1.363250732421875, + "learning_rate": 5.457852157585458e-07, + "loss": 5.3436, + "step": 9680 + }, + { + "epoch": 0.9335583413693346, + "grad_norm": 1.268903374671936, + "learning_rate": 5.44212414342829e-07, + "loss": 5.2601, + "step": 9681 + }, + { + "epoch": 0.9336547733847638, + "grad_norm": 2.5796096324920654, + "learning_rate": 5.426418574492025e-07, + "loss": 4.9743, + "step": 9682 + }, + { + "epoch": 0.9337512054001929, + "grad_norm": 2.500518798828125, + "learning_rate": 5.410735452217957e-07, + "loss": 4.9775, + "step": 9683 + }, + { + "epoch": 0.9338476374156219, + "grad_norm": 1.4868146181106567, + "learning_rate": 5.395074778045628e-07, + "loss": 5.254, + "step": 9684 + }, + { + "epoch": 0.9339440694310511, + "grad_norm": 1.3550827503204346, + "learning_rate": 5.379436553412221e-07, + "loss": 5.2548, + "step": 9685 + }, + { + "epoch": 0.9340405014464802, + "grad_norm": 5.6449079513549805, + "learning_rate": 5.36382077975306e-07, + "loss": 4.9989, + "step": 9686 + }, + { + "epoch": 0.9341369334619094, + "grad_norm": 6.092577934265137, + "learning_rate": 5.348227458501332e-07, + "loss": 4.9537, + "step": 9687 + }, + { + "epoch": 0.9342333654773385, + "grad_norm": 1.8350131511688232, + "learning_rate": 5.332656591088169e-07, + "loss": 5.1007, + "step": 9688 + }, + { + "epoch": 0.9343297974927676, + "grad_norm": 1.702832818031311, + "learning_rate": 5.317108178942654e-07, + "loss": 5.1086, + "step": 9689 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 1.802568793296814, + "learning_rate": 5.30158222349178e-07, + "loss": 5.0939, + "step": 9690 + }, + { + "epoch": 0.9345226615236258, + "grad_norm": 2.0552637577056885, + "learning_rate": 5.286078726160549e-07, + "loss": 5.2442, + "step": 9691 + }, + { + "epoch": 0.9346190935390549, + "grad_norm": 1.9991785287857056, + "learning_rate": 5.270597688371793e-07, + "loss": 5.341, + "step": 9692 + }, + { + "epoch": 0.9347155255544841, + "grad_norm": 2.2506263256073, + "learning_rate": 5.255139111546376e-07, + "loss": 5.4748, + "step": 9693 + }, + { + "epoch": 0.9348119575699132, + "grad_norm": 2.1587584018707275, + "learning_rate": 5.239702997103024e-07, + "loss": 5.0266, + "step": 9694 + }, + { + "epoch": 0.9349083895853423, + "grad_norm": 2.0247092247009277, + "learning_rate": 5.22428934645855e-07, + "loss": 4.9731, + "step": 9695 + }, + { + "epoch": 0.9350048216007715, + "grad_norm": 1.8329533338546753, + "learning_rate": 5.208898161027487e-07, + "loss": 5.1707, + "step": 9696 + }, + { + "epoch": 0.9351012536162006, + "grad_norm": 2.0326120853424072, + "learning_rate": 5.193529442222456e-07, + "loss": 5.1319, + "step": 9697 + }, + { + "epoch": 0.9351976856316297, + "grad_norm": 1.7643587589263916, + "learning_rate": 5.178183191453995e-07, + "loss": 5.1575, + "step": 9698 + }, + { + "epoch": 0.9352941176470588, + "grad_norm": 1.522510051727295, + "learning_rate": 5.16285941013056e-07, + "loss": 5.2481, + "step": 9699 + }, + { + "epoch": 0.9353905496624879, + "grad_norm": 2.0693156719207764, + "learning_rate": 5.147558099658529e-07, + "loss": 5.197, + "step": 9700 + }, + { + "epoch": 0.9354869816779171, + "grad_norm": 2.570063829421997, + "learning_rate": 5.132279261442275e-07, + "loss": 5.3263, + "step": 9701 + }, + { + "epoch": 0.9355834136933462, + "grad_norm": 1.9761508703231812, + "learning_rate": 5.117022896884011e-07, + "loss": 5.0605, + "step": 9702 + }, + { + "epoch": 0.9356798457087753, + "grad_norm": 1.4697647094726562, + "learning_rate": 5.101789007384033e-07, + "loss": 5.0407, + "step": 9703 + }, + { + "epoch": 0.9357762777242045, + "grad_norm": 1.79662024974823, + "learning_rate": 5.086577594340391e-07, + "loss": 5.078, + "step": 9704 + }, + { + "epoch": 0.9358727097396335, + "grad_norm": 2.2189784049987793, + "learning_rate": 5.071388659149246e-07, + "loss": 5.187, + "step": 9705 + }, + { + "epoch": 0.9359691417550626, + "grad_norm": 1.2678219079971313, + "learning_rate": 5.056222203204591e-07, + "loss": 4.9395, + "step": 9706 + }, + { + "epoch": 0.9360655737704918, + "grad_norm": 1.2456583976745605, + "learning_rate": 5.041078227898371e-07, + "loss": 5.058, + "step": 9707 + }, + { + "epoch": 0.9361620057859209, + "grad_norm": 1.4862231016159058, + "learning_rate": 5.025956734620502e-07, + "loss": 5.1741, + "step": 9708 + }, + { + "epoch": 0.9362584378013501, + "grad_norm": 1.2327395677566528, + "learning_rate": 5.010857724758816e-07, + "loss": 5.121, + "step": 9709 + }, + { + "epoch": 0.9363548698167792, + "grad_norm": 1.2211956977844238, + "learning_rate": 4.995781199699068e-07, + "loss": 5.162, + "step": 9710 + }, + { + "epoch": 0.9364513018322083, + "grad_norm": 1.1901897192001343, + "learning_rate": 4.980727160824955e-07, + "loss": 4.9725, + "step": 9711 + }, + { + "epoch": 0.9365477338476375, + "grad_norm": 1.2354305982589722, + "learning_rate": 4.965695609518179e-07, + "loss": 5.049, + "step": 9712 + }, + { + "epoch": 0.9366441658630665, + "grad_norm": 1.366769790649414, + "learning_rate": 4.950686547158218e-07, + "loss": 4.8736, + "step": 9713 + }, + { + "epoch": 0.9367405978784956, + "grad_norm": 1.1740938425064087, + "learning_rate": 4.935699975122693e-07, + "loss": 4.9733, + "step": 9714 + }, + { + "epoch": 0.9368370298939248, + "grad_norm": 1.0929113626480103, + "learning_rate": 4.920735894786949e-07, + "loss": 5.0004, + "step": 9715 + }, + { + "epoch": 0.9369334619093539, + "grad_norm": 1.3458524942398071, + "learning_rate": 4.905794307524442e-07, + "loss": 4.9356, + "step": 9716 + }, + { + "epoch": 0.937029893924783, + "grad_norm": 1.6208328008651733, + "learning_rate": 4.890875214706491e-07, + "loss": 4.889, + "step": 9717 + }, + { + "epoch": 0.9371263259402122, + "grad_norm": 1.4655414819717407, + "learning_rate": 4.87597861770231e-07, + "loss": 5.0597, + "step": 9718 + }, + { + "epoch": 0.9372227579556413, + "grad_norm": 1.4439241886138916, + "learning_rate": 4.861104517879106e-07, + "loss": 4.8172, + "step": 9719 + }, + { + "epoch": 0.9373191899710704, + "grad_norm": 1.3327295780181885, + "learning_rate": 4.84625291660204e-07, + "loss": 5.0575, + "step": 9720 + }, + { + "epoch": 0.9374156219864995, + "grad_norm": 1.2514671087265015, + "learning_rate": 4.831423815234104e-07, + "loss": 5.0738, + "step": 9721 + }, + { + "epoch": 0.9375120540019286, + "grad_norm": 1.6358733177185059, + "learning_rate": 4.816617215136376e-07, + "loss": 5.0907, + "step": 9722 + }, + { + "epoch": 0.9376084860173578, + "grad_norm": 1.3713573217391968, + "learning_rate": 4.801833117667742e-07, + "loss": 4.8489, + "step": 9723 + }, + { + "epoch": 0.9377049180327869, + "grad_norm": 1.6819745302200317, + "learning_rate": 4.78707152418506e-07, + "loss": 4.9537, + "step": 9724 + }, + { + "epoch": 0.937801350048216, + "grad_norm": 2.42060923576355, + "learning_rate": 4.772332436043165e-07, + "loss": 4.9419, + "step": 9725 + }, + { + "epoch": 0.9378977820636452, + "grad_norm": 1.3478403091430664, + "learning_rate": 4.757615854594777e-07, + "loss": 5.0396, + "step": 9726 + }, + { + "epoch": 0.9379942140790742, + "grad_norm": 1.100579023361206, + "learning_rate": 4.7429217811905123e-07, + "loss": 4.969, + "step": 9727 + }, + { + "epoch": 0.9380906460945033, + "grad_norm": 1.214297890663147, + "learning_rate": 4.72825021717907e-07, + "loss": 5.0818, + "step": 9728 + }, + { + "epoch": 0.9381870781099325, + "grad_norm": 1.8608694076538086, + "learning_rate": 4.713601163906928e-07, + "loss": 5.0824, + "step": 9729 + }, + { + "epoch": 0.9382835101253616, + "grad_norm": 1.1421318054199219, + "learning_rate": 4.698974622718594e-07, + "loss": 4.9363, + "step": 9730 + }, + { + "epoch": 0.9383799421407908, + "grad_norm": 1.9980632066726685, + "learning_rate": 4.684370594956411e-07, + "loss": 5.0609, + "step": 9731 + }, + { + "epoch": 0.9384763741562199, + "grad_norm": 1.2929415702819824, + "learning_rate": 4.6697890819607794e-07, + "loss": 5.0816, + "step": 9732 + }, + { + "epoch": 0.938572806171649, + "grad_norm": 1.3968766927719116, + "learning_rate": 4.6552300850699613e-07, + "loss": 4.8427, + "step": 9733 + }, + { + "epoch": 0.9386692381870781, + "grad_norm": 1.9730929136276245, + "learning_rate": 4.640693605620139e-07, + "loss": 4.6553, + "step": 9734 + }, + { + "epoch": 0.9387656702025072, + "grad_norm": 1.4452860355377197, + "learning_rate": 4.626179644945494e-07, + "loss": 4.9707, + "step": 9735 + }, + { + "epoch": 0.9388621022179363, + "grad_norm": 1.2133047580718994, + "learning_rate": 4.611688204378045e-07, + "loss": 5.0932, + "step": 9736 + }, + { + "epoch": 0.9389585342333655, + "grad_norm": 1.3849819898605347, + "learning_rate": 4.5972192852478667e-07, + "loss": 5.0284, + "step": 9737 + }, + { + "epoch": 0.9390549662487946, + "grad_norm": 1.2445003986358643, + "learning_rate": 4.5827728888828134e-07, + "loss": 4.8487, + "step": 9738 + }, + { + "epoch": 0.9391513982642237, + "grad_norm": 1.3595980405807495, + "learning_rate": 4.5683490166088527e-07, + "loss": 4.932, + "step": 9739 + }, + { + "epoch": 0.9392478302796529, + "grad_norm": 1.1613163948059082, + "learning_rate": 4.5539476697497307e-07, + "loss": 5.0934, + "step": 9740 + }, + { + "epoch": 0.9393442622950819, + "grad_norm": 1.4454927444458008, + "learning_rate": 4.539568849627196e-07, + "loss": 5.0031, + "step": 9741 + }, + { + "epoch": 0.9394406943105111, + "grad_norm": 1.400826096534729, + "learning_rate": 4.5252125575608874e-07, + "loss": 4.8347, + "step": 9742 + }, + { + "epoch": 0.9395371263259402, + "grad_norm": 1.1591209173202515, + "learning_rate": 4.5108787948684725e-07, + "loss": 4.983, + "step": 9743 + }, + { + "epoch": 0.9396335583413693, + "grad_norm": 1.24640953540802, + "learning_rate": 4.496567562865428e-07, + "loss": 5.0997, + "step": 9744 + }, + { + "epoch": 0.9397299903567985, + "grad_norm": 1.3328498601913452, + "learning_rate": 4.482278862865286e-07, + "loss": 5.0583, + "step": 9745 + }, + { + "epoch": 0.9398264223722276, + "grad_norm": 1.1072797775268555, + "learning_rate": 4.468012696179358e-07, + "loss": 5.0773, + "step": 9746 + }, + { + "epoch": 0.9399228543876567, + "grad_norm": 1.5303702354431152, + "learning_rate": 4.453769064117069e-07, + "loss": 4.962, + "step": 9747 + }, + { + "epoch": 0.9400192864030859, + "grad_norm": 1.612363576889038, + "learning_rate": 4.439547967985624e-07, + "loss": 5.0065, + "step": 9748 + }, + { + "epoch": 0.9401157184185149, + "grad_norm": 1.140369176864624, + "learning_rate": 4.425349409090229e-07, + "loss": 5.1086, + "step": 9749 + }, + { + "epoch": 0.940212150433944, + "grad_norm": 1.0963211059570312, + "learning_rate": 4.4111733887340077e-07, + "loss": 4.9271, + "step": 9750 + }, + { + "epoch": 0.9403085824493732, + "grad_norm": 1.5641318559646606, + "learning_rate": 4.397019908218003e-07, + "loss": 4.9225, + "step": 9751 + }, + { + "epoch": 0.9404050144648023, + "grad_norm": 1.3607300519943237, + "learning_rate": 4.382888968841259e-07, + "loss": 4.8138, + "step": 9752 + }, + { + "epoch": 0.9405014464802315, + "grad_norm": 1.3408727645874023, + "learning_rate": 4.368780571900627e-07, + "loss": 5.0624, + "step": 9753 + }, + { + "epoch": 0.9405978784956606, + "grad_norm": 1.1016753911972046, + "learning_rate": 4.354694718691016e-07, + "loss": 5.1099, + "step": 9754 + }, + { + "epoch": 0.9406943105110896, + "grad_norm": 1.3517591953277588, + "learning_rate": 4.340631410505169e-07, + "loss": 5.1361, + "step": 9755 + }, + { + "epoch": 0.9407907425265188, + "grad_norm": 1.816523551940918, + "learning_rate": 4.326590648633833e-07, + "loss": 5.3595, + "step": 9756 + }, + { + "epoch": 0.9408871745419479, + "grad_norm": 1.8582658767700195, + "learning_rate": 4.312572434365614e-07, + "loss": 5.4652, + "step": 9757 + }, + { + "epoch": 0.940983606557377, + "grad_norm": 1.164038896560669, + "learning_rate": 4.298576768987123e-07, + "loss": 5.2333, + "step": 9758 + }, + { + "epoch": 0.9410800385728062, + "grad_norm": 1.3826197385787964, + "learning_rate": 4.284603653782832e-07, + "loss": 5.2127, + "step": 9759 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 3.709024667739868, + "learning_rate": 4.2706530900351874e-07, + "loss": 4.957, + "step": 9760 + }, + { + "epoch": 0.9412729026036644, + "grad_norm": 1.5233511924743652, + "learning_rate": 4.256725079024554e-07, + "loss": 5.2158, + "step": 9761 + }, + { + "epoch": 0.9413693346190936, + "grad_norm": 1.7625586986541748, + "learning_rate": 4.2428196220292703e-07, + "loss": 5.3872, + "step": 9762 + }, + { + "epoch": 0.9414657666345226, + "grad_norm": 1.4366251230239868, + "learning_rate": 4.2289367203254547e-07, + "loss": 5.0974, + "step": 9763 + }, + { + "epoch": 0.9415621986499518, + "grad_norm": 1.6124855279922485, + "learning_rate": 4.2150763751873936e-07, + "loss": 5.0506, + "step": 9764 + }, + { + "epoch": 0.9416586306653809, + "grad_norm": 2.806185722351074, + "learning_rate": 4.2012385878870684e-07, + "loss": 5.1861, + "step": 9765 + }, + { + "epoch": 0.94175506268081, + "grad_norm": 1.216876745223999, + "learning_rate": 4.1874233596945753e-07, + "loss": 5.3592, + "step": 9766 + }, + { + "epoch": 0.9418514946962392, + "grad_norm": 1.4856934547424316, + "learning_rate": 4.173630691877817e-07, + "loss": 5.403, + "step": 9767 + }, + { + "epoch": 0.9419479267116683, + "grad_norm": 1.296539068222046, + "learning_rate": 4.159860585702641e-07, + "loss": 5.3791, + "step": 9768 + }, + { + "epoch": 0.9420443587270974, + "grad_norm": 1.5471988916397095, + "learning_rate": 4.146113042432898e-07, + "loss": 5.4197, + "step": 9769 + }, + { + "epoch": 0.9421407907425265, + "grad_norm": 1.9180433750152588, + "learning_rate": 4.1323880633303003e-07, + "loss": 5.3605, + "step": 9770 + }, + { + "epoch": 0.9422372227579556, + "grad_norm": 1.2849125862121582, + "learning_rate": 4.118685649654508e-07, + "loss": 5.3643, + "step": 9771 + }, + { + "epoch": 0.9423336547733847, + "grad_norm": 1.7878957986831665, + "learning_rate": 4.1050058026631533e-07, + "loss": 5.3545, + "step": 9772 + }, + { + "epoch": 0.9424300867888139, + "grad_norm": 1.1901684999465942, + "learning_rate": 4.091348523611677e-07, + "loss": 5.3592, + "step": 9773 + }, + { + "epoch": 0.942526518804243, + "grad_norm": 1.4100167751312256, + "learning_rate": 4.0777138137535755e-07, + "loss": 5.1698, + "step": 9774 + }, + { + "epoch": 0.9426229508196722, + "grad_norm": 1.3291242122650146, + "learning_rate": 4.0641016743402373e-07, + "loss": 5.1586, + "step": 9775 + }, + { + "epoch": 0.9427193828351013, + "grad_norm": 1.0867606401443481, + "learning_rate": 4.050512106620913e-07, + "loss": 5.14, + "step": 9776 + }, + { + "epoch": 0.9428158148505303, + "grad_norm": 1.1423819065093994, + "learning_rate": 4.03694511184291e-07, + "loss": 5.3362, + "step": 9777 + }, + { + "epoch": 0.9429122468659595, + "grad_norm": 1.665927529335022, + "learning_rate": 4.023400691251317e-07, + "loss": 5.1589, + "step": 9778 + }, + { + "epoch": 0.9430086788813886, + "grad_norm": 1.4903039932250977, + "learning_rate": 4.009878846089277e-07, + "loss": 5.2361, + "step": 9779 + }, + { + "epoch": 0.9431051108968177, + "grad_norm": 1.174000859260559, + "learning_rate": 3.9963795775977976e-07, + "loss": 5.2463, + "step": 9780 + }, + { + "epoch": 0.9432015429122469, + "grad_norm": 1.53216552734375, + "learning_rate": 3.9829028870158324e-07, + "loss": 5.2403, + "step": 9781 + }, + { + "epoch": 0.943297974927676, + "grad_norm": 1.2277288436889648, + "learning_rate": 3.9694487755802257e-07, + "loss": 5.3144, + "step": 9782 + }, + { + "epoch": 0.9433944069431051, + "grad_norm": 1.3157461881637573, + "learning_rate": 3.9560172445257947e-07, + "loss": 5.3268, + "step": 9783 + }, + { + "epoch": 0.9434908389585343, + "grad_norm": 1.5463714599609375, + "learning_rate": 3.942608295085276e-07, + "loss": 5.416, + "step": 9784 + }, + { + "epoch": 0.9435872709739633, + "grad_norm": 1.5969213247299194, + "learning_rate": 3.929221928489324e-07, + "loss": 5.3421, + "step": 9785 + }, + { + "epoch": 0.9436837029893925, + "grad_norm": 1.2505085468292236, + "learning_rate": 3.915858145966511e-07, + "loss": 5.3527, + "step": 9786 + }, + { + "epoch": 0.9437801350048216, + "grad_norm": 1.2108970880508423, + "learning_rate": 3.9025169487433567e-07, + "loss": 5.4965, + "step": 9787 + }, + { + "epoch": 0.9438765670202507, + "grad_norm": 1.308861255645752, + "learning_rate": 3.889198338044298e-07, + "loss": 5.2388, + "step": 9788 + }, + { + "epoch": 0.9439729990356799, + "grad_norm": 1.2794727087020874, + "learning_rate": 3.875902315091745e-07, + "loss": 5.3386, + "step": 9789 + }, + { + "epoch": 0.944069431051109, + "grad_norm": 1.2339198589324951, + "learning_rate": 3.8626288811058895e-07, + "loss": 5.5339, + "step": 9790 + }, + { + "epoch": 0.944165863066538, + "grad_norm": 1.0815789699554443, + "learning_rate": 3.849378037305035e-07, + "loss": 5.2543, + "step": 9791 + }, + { + "epoch": 0.9442622950819672, + "grad_norm": 1.2631068229675293, + "learning_rate": 3.836149784905291e-07, + "loss": 5.4504, + "step": 9792 + }, + { + "epoch": 0.9443587270973963, + "grad_norm": 1.2109788656234741, + "learning_rate": 3.822944125120742e-07, + "loss": 5.1689, + "step": 9793 + }, + { + "epoch": 0.9444551591128254, + "grad_norm": 1.3031611442565918, + "learning_rate": 3.8097610591633916e-07, + "loss": 5.3344, + "step": 9794 + }, + { + "epoch": 0.9445515911282546, + "grad_norm": 1.3201969861984253, + "learning_rate": 3.79660058824316e-07, + "loss": 5.1293, + "step": 9795 + }, + { + "epoch": 0.9446480231436837, + "grad_norm": 2.195619583129883, + "learning_rate": 3.783462713567915e-07, + "loss": 5.1277, + "step": 9796 + }, + { + "epoch": 0.9447444551591129, + "grad_norm": 1.904288649559021, + "learning_rate": 3.770347436343413e-07, + "loss": 5.1556, + "step": 9797 + }, + { + "epoch": 0.944840887174542, + "grad_norm": 2.0334208011627197, + "learning_rate": 3.757254757773332e-07, + "loss": 5.2533, + "step": 9798 + }, + { + "epoch": 0.944937319189971, + "grad_norm": 1.7495379447937012, + "learning_rate": 3.744184679059376e-07, + "loss": 5.1964, + "step": 9799 + }, + { + "epoch": 0.9450337512054002, + "grad_norm": 1.8803249597549438, + "learning_rate": 3.731137201401058e-07, + "loss": 5.1531, + "step": 9800 + }, + { + "epoch": 0.9451301832208293, + "grad_norm": 1.6668821573257446, + "learning_rate": 3.7181123259958097e-07, + "loss": 5.146, + "step": 9801 + }, + { + "epoch": 0.9452266152362584, + "grad_norm": 1.0931507349014282, + "learning_rate": 3.705110054039146e-07, + "loss": 5.4594, + "step": 9802 + }, + { + "epoch": 0.9453230472516876, + "grad_norm": 1.3933029174804688, + "learning_rate": 3.6921303867243073e-07, + "loss": 5.4866, + "step": 9803 + }, + { + "epoch": 0.9454194792671167, + "grad_norm": 1.2233104705810547, + "learning_rate": 3.6791733252425906e-07, + "loss": 5.238, + "step": 9804 + }, + { + "epoch": 0.9455159112825458, + "grad_norm": 1.4146618843078613, + "learning_rate": 3.666238870783184e-07, + "loss": 5.1835, + "step": 9805 + }, + { + "epoch": 0.9456123432979749, + "grad_norm": 1.6634504795074463, + "learning_rate": 3.653327024533193e-07, + "loss": 5.4766, + "step": 9806 + }, + { + "epoch": 0.945708775313404, + "grad_norm": 1.468133807182312, + "learning_rate": 3.640437787677614e-07, + "loss": 5.1413, + "step": 9807 + }, + { + "epoch": 0.9458052073288332, + "grad_norm": 2.321977138519287, + "learning_rate": 3.6275711613994733e-07, + "loss": 5.3241, + "step": 9808 + }, + { + "epoch": 0.9459016393442623, + "grad_norm": 1.830726981163025, + "learning_rate": 3.6147271468795764e-07, + "loss": 5.1684, + "step": 9809 + }, + { + "epoch": 0.9459980713596914, + "grad_norm": 1.2867876291275024, + "learning_rate": 3.601905745296813e-07, + "loss": 5.3831, + "step": 9810 + }, + { + "epoch": 0.9460945033751206, + "grad_norm": 1.1906450986862183, + "learning_rate": 3.5891069578278535e-07, + "loss": 4.9897, + "step": 9811 + }, + { + "epoch": 0.9461909353905497, + "grad_norm": 1.0921577215194702, + "learning_rate": 3.5763307856473693e-07, + "loss": 5.3027, + "step": 9812 + }, + { + "epoch": 0.9462873674059787, + "grad_norm": 1.2448420524597168, + "learning_rate": 3.5635772299279503e-07, + "loss": 5.2028, + "step": 9813 + }, + { + "epoch": 0.9463837994214079, + "grad_norm": 1.1745965480804443, + "learning_rate": 3.5508462918401043e-07, + "loss": 5.2335, + "step": 9814 + }, + { + "epoch": 0.946480231436837, + "grad_norm": 1.1175990104675293, + "learning_rate": 3.53813797255223e-07, + "loss": 5.2554, + "step": 9815 + }, + { + "epoch": 0.9465766634522661, + "grad_norm": 1.3752480745315552, + "learning_rate": 3.5254522732307283e-07, + "loss": 5.4526, + "step": 9816 + }, + { + "epoch": 0.9466730954676953, + "grad_norm": 2.2301650047302246, + "learning_rate": 3.5127891950398895e-07, + "loss": 5.5635, + "step": 9817 + }, + { + "epoch": 0.9467695274831244, + "grad_norm": 1.4576371908187866, + "learning_rate": 3.5001487391418396e-07, + "loss": 5.2726, + "step": 9818 + }, + { + "epoch": 0.9468659594985536, + "grad_norm": 1.5566160678863525, + "learning_rate": 3.487530906696762e-07, + "loss": 5.4808, + "step": 9819 + }, + { + "epoch": 0.9469623915139826, + "grad_norm": 1.187080979347229, + "learning_rate": 3.474935698862675e-07, + "loss": 5.3653, + "step": 9820 + }, + { + "epoch": 0.9470588235294117, + "grad_norm": 1.4869540929794312, + "learning_rate": 3.4623631167955973e-07, + "loss": 5.3382, + "step": 9821 + }, + { + "epoch": 0.9471552555448409, + "grad_norm": 1.485589623451233, + "learning_rate": 3.4498131616493565e-07, + "loss": 5.2628, + "step": 9822 + }, + { + "epoch": 0.94725168756027, + "grad_norm": 1.0656465291976929, + "learning_rate": 3.4372858345758373e-07, + "loss": 5.3657, + "step": 9823 + }, + { + "epoch": 0.9473481195756991, + "grad_norm": 1.1689510345458984, + "learning_rate": 3.424781136724731e-07, + "loss": 5.4166, + "step": 9824 + }, + { + "epoch": 0.9474445515911283, + "grad_norm": 1.1888904571533203, + "learning_rate": 3.412299069243757e-07, + "loss": 5.1979, + "step": 9825 + }, + { + "epoch": 0.9475409836065574, + "grad_norm": 2.19627046585083, + "learning_rate": 3.3998396332784456e-07, + "loss": 5.5505, + "step": 9826 + }, + { + "epoch": 0.9476374156219864, + "grad_norm": 1.2650399208068848, + "learning_rate": 3.3874028299723803e-07, + "loss": 5.393, + "step": 9827 + }, + { + "epoch": 0.9477338476374156, + "grad_norm": 1.288772702217102, + "learning_rate": 3.374988660466899e-07, + "loss": 5.4833, + "step": 9828 + }, + { + "epoch": 0.9478302796528447, + "grad_norm": 1.2002593278884888, + "learning_rate": 3.362597125901451e-07, + "loss": 5.3847, + "step": 9829 + }, + { + "epoch": 0.9479267116682739, + "grad_norm": 1.6256297826766968, + "learning_rate": 3.350228227413266e-07, + "loss": 5.4286, + "step": 9830 + }, + { + "epoch": 0.948023143683703, + "grad_norm": 1.4211536645889282, + "learning_rate": 3.337881966137546e-07, + "loss": 5.5103, + "step": 9831 + }, + { + "epoch": 0.9481195756991321, + "grad_norm": 1.3939234018325806, + "learning_rate": 3.3255583432074134e-07, + "loss": 5.1044, + "step": 9832 + }, + { + "epoch": 0.9482160077145613, + "grad_norm": 1.0886236429214478, + "learning_rate": 3.3132573597539907e-07, + "loss": 5.3185, + "step": 9833 + }, + { + "epoch": 0.9483124397299904, + "grad_norm": 1.0889601707458496, + "learning_rate": 3.300979016906125e-07, + "loss": 5.1869, + "step": 9834 + }, + { + "epoch": 0.9484088717454194, + "grad_norm": 1.2510043382644653, + "learning_rate": 3.2887233157908037e-07, + "loss": 5.2409, + "step": 9835 + }, + { + "epoch": 0.9485053037608486, + "grad_norm": 1.3694097995758057, + "learning_rate": 3.276490257532794e-07, + "loss": 5.2561, + "step": 9836 + }, + { + "epoch": 0.9486017357762777, + "grad_norm": 1.8680750131607056, + "learning_rate": 3.264279843254836e-07, + "loss": 5.3049, + "step": 9837 + }, + { + "epoch": 0.9486981677917068, + "grad_norm": 1.3031249046325684, + "learning_rate": 3.2520920740775893e-07, + "loss": 5.0475, + "step": 9838 + }, + { + "epoch": 0.948794599807136, + "grad_norm": 1.0567110776901245, + "learning_rate": 3.2399269511196316e-07, + "loss": 5.4136, + "step": 9839 + }, + { + "epoch": 0.9488910318225651, + "grad_norm": 1.2570801973342896, + "learning_rate": 3.2277844754974584e-07, + "loss": 5.1919, + "step": 9840 + }, + { + "epoch": 0.9489874638379943, + "grad_norm": 1.7696733474731445, + "learning_rate": 3.2156646483255114e-07, + "loss": 5.2532, + "step": 9841 + }, + { + "epoch": 0.9490838958534233, + "grad_norm": 1.6076185703277588, + "learning_rate": 3.2035674707161234e-07, + "loss": 5.518, + "step": 9842 + }, + { + "epoch": 0.9491803278688524, + "grad_norm": 1.040403962135315, + "learning_rate": 3.191492943779545e-07, + "loss": 5.0565, + "step": 9843 + }, + { + "epoch": 0.9492767598842816, + "grad_norm": 1.0713144540786743, + "learning_rate": 3.1794410686240005e-07, + "loss": 5.4086, + "step": 9844 + }, + { + "epoch": 0.9493731918997107, + "grad_norm": 1.2786037921905518, + "learning_rate": 3.16741184635555e-07, + "loss": 5.0879, + "step": 9845 + }, + { + "epoch": 0.9494696239151398, + "grad_norm": 1.0157876014709473, + "learning_rate": 3.155405278078255e-07, + "loss": 5.4151, + "step": 9846 + }, + { + "epoch": 0.949566055930569, + "grad_norm": 1.2908713817596436, + "learning_rate": 3.1434213648940115e-07, + "loss": 5.4199, + "step": 9847 + }, + { + "epoch": 0.9496624879459981, + "grad_norm": 1.31087064743042, + "learning_rate": 3.131460107902745e-07, + "loss": 5.4046, + "step": 9848 + }, + { + "epoch": 0.9497589199614271, + "grad_norm": 1.10802161693573, + "learning_rate": 3.1195215082022166e-07, + "loss": 5.324, + "step": 9849 + }, + { + "epoch": 0.9498553519768563, + "grad_norm": 1.6523051261901855, + "learning_rate": 3.1076055668881886e-07, + "loss": 5.4515, + "step": 9850 + }, + { + "epoch": 0.9499517839922854, + "grad_norm": 1.5694693326950073, + "learning_rate": 3.095712285054175e-07, + "loss": 5.2893, + "step": 9851 + }, + { + "epoch": 0.9500482160077146, + "grad_norm": 1.184949517250061, + "learning_rate": 3.083841663791859e-07, + "loss": 5.3501, + "step": 9852 + }, + { + "epoch": 0.9501446480231437, + "grad_norm": 1.180708646774292, + "learning_rate": 3.07199370419059e-07, + "loss": 5.3283, + "step": 9853 + }, + { + "epoch": 0.9502410800385728, + "grad_norm": 1.2009575366973877, + "learning_rate": 3.0601684073378603e-07, + "loss": 5.2103, + "step": 9854 + }, + { + "epoch": 0.950337512054002, + "grad_norm": 1.5031352043151855, + "learning_rate": 3.04836577431894e-07, + "loss": 5.2067, + "step": 9855 + }, + { + "epoch": 0.950433944069431, + "grad_norm": 1.2826828956604004, + "learning_rate": 3.036585806217046e-07, + "loss": 5.325, + "step": 9856 + }, + { + "epoch": 0.9505303760848601, + "grad_norm": 1.4097399711608887, + "learning_rate": 3.0248285041133694e-07, + "loss": 5.3246, + "step": 9857 + }, + { + "epoch": 0.9506268081002893, + "grad_norm": 1.496782898902893, + "learning_rate": 3.013093869086936e-07, + "loss": 5.135, + "step": 9858 + }, + { + "epoch": 0.9507232401157184, + "grad_norm": 1.3298319578170776, + "learning_rate": 3.001381902214745e-07, + "loss": 5.2439, + "step": 9859 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 1.598124623298645, + "learning_rate": 2.9896926045717423e-07, + "loss": 5.3232, + "step": 9860 + }, + { + "epoch": 0.9509161041465767, + "grad_norm": 1.1590317487716675, + "learning_rate": 2.978025977230736e-07, + "loss": 5.335, + "step": 9861 + }, + { + "epoch": 0.9510125361620058, + "grad_norm": 1.3820542097091675, + "learning_rate": 2.966382021262426e-07, + "loss": 5.3179, + "step": 9862 + }, + { + "epoch": 0.951108968177435, + "grad_norm": 1.6753008365631104, + "learning_rate": 2.954760737735568e-07, + "loss": 5.1132, + "step": 9863 + }, + { + "epoch": 0.951205400192864, + "grad_norm": 1.1610345840454102, + "learning_rate": 2.94316212771667e-07, + "loss": 5.2866, + "step": 9864 + }, + { + "epoch": 0.9513018322082931, + "grad_norm": 1.1046557426452637, + "learning_rate": 2.931586192270325e-07, + "loss": 5.3412, + "step": 9865 + }, + { + "epoch": 0.9513982642237223, + "grad_norm": 1.2553386688232422, + "learning_rate": 2.920032932458877e-07, + "loss": 5.3939, + "step": 9866 + }, + { + "epoch": 0.9514946962391514, + "grad_norm": 1.4134125709533691, + "learning_rate": 2.9085023493427013e-07, + "loss": 5.4411, + "step": 9867 + }, + { + "epoch": 0.9515911282545805, + "grad_norm": 1.6061986684799194, + "learning_rate": 2.896994443980061e-07, + "loss": 5.3636, + "step": 9868 + }, + { + "epoch": 0.9516875602700097, + "grad_norm": 1.49628484249115, + "learning_rate": 2.8855092174271404e-07, + "loss": 5.2248, + "step": 9869 + }, + { + "epoch": 0.9517839922854388, + "grad_norm": 1.6387358903884888, + "learning_rate": 2.874046670738012e-07, + "loss": 5.3588, + "step": 9870 + }, + { + "epoch": 0.9518804243008678, + "grad_norm": 1.095948338508606, + "learning_rate": 2.8626068049647513e-07, + "loss": 5.5468, + "step": 9871 + }, + { + "epoch": 0.951976856316297, + "grad_norm": 1.259613037109375, + "learning_rate": 2.8511896211572407e-07, + "loss": 5.3846, + "step": 9872 + }, + { + "epoch": 0.9520732883317261, + "grad_norm": 1.2995593547821045, + "learning_rate": 2.8397951203633645e-07, + "loss": 5.235, + "step": 9873 + }, + { + "epoch": 0.9521697203471553, + "grad_norm": 1.0939209461212158, + "learning_rate": 2.8284233036288966e-07, + "loss": 5.1506, + "step": 9874 + }, + { + "epoch": 0.9522661523625844, + "grad_norm": 1.1627044677734375, + "learning_rate": 2.817074171997502e-07, + "loss": 5.1895, + "step": 9875 + }, + { + "epoch": 0.9523625843780135, + "grad_norm": 1.6477631330490112, + "learning_rate": 2.805747726510821e-07, + "loss": 5.2906, + "step": 9876 + }, + { + "epoch": 0.9524590163934427, + "grad_norm": 1.3334972858428955, + "learning_rate": 2.7944439682083547e-07, + "loss": 5.3311, + "step": 9877 + }, + { + "epoch": 0.9525554484088717, + "grad_norm": 1.4297128915786743, + "learning_rate": 2.7831628981275506e-07, + "loss": 5.5383, + "step": 9878 + }, + { + "epoch": 0.9526518804243008, + "grad_norm": 1.4162352085113525, + "learning_rate": 2.7719045173038307e-07, + "loss": 5.3714, + "step": 9879 + }, + { + "epoch": 0.95274831243973, + "grad_norm": 1.336152195930481, + "learning_rate": 2.7606688267703685e-07, + "loss": 5.3131, + "step": 9880 + }, + { + "epoch": 0.9528447444551591, + "grad_norm": 1.192915439605713, + "learning_rate": 2.7494558275584506e-07, + "loss": 5.3106, + "step": 9881 + }, + { + "epoch": 0.9529411764705882, + "grad_norm": 1.7235239744186401, + "learning_rate": 2.738265520697142e-07, + "loss": 5.2045, + "step": 9882 + }, + { + "epoch": 0.9530376084860174, + "grad_norm": 2.0634665489196777, + "learning_rate": 2.7270979072135104e-07, + "loss": 5.2131, + "step": 9883 + }, + { + "epoch": 0.9531340405014465, + "grad_norm": 2.2736878395080566, + "learning_rate": 2.7159529881324863e-07, + "loss": 5.6076, + "step": 9884 + }, + { + "epoch": 0.9532304725168756, + "grad_norm": 1.189435601234436, + "learning_rate": 2.7048307644769456e-07, + "loss": 5.3532, + "step": 9885 + }, + { + "epoch": 0.9533269045323047, + "grad_norm": 1.1290884017944336, + "learning_rate": 2.693731237267655e-07, + "loss": 5.318, + "step": 9886 + }, + { + "epoch": 0.9534233365477338, + "grad_norm": 1.2113018035888672, + "learning_rate": 2.6826544075233274e-07, + "loss": 5.4884, + "step": 9887 + }, + { + "epoch": 0.953519768563163, + "grad_norm": 1.275884985923767, + "learning_rate": 2.671600276260594e-07, + "loss": 5.208, + "step": 9888 + }, + { + "epoch": 0.9536162005785921, + "grad_norm": 1.5118716955184937, + "learning_rate": 2.6605688444939493e-07, + "loss": 5.1724, + "step": 9889 + }, + { + "epoch": 0.9537126325940212, + "grad_norm": 1.662596583366394, + "learning_rate": 2.649560113235888e-07, + "loss": 5.3099, + "step": 9890 + }, + { + "epoch": 0.9538090646094504, + "grad_norm": 1.4361668825149536, + "learning_rate": 2.6385740834967696e-07, + "loss": 5.4154, + "step": 9891 + }, + { + "epoch": 0.9539054966248794, + "grad_norm": 1.4819908142089844, + "learning_rate": 2.627610756284843e-07, + "loss": 5.4828, + "step": 9892 + }, + { + "epoch": 0.9540019286403085, + "grad_norm": 1.7916646003723145, + "learning_rate": 2.616670132606358e-07, + "loss": 5.1541, + "step": 9893 + }, + { + "epoch": 0.9540983606557377, + "grad_norm": 1.2187044620513916, + "learning_rate": 2.6057522134654013e-07, + "loss": 5.2936, + "step": 9894 + }, + { + "epoch": 0.9541947926711668, + "grad_norm": 1.0722545385360718, + "learning_rate": 2.594856999864004e-07, + "loss": 5.3617, + "step": 9895 + }, + { + "epoch": 0.954291224686596, + "grad_norm": 1.0855417251586914, + "learning_rate": 2.583984492802144e-07, + "loss": 5.2081, + "step": 9896 + }, + { + "epoch": 0.9543876567020251, + "grad_norm": 1.7495590448379517, + "learning_rate": 2.573134693277635e-07, + "loss": 4.9738, + "step": 9897 + }, + { + "epoch": 0.9544840887174542, + "grad_norm": 2.031156063079834, + "learning_rate": 2.562307602286346e-07, + "loss": 5.0986, + "step": 9898 + }, + { + "epoch": 0.9545805207328834, + "grad_norm": 1.748236894607544, + "learning_rate": 2.5515032208218714e-07, + "loss": 5.3604, + "step": 9899 + }, + { + "epoch": 0.9546769527483124, + "grad_norm": 1.3061444759368896, + "learning_rate": 2.540721549875891e-07, + "loss": 5.3251, + "step": 9900 + }, + { + "epoch": 0.9547733847637415, + "grad_norm": 1.1534713506698608, + "learning_rate": 2.5299625904379175e-07, + "loss": 5.1934, + "step": 9901 + }, + { + "epoch": 0.9548698167791707, + "grad_norm": 1.6840927600860596, + "learning_rate": 2.5192263434953844e-07, + "loss": 5.4891, + "step": 9902 + }, + { + "epoch": 0.9549662487945998, + "grad_norm": 1.3938264846801758, + "learning_rate": 2.508512810033642e-07, + "loss": 5.421, + "step": 9903 + }, + { + "epoch": 0.9550626808100289, + "grad_norm": 1.2018568515777588, + "learning_rate": 2.497821991035987e-07, + "loss": 5.2945, + "step": 9904 + }, + { + "epoch": 0.9551591128254581, + "grad_norm": 1.3387399911880493, + "learning_rate": 2.487153887483634e-07, + "loss": 5.073, + "step": 9905 + }, + { + "epoch": 0.9552555448408871, + "grad_norm": 1.3370559215545654, + "learning_rate": 2.4765085003556346e-07, + "loss": 5.4156, + "step": 9906 + }, + { + "epoch": 0.9553519768563163, + "grad_norm": 1.1006243228912354, + "learning_rate": 2.465885830629039e-07, + "loss": 5.3406, + "step": 9907 + }, + { + "epoch": 0.9554484088717454, + "grad_norm": 1.1470842361450195, + "learning_rate": 2.4552858792787627e-07, + "loss": 5.1917, + "step": 9908 + }, + { + "epoch": 0.9555448408871745, + "grad_norm": 1.3299304246902466, + "learning_rate": 2.4447086472776936e-07, + "loss": 5.3009, + "step": 9909 + }, + { + "epoch": 0.9556412729026037, + "grad_norm": 1.5531268119812012, + "learning_rate": 2.4341541355965556e-07, + "loss": 5.393, + "step": 9910 + }, + { + "epoch": 0.9557377049180328, + "grad_norm": 1.4552465677261353, + "learning_rate": 2.423622345204074e-07, + "loss": 5.4027, + "step": 9911 + }, + { + "epoch": 0.9558341369334619, + "grad_norm": 1.5830988883972168, + "learning_rate": 2.413113277066781e-07, + "loss": 5.3248, + "step": 9912 + }, + { + "epoch": 0.9559305689488911, + "grad_norm": 1.1869444847106934, + "learning_rate": 2.4026269321492655e-07, + "loss": 5.3748, + "step": 9913 + }, + { + "epoch": 0.9560270009643201, + "grad_norm": 1.0763038396835327, + "learning_rate": 2.392163311413842e-07, + "loss": 5.4159, + "step": 9914 + }, + { + "epoch": 0.9561234329797492, + "grad_norm": 1.4654313325881958, + "learning_rate": 2.3817224158209917e-07, + "loss": 5.3722, + "step": 9915 + }, + { + "epoch": 0.9562198649951784, + "grad_norm": 0.9975356459617615, + "learning_rate": 2.371304246328837e-07, + "loss": 5.1779, + "step": 9916 + }, + { + "epoch": 0.9563162970106075, + "grad_norm": 1.2290928363800049, + "learning_rate": 2.3609088038936133e-07, + "loss": 5.0297, + "step": 9917 + }, + { + "epoch": 0.9564127290260367, + "grad_norm": 1.5445501804351807, + "learning_rate": 2.350536089469363e-07, + "loss": 5.4248, + "step": 9918 + }, + { + "epoch": 0.9565091610414658, + "grad_norm": 1.2575788497924805, + "learning_rate": 2.3401861040081296e-07, + "loss": 5.0515, + "step": 9919 + }, + { + "epoch": 0.9566055930568949, + "grad_norm": 1.2954515218734741, + "learning_rate": 2.3298588484597929e-07, + "loss": 5.3395, + "step": 9920 + }, + { + "epoch": 0.956702025072324, + "grad_norm": 1.1985416412353516, + "learning_rate": 2.3195543237721773e-07, + "loss": 5.3233, + "step": 9921 + }, + { + "epoch": 0.9567984570877531, + "grad_norm": 1.3838186264038086, + "learning_rate": 2.309272530890999e-07, + "loss": 5.2553, + "step": 9922 + }, + { + "epoch": 0.9568948891031822, + "grad_norm": 1.268344759941101, + "learning_rate": 2.2990134707599477e-07, + "loss": 5.2473, + "step": 9923 + }, + { + "epoch": 0.9569913211186114, + "grad_norm": 1.5177050828933716, + "learning_rate": 2.288777144320603e-07, + "loss": 5.4609, + "step": 9924 + }, + { + "epoch": 0.9570877531340405, + "grad_norm": 1.0367518663406372, + "learning_rate": 2.2785635525123806e-07, + "loss": 5.3673, + "step": 9925 + }, + { + "epoch": 0.9571841851494696, + "grad_norm": 1.4583849906921387, + "learning_rate": 2.2683726962726692e-07, + "loss": 5.3436, + "step": 9926 + }, + { + "epoch": 0.9572806171648988, + "grad_norm": 1.2137105464935303, + "learning_rate": 2.258204576536832e-07, + "loss": 5.338, + "step": 9927 + }, + { + "epoch": 0.9573770491803278, + "grad_norm": 1.1425981521606445, + "learning_rate": 2.248059194238067e-07, + "loss": 5.2405, + "step": 9928 + }, + { + "epoch": 0.957473481195757, + "grad_norm": 1.6088683605194092, + "learning_rate": 2.23793655030749e-07, + "loss": 5.0676, + "step": 9929 + }, + { + "epoch": 0.9575699132111861, + "grad_norm": 1.6979190111160278, + "learning_rate": 2.2278366456741362e-07, + "loss": 5.1813, + "step": 9930 + }, + { + "epoch": 0.9576663452266152, + "grad_norm": 1.4771100282669067, + "learning_rate": 2.217759481264986e-07, + "loss": 5.507, + "step": 9931 + }, + { + "epoch": 0.9577627772420444, + "grad_norm": 1.0912843942642212, + "learning_rate": 2.2077050580049107e-07, + "loss": 5.4533, + "step": 9932 + }, + { + "epoch": 0.9578592092574735, + "grad_norm": 1.322654366493225, + "learning_rate": 2.1976733768166725e-07, + "loss": 5.0943, + "step": 9933 + }, + { + "epoch": 0.9579556412729026, + "grad_norm": 1.0288156270980835, + "learning_rate": 2.1876644386209789e-07, + "loss": 5.4974, + "step": 9934 + }, + { + "epoch": 0.9580520732883318, + "grad_norm": 1.307157039642334, + "learning_rate": 2.1776782443364563e-07, + "loss": 5.3566, + "step": 9935 + }, + { + "epoch": 0.9581485053037608, + "grad_norm": 2.3416192531585693, + "learning_rate": 2.1677147948795662e-07, + "loss": 5.1983, + "step": 9936 + }, + { + "epoch": 0.9582449373191899, + "grad_norm": 1.3305103778839111, + "learning_rate": 2.157774091164799e-07, + "loss": 5.4051, + "step": 9937 + }, + { + "epoch": 0.9583413693346191, + "grad_norm": 1.854491114616394, + "learning_rate": 2.147856134104481e-07, + "loss": 5.47, + "step": 9938 + }, + { + "epoch": 0.9584378013500482, + "grad_norm": 3.1513309478759766, + "learning_rate": 2.1379609246088562e-07, + "loss": 5.0907, + "step": 9939 + }, + { + "epoch": 0.9585342333654774, + "grad_norm": 1.3120163679122925, + "learning_rate": 2.128088463586142e-07, + "loss": 5.274, + "step": 9940 + }, + { + "epoch": 0.9586306653809065, + "grad_norm": 1.7904918193817139, + "learning_rate": 2.118238751942364e-07, + "loss": 5.0869, + "step": 9941 + }, + { + "epoch": 0.9587270973963355, + "grad_norm": 1.4450702667236328, + "learning_rate": 2.1084117905815492e-07, + "loss": 5.2797, + "step": 9942 + }, + { + "epoch": 0.9588235294117647, + "grad_norm": 1.0668994188308716, + "learning_rate": 2.0986075804055871e-07, + "loss": 5.3357, + "step": 9943 + }, + { + "epoch": 0.9589199614271938, + "grad_norm": 1.2327977418899536, + "learning_rate": 2.0888261223143135e-07, + "loss": 5.259, + "step": 9944 + }, + { + "epoch": 0.9590163934426229, + "grad_norm": 2.110821485519409, + "learning_rate": 2.0790674172054546e-07, + "loss": 5.2956, + "step": 9945 + }, + { + "epoch": 0.9591128254580521, + "grad_norm": 1.409550428390503, + "learning_rate": 2.0693314659746278e-07, + "loss": 5.2103, + "step": 9946 + }, + { + "epoch": 0.9592092574734812, + "grad_norm": 1.0025298595428467, + "learning_rate": 2.059618269515423e-07, + "loss": 5.2412, + "step": 9947 + }, + { + "epoch": 0.9593056894889104, + "grad_norm": 1.7306405305862427, + "learning_rate": 2.0499278287192668e-07, + "loss": 5.4746, + "step": 9948 + }, + { + "epoch": 0.9594021215043395, + "grad_norm": 1.7942514419555664, + "learning_rate": 2.0402601444755863e-07, + "loss": 5.6599, + "step": 9949 + }, + { + "epoch": 0.9594985535197685, + "grad_norm": 1.1919625997543335, + "learning_rate": 2.0306152176716164e-07, + "loss": 5.2649, + "step": 9950 + }, + { + "epoch": 0.9595949855351977, + "grad_norm": 1.3887269496917725, + "learning_rate": 2.0209930491925932e-07, + "loss": 5.178, + "step": 9951 + }, + { + "epoch": 0.9596914175506268, + "grad_norm": 1.2810472249984741, + "learning_rate": 2.011393639921616e-07, + "loss": 5.1816, + "step": 9952 + }, + { + "epoch": 0.9597878495660559, + "grad_norm": 1.3845107555389404, + "learning_rate": 2.0018169907397023e-07, + "loss": 5.0433, + "step": 9953 + }, + { + "epoch": 0.9598842815814851, + "grad_norm": 1.3944848775863647, + "learning_rate": 1.9922631025257877e-07, + "loss": 5.3489, + "step": 9954 + }, + { + "epoch": 0.9599807135969142, + "grad_norm": 1.049604892730713, + "learning_rate": 1.9827319761567264e-07, + "loss": 5.2419, + "step": 9955 + }, + { + "epoch": 0.9600771456123433, + "grad_norm": 1.2207733392715454, + "learning_rate": 1.9732236125072634e-07, + "loss": 5.5242, + "step": 9956 + }, + { + "epoch": 0.9601735776277724, + "grad_norm": 1.150579571723938, + "learning_rate": 1.9637380124500616e-07, + "loss": 5.2844, + "step": 9957 + }, + { + "epoch": 0.9602700096432015, + "grad_norm": 1.1825437545776367, + "learning_rate": 1.9542751768557022e-07, + "loss": 5.3813, + "step": 9958 + }, + { + "epoch": 0.9603664416586307, + "grad_norm": 1.1638010740280151, + "learning_rate": 1.9448351065926573e-07, + "loss": 5.3078, + "step": 9959 + }, + { + "epoch": 0.9604628736740598, + "grad_norm": 1.2963578701019287, + "learning_rate": 1.935417802527345e-07, + "loss": 5.1138, + "step": 9960 + }, + { + "epoch": 0.9605593056894889, + "grad_norm": 1.575345754623413, + "learning_rate": 1.9260232655241018e-07, + "loss": 5.4801, + "step": 9961 + }, + { + "epoch": 0.9606557377049181, + "grad_norm": 1.1096163988113403, + "learning_rate": 1.9166514964450988e-07, + "loss": 5.5118, + "step": 9962 + }, + { + "epoch": 0.9607521697203472, + "grad_norm": 2.050741672515869, + "learning_rate": 1.9073024961504538e-07, + "loss": 5.6069, + "step": 9963 + }, + { + "epoch": 0.9608486017357762, + "grad_norm": 1.4178133010864258, + "learning_rate": 1.897976265498258e-07, + "loss": 5.1955, + "step": 9964 + }, + { + "epoch": 0.9609450337512054, + "grad_norm": 1.1560649871826172, + "learning_rate": 1.888672805344438e-07, + "loss": 5.3663, + "step": 9965 + }, + { + "epoch": 0.9610414657666345, + "grad_norm": 2.217125654220581, + "learning_rate": 1.8793921165428385e-07, + "loss": 5.2829, + "step": 9966 + }, + { + "epoch": 0.9611378977820636, + "grad_norm": 1.2721534967422485, + "learning_rate": 1.8701341999452504e-07, + "loss": 5.4624, + "step": 9967 + }, + { + "epoch": 0.9612343297974928, + "grad_norm": 1.0812299251556396, + "learning_rate": 1.860899056401355e-07, + "loss": 5.5749, + "step": 9968 + }, + { + "epoch": 0.9613307618129219, + "grad_norm": 1.4552288055419922, + "learning_rate": 1.8516866867586969e-07, + "loss": 5.1057, + "step": 9969 + }, + { + "epoch": 0.9614271938283511, + "grad_norm": 1.4777604341506958, + "learning_rate": 1.842497091862877e-07, + "loss": 5.3331, + "step": 9970 + }, + { + "epoch": 0.9615236258437801, + "grad_norm": 1.1603130102157593, + "learning_rate": 1.8333302725571934e-07, + "loss": 5.1572, + "step": 9971 + }, + { + "epoch": 0.9616200578592092, + "grad_norm": 1.4587246179580688, + "learning_rate": 1.824186229683028e-07, + "loss": 5.2367, + "step": 9972 + }, + { + "epoch": 0.9617164898746384, + "grad_norm": 1.38663649559021, + "learning_rate": 1.815064964079599e-07, + "loss": 5.1075, + "step": 9973 + }, + { + "epoch": 0.9618129218900675, + "grad_norm": 1.2789568901062012, + "learning_rate": 1.8059664765840422e-07, + "loss": 5.3057, + "step": 9974 + }, + { + "epoch": 0.9619093539054966, + "grad_norm": 1.521555781364441, + "learning_rate": 1.7968907680314118e-07, + "loss": 5.1768, + "step": 9975 + }, + { + "epoch": 0.9620057859209258, + "grad_norm": 1.1323192119598389, + "learning_rate": 1.7878378392546524e-07, + "loss": 5.2577, + "step": 9976 + }, + { + "epoch": 0.9621022179363549, + "grad_norm": 1.2809720039367676, + "learning_rate": 1.7788076910845996e-07, + "loss": 5.3623, + "step": 9977 + }, + { + "epoch": 0.962198649951784, + "grad_norm": 2.1384615898132324, + "learning_rate": 1.7698003243501182e-07, + "loss": 5.6389, + "step": 9978 + }, + { + "epoch": 0.9622950819672131, + "grad_norm": 1.3929296731948853, + "learning_rate": 1.7608157398778247e-07, + "loss": 5.2561, + "step": 9979 + }, + { + "epoch": 0.9623915139826422, + "grad_norm": 1.5346969366073608, + "learning_rate": 1.751853938492337e-07, + "loss": 5.0822, + "step": 9980 + }, + { + "epoch": 0.9624879459980714, + "grad_norm": 1.3361661434173584, + "learning_rate": 1.7429149210161367e-07, + "loss": 5.2408, + "step": 9981 + }, + { + "epoch": 0.9625843780135005, + "grad_norm": 1.2390505075454712, + "learning_rate": 1.7339986882696502e-07, + "loss": 5.1824, + "step": 9982 + }, + { + "epoch": 0.9626808100289296, + "grad_norm": 1.9834283590316772, + "learning_rate": 1.7251052410711954e-07, + "loss": 5.3264, + "step": 9983 + }, + { + "epoch": 0.9627772420443588, + "grad_norm": 1.2382704019546509, + "learning_rate": 1.716234580237036e-07, + "loss": 5.2246, + "step": 9984 + }, + { + "epoch": 0.9628736740597879, + "grad_norm": 1.1969544887542725, + "learning_rate": 1.7073867065812432e-07, + "loss": 5.4596, + "step": 9985 + }, + { + "epoch": 0.9629701060752169, + "grad_norm": 1.400363564491272, + "learning_rate": 1.6985616209159173e-07, + "loss": 5.447, + "step": 9986 + }, + { + "epoch": 0.9630665380906461, + "grad_norm": 1.6409616470336914, + "learning_rate": 1.6897593240509934e-07, + "loss": 5.5584, + "step": 9987 + }, + { + "epoch": 0.9631629701060752, + "grad_norm": 1.7770899534225464, + "learning_rate": 1.680979816794326e-07, + "loss": 5.3869, + "step": 9988 + }, + { + "epoch": 0.9632594021215043, + "grad_norm": 1.199141502380371, + "learning_rate": 1.672223099951714e-07, + "loss": 5.1917, + "step": 9989 + }, + { + "epoch": 0.9633558341369335, + "grad_norm": 1.386657953262329, + "learning_rate": 1.663489174326821e-07, + "loss": 5.2729, + "step": 9990 + }, + { + "epoch": 0.9634522661523626, + "grad_norm": 1.0712543725967407, + "learning_rate": 1.6547780407212554e-07, + "loss": 5.1301, + "step": 9991 + }, + { + "epoch": 0.9635486981677918, + "grad_norm": 1.2046194076538086, + "learning_rate": 1.646089699934461e-07, + "loss": 5.349, + "step": 9992 + }, + { + "epoch": 0.9636451301832208, + "grad_norm": 1.17720627784729, + "learning_rate": 1.6374241527639112e-07, + "loss": 5.2356, + "step": 9993 + }, + { + "epoch": 0.9637415621986499, + "grad_norm": 1.051992416381836, + "learning_rate": 1.6287814000048862e-07, + "loss": 5.3039, + "step": 9994 + }, + { + "epoch": 0.9638379942140791, + "grad_norm": 1.4658501148223877, + "learning_rate": 1.6201614424506128e-07, + "loss": 5.1508, + "step": 9995 + }, + { + "epoch": 0.9639344262295082, + "grad_norm": 1.1645948886871338, + "learning_rate": 1.6115642808921804e-07, + "loss": 5.1756, + "step": 9996 + }, + { + "epoch": 0.9640308582449373, + "grad_norm": 1.4240474700927734, + "learning_rate": 1.6029899161187079e-07, + "loss": 5.2468, + "step": 9997 + }, + { + "epoch": 0.9641272902603665, + "grad_norm": 1.3341847658157349, + "learning_rate": 1.5944383489170656e-07, + "loss": 5.3903, + "step": 9998 + }, + { + "epoch": 0.9642237222757956, + "grad_norm": 1.287709355354309, + "learning_rate": 1.5859095800721257e-07, + "loss": 5.3977, + "step": 9999 + }, + { + "epoch": 0.9643201542912246, + "grad_norm": 1.1820942163467407, + "learning_rate": 1.577403610366679e-07, + "loss": 5.4921, + "step": 10000 + }, + { + "epoch": 0.9644165863066538, + "grad_norm": 0.9342209100723267, + "learning_rate": 1.5689204405813785e-07, + "loss": 5.204, + "step": 10001 + }, + { + "epoch": 0.9645130183220829, + "grad_norm": 0.9185497164726257, + "learning_rate": 1.5604600714947682e-07, + "loss": 5.5021, + "step": 10002 + }, + { + "epoch": 0.9646094503375121, + "grad_norm": 1.078493595123291, + "learning_rate": 1.5520225038833657e-07, + "loss": 5.4668, + "step": 10003 + }, + { + "epoch": 0.9647058823529412, + "grad_norm": 2.056882619857788, + "learning_rate": 1.5436077385215243e-07, + "loss": 5.0373, + "step": 10004 + }, + { + "epoch": 0.9648023143683703, + "grad_norm": 1.6654815673828125, + "learning_rate": 1.5352157761815977e-07, + "loss": 5.0144, + "step": 10005 + }, + { + "epoch": 0.9648987463837995, + "grad_norm": 1.358519434928894, + "learning_rate": 1.5268466176337204e-07, + "loss": 5.3066, + "step": 10006 + }, + { + "epoch": 0.9649951783992285, + "grad_norm": 1.210831642150879, + "learning_rate": 1.518500263646083e-07, + "loss": 5.5641, + "step": 10007 + }, + { + "epoch": 0.9650916104146576, + "grad_norm": 1.9719613790512085, + "learning_rate": 1.510176714984629e-07, + "loss": 5.1671, + "step": 10008 + }, + { + "epoch": 0.9651880424300868, + "grad_norm": 1.084535002708435, + "learning_rate": 1.5018759724133302e-07, + "loss": 5.0859, + "step": 10009 + }, + { + "epoch": 0.9652844744455159, + "grad_norm": 1.3286476135253906, + "learning_rate": 1.493598036693994e-07, + "loss": 5.2094, + "step": 10010 + }, + { + "epoch": 0.965380906460945, + "grad_norm": 2.0051262378692627, + "learning_rate": 1.4853429085863734e-07, + "loss": 5.3442, + "step": 10011 + }, + { + "epoch": 0.9654773384763742, + "grad_norm": 2.1022799015045166, + "learning_rate": 1.477110588848113e-07, + "loss": 5.3116, + "step": 10012 + }, + { + "epoch": 0.9655737704918033, + "grad_norm": 2.094756603240967, + "learning_rate": 1.468901078234747e-07, + "loss": 5.3063, + "step": 10013 + }, + { + "epoch": 0.9656702025072325, + "grad_norm": 1.1398682594299316, + "learning_rate": 1.4607143774997555e-07, + "loss": 5.2797, + "step": 10014 + }, + { + "epoch": 0.9657666345226615, + "grad_norm": 1.2985601425170898, + "learning_rate": 1.4525504873944828e-07, + "loss": 5.4041, + "step": 10015 + }, + { + "epoch": 0.9658630665380906, + "grad_norm": 1.3209110498428345, + "learning_rate": 1.4444094086682457e-07, + "loss": 5.291, + "step": 10016 + }, + { + "epoch": 0.9659594985535198, + "grad_norm": 1.4110478162765503, + "learning_rate": 1.436291142068169e-07, + "loss": 5.1659, + "step": 10017 + }, + { + "epoch": 0.9660559305689489, + "grad_norm": 1.247424602508545, + "learning_rate": 1.4281956883393787e-07, + "loss": 5.1295, + "step": 10018 + }, + { + "epoch": 0.966152362584378, + "grad_norm": 1.5903699398040771, + "learning_rate": 1.4201230482248086e-07, + "loss": 5.1648, + "step": 10019 + }, + { + "epoch": 0.9662487945998072, + "grad_norm": 1.4850927591323853, + "learning_rate": 1.4120732224654488e-07, + "loss": 5.2412, + "step": 10020 + }, + { + "epoch": 0.9663452266152363, + "grad_norm": 1.3914320468902588, + "learning_rate": 1.4040462117999865e-07, + "loss": 5.3106, + "step": 10021 + }, + { + "epoch": 0.9664416586306653, + "grad_norm": 1.6105468273162842, + "learning_rate": 1.396042016965221e-07, + "loss": 5.3891, + "step": 10022 + }, + { + "epoch": 0.9665380906460945, + "grad_norm": 1.301969051361084, + "learning_rate": 1.3880606386957318e-07, + "loss": 5.2186, + "step": 10023 + }, + { + "epoch": 0.9666345226615236, + "grad_norm": 1.2666258811950684, + "learning_rate": 1.3801020777240435e-07, + "loss": 5.4029, + "step": 10024 + }, + { + "epoch": 0.9667309546769528, + "grad_norm": 1.2258620262145996, + "learning_rate": 1.3721663347806002e-07, + "loss": 5.3091, + "step": 10025 + }, + { + "epoch": 0.9668273866923819, + "grad_norm": 1.5496736764907837, + "learning_rate": 1.3642534105937076e-07, + "loss": 5.3571, + "step": 10026 + }, + { + "epoch": 0.966923818707811, + "grad_norm": 1.4250679016113281, + "learning_rate": 1.3563633058896185e-07, + "loss": 5.3204, + "step": 10027 + }, + { + "epoch": 0.9670202507232402, + "grad_norm": 1.2233341932296753, + "learning_rate": 1.348496021392476e-07, + "loss": 5.2774, + "step": 10028 + }, + { + "epoch": 0.9671166827386692, + "grad_norm": 1.445744276046753, + "learning_rate": 1.340651557824313e-07, + "loss": 5.3987, + "step": 10029 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 1.2719699144363403, + "learning_rate": 1.3328299159051371e-07, + "loss": 5.301, + "step": 10030 + }, + { + "epoch": 0.9673095467695275, + "grad_norm": 1.153454303741455, + "learning_rate": 1.3250310963527358e-07, + "loss": 5.0306, + "step": 10031 + }, + { + "epoch": 0.9674059787849566, + "grad_norm": 1.2746140956878662, + "learning_rate": 1.3172550998829246e-07, + "loss": 5.1647, + "step": 10032 + }, + { + "epoch": 0.9675024108003857, + "grad_norm": 1.2586402893066406, + "learning_rate": 1.3095019272093555e-07, + "loss": 5.2319, + "step": 10033 + }, + { + "epoch": 0.9675988428158149, + "grad_norm": 1.1483099460601807, + "learning_rate": 1.3017715790436257e-07, + "loss": 5.3665, + "step": 10034 + }, + { + "epoch": 0.967695274831244, + "grad_norm": 2.388522148132324, + "learning_rate": 1.2940640560951956e-07, + "loss": 5.5798, + "step": 10035 + }, + { + "epoch": 0.9677917068466731, + "grad_norm": 1.2066377401351929, + "learning_rate": 1.2863793590714435e-07, + "loss": 5.2322, + "step": 10036 + }, + { + "epoch": 0.9678881388621022, + "grad_norm": 1.839813232421875, + "learning_rate": 1.2787174886776943e-07, + "loss": 5.3567, + "step": 10037 + }, + { + "epoch": 0.9679845708775313, + "grad_norm": 2.472959280014038, + "learning_rate": 1.2710784456171354e-07, + "loss": 5.446, + "step": 10038 + }, + { + "epoch": 0.9680810028929605, + "grad_norm": 1.3902734518051147, + "learning_rate": 1.263462230590845e-07, + "loss": 5.2244, + "step": 10039 + }, + { + "epoch": 0.9681774349083896, + "grad_norm": 1.2038782835006714, + "learning_rate": 1.2558688442978473e-07, + "loss": 5.0754, + "step": 10040 + }, + { + "epoch": 0.9682738669238187, + "grad_norm": 1.7657151222229004, + "learning_rate": 1.2482982874350845e-07, + "loss": 5.131, + "step": 10041 + }, + { + "epoch": 0.9683702989392479, + "grad_norm": 1.2111022472381592, + "learning_rate": 1.2407505606973069e-07, + "loss": 5.1973, + "step": 10042 + }, + { + "epoch": 0.968466730954677, + "grad_norm": 1.1063989400863647, + "learning_rate": 1.2332256647772654e-07, + "loss": 5.4181, + "step": 10043 + }, + { + "epoch": 0.968563162970106, + "grad_norm": 1.0669668912887573, + "learning_rate": 1.2257236003656025e-07, + "loss": 5.3861, + "step": 10044 + }, + { + "epoch": 0.9686595949855352, + "grad_norm": 1.6980376243591309, + "learning_rate": 1.2182443681508505e-07, + "loss": 5.3487, + "step": 10045 + }, + { + "epoch": 0.9687560270009643, + "grad_norm": 1.3404821157455444, + "learning_rate": 1.2107879688194047e-07, + "loss": 5.5275, + "step": 10046 + }, + { + "epoch": 0.9688524590163935, + "grad_norm": 1.3109310865402222, + "learning_rate": 1.2033544030556622e-07, + "loss": 5.3418, + "step": 10047 + }, + { + "epoch": 0.9689488910318226, + "grad_norm": 1.1986862421035767, + "learning_rate": 1.1959436715417994e-07, + "loss": 5.4081, + "step": 10048 + }, + { + "epoch": 0.9690453230472517, + "grad_norm": 1.2184828519821167, + "learning_rate": 1.1885557749579945e-07, + "loss": 5.2211, + "step": 10049 + }, + { + "epoch": 0.9691417550626809, + "grad_norm": 1.114328145980835, + "learning_rate": 1.1811907139823441e-07, + "loss": 5.482, + "step": 10050 + }, + { + "epoch": 0.9692381870781099, + "grad_norm": 1.543430209159851, + "learning_rate": 1.1738484892907242e-07, + "loss": 5.3389, + "step": 10051 + }, + { + "epoch": 0.969334619093539, + "grad_norm": 1.4682148694992065, + "learning_rate": 1.1665291015570402e-07, + "loss": 5.4447, + "step": 10052 + }, + { + "epoch": 0.9694310511089682, + "grad_norm": 1.3803000450134277, + "learning_rate": 1.15923255145306e-07, + "loss": 5.364, + "step": 10053 + }, + { + "epoch": 0.9695274831243973, + "grad_norm": 1.188308596611023, + "learning_rate": 1.1519588396484427e-07, + "loss": 5.5007, + "step": 10054 + }, + { + "epoch": 0.9696239151398264, + "grad_norm": 1.2890838384628296, + "learning_rate": 1.1447079668107652e-07, + "loss": 5.1415, + "step": 10055 + }, + { + "epoch": 0.9697203471552556, + "grad_norm": 1.2199373245239258, + "learning_rate": 1.137479933605523e-07, + "loss": 5.2499, + "step": 10056 + }, + { + "epoch": 0.9698167791706847, + "grad_norm": 1.0593889951705933, + "learning_rate": 1.130274740696019e-07, + "loss": 5.4752, + "step": 10057 + }, + { + "epoch": 0.9699132111861138, + "grad_norm": 1.073594570159912, + "learning_rate": 1.1230923887436406e-07, + "loss": 5.1671, + "step": 10058 + }, + { + "epoch": 0.9700096432015429, + "grad_norm": 1.2790136337280273, + "learning_rate": 1.1159328784074719e-07, + "loss": 5.2778, + "step": 10059 + }, + { + "epoch": 0.970106075216972, + "grad_norm": 1.485177755355835, + "learning_rate": 1.1087962103447092e-07, + "loss": 5.21, + "step": 10060 + }, + { + "epoch": 0.9702025072324012, + "grad_norm": 1.4525209665298462, + "learning_rate": 1.1016823852102732e-07, + "loss": 5.238, + "step": 10061 + }, + { + "epoch": 0.9702989392478303, + "grad_norm": 1.041674017906189, + "learning_rate": 1.0945914036570859e-07, + "loss": 5.2938, + "step": 10062 + }, + { + "epoch": 0.9703953712632594, + "grad_norm": 1.3654624223709106, + "learning_rate": 1.0875232663359326e-07, + "loss": 5.4246, + "step": 10063 + }, + { + "epoch": 0.9704918032786886, + "grad_norm": 1.0955772399902344, + "learning_rate": 1.0804779738955717e-07, + "loss": 5.2826, + "step": 10064 + }, + { + "epoch": 0.9705882352941176, + "grad_norm": 1.1506872177124023, + "learning_rate": 1.0734555269825141e-07, + "loss": 5.233, + "step": 10065 + }, + { + "epoch": 0.9706846673095467, + "grad_norm": 1.4096359014511108, + "learning_rate": 1.0664559262413831e-07, + "loss": 5.2959, + "step": 10066 + }, + { + "epoch": 0.9707810993249759, + "grad_norm": 1.4919387102127075, + "learning_rate": 1.0594791723144981e-07, + "loss": 5.2384, + "step": 10067 + }, + { + "epoch": 0.970877531340405, + "grad_norm": 1.762961745262146, + "learning_rate": 1.0525252658422358e-07, + "loss": 5.3005, + "step": 10068 + }, + { + "epoch": 0.9709739633558342, + "grad_norm": 1.1749343872070312, + "learning_rate": 1.0455942074628078e-07, + "loss": 5.463, + "step": 10069 + }, + { + "epoch": 0.9710703953712633, + "grad_norm": 1.5215814113616943, + "learning_rate": 1.0386859978123165e-07, + "loss": 5.1392, + "step": 10070 + }, + { + "epoch": 0.9711668273866924, + "grad_norm": 1.9052096605300903, + "learning_rate": 1.0318006375248102e-07, + "loss": 5.1811, + "step": 10071 + }, + { + "epoch": 0.9712632594021215, + "grad_norm": 1.387021541595459, + "learning_rate": 1.0249381272322279e-07, + "loss": 5.296, + "step": 10072 + }, + { + "epoch": 0.9713596914175506, + "grad_norm": 1.1214683055877686, + "learning_rate": 1.0180984675643712e-07, + "loss": 5.281, + "step": 10073 + }, + { + "epoch": 0.9714561234329797, + "grad_norm": 1.2170169353485107, + "learning_rate": 1.0112816591489882e-07, + "loss": 5.5139, + "step": 10074 + }, + { + "epoch": 0.9715525554484089, + "grad_norm": 1.2026182413101196, + "learning_rate": 1.0044877026117172e-07, + "loss": 5.4415, + "step": 10075 + }, + { + "epoch": 0.971648987463838, + "grad_norm": 0.9782285094261169, + "learning_rate": 9.977165985760872e-08, + "loss": 5.3149, + "step": 10076 + }, + { + "epoch": 0.9717454194792671, + "grad_norm": 1.0969157218933105, + "learning_rate": 9.909683476636011e-08, + "loss": 5.3005, + "step": 10077 + }, + { + "epoch": 0.9718418514946963, + "grad_norm": 1.6562451124191284, + "learning_rate": 9.842429504935136e-08, + "loss": 5.4863, + "step": 10078 + }, + { + "epoch": 0.9719382835101253, + "grad_norm": 1.3988368511199951, + "learning_rate": 9.775404076831363e-08, + "loss": 5.2819, + "step": 10079 + }, + { + "epoch": 0.9720347155255545, + "grad_norm": 1.4844474792480469, + "learning_rate": 9.708607198476161e-08, + "loss": 4.9952, + "step": 10080 + }, + { + "epoch": 0.9721311475409836, + "grad_norm": 1.141634225845337, + "learning_rate": 9.642038875999903e-08, + "loss": 5.2605, + "step": 10081 + }, + { + "epoch": 0.9722275795564127, + "grad_norm": 1.5664708614349365, + "learning_rate": 9.57569911551215e-08, + "loss": 5.3217, + "step": 10082 + }, + { + "epoch": 0.9723240115718419, + "grad_norm": 2.214928388595581, + "learning_rate": 9.509587923101359e-08, + "loss": 5.3614, + "step": 10083 + }, + { + "epoch": 0.972420443587271, + "grad_norm": 1.517556071281433, + "learning_rate": 9.443705304835459e-08, + "loss": 5.049, + "step": 10084 + }, + { + "epoch": 0.9725168756027001, + "grad_norm": 1.1463929414749146, + "learning_rate": 9.378051266761001e-08, + "loss": 5.425, + "step": 10085 + }, + { + "epoch": 0.9726133076181293, + "grad_norm": 1.2858508825302124, + "learning_rate": 9.312625814903164e-08, + "loss": 5.3076, + "step": 10086 + }, + { + "epoch": 0.9727097396335583, + "grad_norm": 1.7069048881530762, + "learning_rate": 9.247428955267423e-08, + "loss": 4.8342, + "step": 10087 + }, + { + "epoch": 0.9728061716489874, + "grad_norm": 2.130887508392334, + "learning_rate": 9.182460693836769e-08, + "loss": 5.0139, + "step": 10088 + }, + { + "epoch": 0.9729026036644166, + "grad_norm": 1.3595716953277588, + "learning_rate": 9.117721036574212e-08, + "loss": 5.1972, + "step": 10089 + }, + { + "epoch": 0.9729990356798457, + "grad_norm": 1.2807053327560425, + "learning_rate": 9.053209989421386e-08, + "loss": 5.4636, + "step": 10090 + }, + { + "epoch": 0.9730954676952749, + "grad_norm": 2.260024309158325, + "learning_rate": 8.988927558299387e-08, + "loss": 5.401, + "step": 10091 + }, + { + "epoch": 0.973191899710704, + "grad_norm": 1.442013144493103, + "learning_rate": 8.924873749107387e-08, + "loss": 5.4375, + "step": 10092 + }, + { + "epoch": 0.973288331726133, + "grad_norm": 2.3913512229919434, + "learning_rate": 8.861048567724572e-08, + "loss": 5.4591, + "step": 10093 + }, + { + "epoch": 0.9733847637415622, + "grad_norm": 1.095559000968933, + "learning_rate": 8.797452020008478e-08, + "loss": 5.4418, + "step": 10094 + }, + { + "epoch": 0.9734811957569913, + "grad_norm": 1.0165406465530396, + "learning_rate": 8.734084111796381e-08, + "loss": 5.4471, + "step": 10095 + }, + { + "epoch": 0.9735776277724204, + "grad_norm": 1.2083901166915894, + "learning_rate": 8.670944848903628e-08, + "loss": 5.1804, + "step": 10096 + }, + { + "epoch": 0.9736740597878496, + "grad_norm": 1.2476553916931152, + "learning_rate": 8.608034237125029e-08, + "loss": 5.1073, + "step": 10097 + }, + { + "epoch": 0.9737704918032787, + "grad_norm": 0.9702637195587158, + "learning_rate": 8.545352282234853e-08, + "loss": 5.3193, + "step": 10098 + }, + { + "epoch": 0.9738669238187078, + "grad_norm": 1.0943925380706787, + "learning_rate": 8.482898989985722e-08, + "loss": 5.4137, + "step": 10099 + }, + { + "epoch": 0.973963355834137, + "grad_norm": 1.3025734424591064, + "learning_rate": 8.420674366109716e-08, + "loss": 5.1255, + "step": 10100 + }, + { + "epoch": 0.974059787849566, + "grad_norm": 1.5847201347351074, + "learning_rate": 8.358678416317267e-08, + "loss": 5.4203, + "step": 10101 + }, + { + "epoch": 0.9741562198649952, + "grad_norm": 1.2121161222457886, + "learning_rate": 8.296911146298825e-08, + "loss": 5.3793, + "step": 10102 + }, + { + "epoch": 0.9742526518804243, + "grad_norm": 0.9604878425598145, + "learning_rate": 8.235372561722909e-08, + "loss": 5.1058, + "step": 10103 + }, + { + "epoch": 0.9743490838958534, + "grad_norm": 1.0124657154083252, + "learning_rate": 8.174062668238058e-08, + "loss": 5.5866, + "step": 10104 + }, + { + "epoch": 0.9744455159112826, + "grad_norm": 1.470929741859436, + "learning_rate": 8.112981471470327e-08, + "loss": 5.4116, + "step": 10105 + }, + { + "epoch": 0.9745419479267117, + "grad_norm": 1.3947657346725464, + "learning_rate": 8.052128977026341e-08, + "loss": 5.1111, + "step": 10106 + }, + { + "epoch": 0.9746383799421408, + "grad_norm": 1.3869082927703857, + "learning_rate": 7.9915051904908e-08, + "loss": 5.3116, + "step": 10107 + }, + { + "epoch": 0.97473481195757, + "grad_norm": 1.3075939416885376, + "learning_rate": 7.931110117427864e-08, + "loss": 5.1245, + "step": 10108 + }, + { + "epoch": 0.974831243972999, + "grad_norm": 1.6851094961166382, + "learning_rate": 7.870943763380323e-08, + "loss": 5.2338, + "step": 10109 + }, + { + "epoch": 0.9749276759884281, + "grad_norm": 1.152250051498413, + "learning_rate": 7.811006133870424e-08, + "loss": 5.3257, + "step": 10110 + }, + { + "epoch": 0.9750241080038573, + "grad_norm": 0.9795024991035461, + "learning_rate": 7.75129723439877e-08, + "loss": 5.1916, + "step": 10111 + }, + { + "epoch": 0.9751205400192864, + "grad_norm": 1.3423633575439453, + "learning_rate": 7.691817070445694e-08, + "loss": 5.3989, + "step": 10112 + }, + { + "epoch": 0.9752169720347156, + "grad_norm": 1.3220996856689453, + "learning_rate": 7.632565647469892e-08, + "loss": 5.2457, + "step": 10113 + }, + { + "epoch": 0.9753134040501447, + "grad_norm": 1.1833775043487549, + "learning_rate": 7.573542970909786e-08, + "loss": 5.424, + "step": 10114 + }, + { + "epoch": 0.9754098360655737, + "grad_norm": 1.3151181936264038, + "learning_rate": 7.514749046182434e-08, + "loss": 5.3503, + "step": 10115 + }, + { + "epoch": 0.9755062680810029, + "grad_norm": 2.3571553230285645, + "learning_rate": 7.456183878683243e-08, + "loss": 5.5602, + "step": 10116 + }, + { + "epoch": 0.975602700096432, + "grad_norm": 1.5214492082595825, + "learning_rate": 7.397847473787911e-08, + "loss": 5.3535, + "step": 10117 + }, + { + "epoch": 0.9756991321118611, + "grad_norm": 1.2084373235702515, + "learning_rate": 7.339739836850213e-08, + "loss": 5.2732, + "step": 10118 + }, + { + "epoch": 0.9757955641272903, + "grad_norm": 1.0838494300842285, + "learning_rate": 7.281860973203381e-08, + "loss": 5.495, + "step": 10119 + }, + { + "epoch": 0.9758919961427194, + "grad_norm": 1.467576026916504, + "learning_rate": 7.224210888159e-08, + "loss": 5.2216, + "step": 10120 + }, + { + "epoch": 0.9759884281581485, + "grad_norm": 1.0047286748886108, + "learning_rate": 7.166789587008672e-08, + "loss": 5.0523, + "step": 10121 + }, + { + "epoch": 0.9760848601735777, + "grad_norm": 1.959740161895752, + "learning_rate": 7.109597075022345e-08, + "loss": 5.1405, + "step": 10122 + }, + { + "epoch": 0.9761812921890067, + "grad_norm": 1.2117356061935425, + "learning_rate": 7.052633357448601e-08, + "loss": 5.262, + "step": 10123 + }, + { + "epoch": 0.9762777242044359, + "grad_norm": 2.0862395763397217, + "learning_rate": 6.995898439516036e-08, + "loss": 5.0766, + "step": 10124 + }, + { + "epoch": 0.976374156219865, + "grad_norm": 1.0166866779327393, + "learning_rate": 6.939392326431593e-08, + "loss": 5.2875, + "step": 10125 + }, + { + "epoch": 0.9764705882352941, + "grad_norm": 1.9621480703353882, + "learning_rate": 6.883115023381126e-08, + "loss": 5.255, + "step": 10126 + }, + { + "epoch": 0.9765670202507233, + "grad_norm": 1.3307042121887207, + "learning_rate": 6.827066535529946e-08, + "loss": 5.4053, + "step": 10127 + }, + { + "epoch": 0.9766634522661524, + "grad_norm": 1.0680809020996094, + "learning_rate": 6.771246868021719e-08, + "loss": 5.5223, + "step": 10128 + }, + { + "epoch": 0.9767598842815814, + "grad_norm": 1.1616321802139282, + "learning_rate": 6.715656025980122e-08, + "loss": 5.3182, + "step": 10129 + }, + { + "epoch": 0.9768563162970106, + "grad_norm": 1.226900339126587, + "learning_rate": 6.660294014506907e-08, + "loss": 5.2287, + "step": 10130 + }, + { + "epoch": 0.9769527483124397, + "grad_norm": 1.9055465459823608, + "learning_rate": 6.605160838683011e-08, + "loss": 5.5003, + "step": 10131 + }, + { + "epoch": 0.9770491803278688, + "grad_norm": 1.9077353477478027, + "learning_rate": 6.550256503568276e-08, + "loss": 5.0888, + "step": 10132 + }, + { + "epoch": 0.977145612343298, + "grad_norm": 1.270274043083191, + "learning_rate": 6.495581014202556e-08, + "loss": 5.27, + "step": 10133 + }, + { + "epoch": 0.9772420443587271, + "grad_norm": 1.2665079832077026, + "learning_rate": 6.441134375602953e-08, + "loss": 5.3577, + "step": 10134 + }, + { + "epoch": 0.9773384763741563, + "grad_norm": 1.3414952754974365, + "learning_rate": 6.386916592767133e-08, + "loss": 5.2256, + "step": 10135 + }, + { + "epoch": 0.9774349083895854, + "grad_norm": 1.3418956995010376, + "learning_rate": 6.332927670671119e-08, + "loss": 5.1928, + "step": 10136 + }, + { + "epoch": 0.9775313404050144, + "grad_norm": 1.7143350839614868, + "learning_rate": 6.279167614269832e-08, + "loss": 5.0481, + "step": 10137 + }, + { + "epoch": 0.9776277724204436, + "grad_norm": 1.391810417175293, + "learning_rate": 6.225636428497106e-08, + "loss": 5.0789, + "step": 10138 + }, + { + "epoch": 0.9777242044358727, + "grad_norm": 1.5211313962936401, + "learning_rate": 6.172334118266232e-08, + "loss": 5.1611, + "step": 10139 + }, + { + "epoch": 0.9778206364513018, + "grad_norm": 1.1593451499938965, + "learning_rate": 6.119260688468854e-08, + "loss": 5.0946, + "step": 10140 + }, + { + "epoch": 0.977917068466731, + "grad_norm": 1.3526231050491333, + "learning_rate": 6.066416143976628e-08, + "loss": 5.2587, + "step": 10141 + }, + { + "epoch": 0.9780135004821601, + "grad_norm": 1.1290918588638306, + "learning_rate": 6.013800489639009e-08, + "loss": 5.2069, + "step": 10142 + }, + { + "epoch": 0.9781099324975892, + "grad_norm": 1.9147388935089111, + "learning_rate": 5.961413730285192e-08, + "loss": 4.9987, + "step": 10143 + }, + { + "epoch": 0.9782063645130183, + "grad_norm": 1.1531740427017212, + "learning_rate": 5.9092558707232737e-08, + "loss": 5.0975, + "step": 10144 + }, + { + "epoch": 0.9783027965284474, + "grad_norm": 1.749523401260376, + "learning_rate": 5.85732691573998e-08, + "loss": 5.3488, + "step": 10145 + }, + { + "epoch": 0.9783992285438766, + "grad_norm": 1.5201691389083862, + "learning_rate": 5.8056268701014995e-08, + "loss": 5.3674, + "step": 10146 + }, + { + "epoch": 0.9784956605593057, + "grad_norm": 1.4121158123016357, + "learning_rate": 5.7541557385526465e-08, + "loss": 5.5268, + "step": 10147 + }, + { + "epoch": 0.9785920925747348, + "grad_norm": 1.4431129693984985, + "learning_rate": 5.7029135258174214e-08, + "loss": 5.1698, + "step": 10148 + }, + { + "epoch": 0.978688524590164, + "grad_norm": 1.426473617553711, + "learning_rate": 5.651900236599006e-08, + "loss": 5.462, + "step": 10149 + }, + { + "epoch": 0.9787849566055931, + "grad_norm": 1.4689453840255737, + "learning_rate": 5.601115875578933e-08, + "loss": 5.4571, + "step": 10150 + }, + { + "epoch": 0.9788813886210221, + "grad_norm": 1.122692584991455, + "learning_rate": 5.5505604474184734e-08, + "loss": 5.0194, + "step": 10151 + }, + { + "epoch": 0.9789778206364513, + "grad_norm": 1.1265424489974976, + "learning_rate": 5.5002339567575275e-08, + "loss": 5.2696, + "step": 10152 + }, + { + "epoch": 0.9790742526518804, + "grad_norm": 1.0074801445007324, + "learning_rate": 5.450136408214623e-08, + "loss": 5.1877, + "step": 10153 + }, + { + "epoch": 0.9791706846673095, + "grad_norm": 0.8983921408653259, + "learning_rate": 5.400267806388304e-08, + "loss": 5.1591, + "step": 10154 + }, + { + "epoch": 0.9792671166827387, + "grad_norm": 1.7483834028244019, + "learning_rate": 5.3506281558546313e-08, + "loss": 5.034, + "step": 10155 + }, + { + "epoch": 0.9793635486981678, + "grad_norm": 1.527286171913147, + "learning_rate": 5.301217461169961e-08, + "loss": 5.0633, + "step": 10156 + }, + { + "epoch": 0.979459980713597, + "grad_norm": 1.0076245069503784, + "learning_rate": 5.2520357268692755e-08, + "loss": 5.2582, + "step": 10157 + }, + { + "epoch": 0.979556412729026, + "grad_norm": 0.9799389243125916, + "learning_rate": 5.203082957466188e-08, + "loss": 5.1536, + "step": 10158 + }, + { + "epoch": 0.9796528447444551, + "grad_norm": 1.2233866453170776, + "learning_rate": 5.1543591574532145e-08, + "loss": 5.3299, + "step": 10159 + }, + { + "epoch": 0.9797492767598843, + "grad_norm": 1.2845500707626343, + "learning_rate": 5.1058643313028897e-08, + "loss": 5.334, + "step": 10160 + }, + { + "epoch": 0.9798457087753134, + "grad_norm": 1.2247728109359741, + "learning_rate": 5.0575984834655424e-08, + "loss": 5.2116, + "step": 10161 + }, + { + "epoch": 0.9799421407907425, + "grad_norm": 1.7021913528442383, + "learning_rate": 5.009561618370961e-08, + "loss": 5.1702, + "step": 10162 + }, + { + "epoch": 0.9800385728061717, + "grad_norm": 1.3847354650497437, + "learning_rate": 4.961753740427844e-08, + "loss": 5.483, + "step": 10163 + }, + { + "epoch": 0.9801350048216008, + "grad_norm": 1.220481514930725, + "learning_rate": 4.914174854024345e-08, + "loss": 5.5616, + "step": 10164 + }, + { + "epoch": 0.9802314368370298, + "grad_norm": 1.7424153089523315, + "learning_rate": 4.866824963526695e-08, + "loss": 5.8472, + "step": 10165 + }, + { + "epoch": 0.980327868852459, + "grad_norm": 1.5936084985733032, + "learning_rate": 4.819704073281139e-08, + "loss": 5.4444, + "step": 10166 + }, + { + "epoch": 0.9804243008678881, + "grad_norm": 1.26979398727417, + "learning_rate": 4.772812187611719e-08, + "loss": 5.3604, + "step": 10167 + }, + { + "epoch": 0.9805207328833173, + "grad_norm": 1.015939712524414, + "learning_rate": 4.726149310822769e-08, + "loss": 5.3095, + "step": 10168 + }, + { + "epoch": 0.9806171648987464, + "grad_norm": 1.2335330247879028, + "learning_rate": 4.67971544719642e-08, + "loss": 5.3923, + "step": 10169 + }, + { + "epoch": 0.9807135969141755, + "grad_norm": 1.7588969469070435, + "learning_rate": 4.633510600994817e-08, + "loss": 4.9141, + "step": 10170 + }, + { + "epoch": 0.9808100289296047, + "grad_norm": 0.9549494981765747, + "learning_rate": 4.58753477645818e-08, + "loss": 5.1986, + "step": 10171 + }, + { + "epoch": 0.9809064609450338, + "grad_norm": 1.176961898803711, + "learning_rate": 4.5417879778059115e-08, + "loss": 5.247, + "step": 10172 + }, + { + "epoch": 0.9810028929604628, + "grad_norm": 1.1613560914993286, + "learning_rate": 4.49627020923743e-08, + "loss": 5.1847, + "step": 10173 + }, + { + "epoch": 0.981099324975892, + "grad_norm": 1.1440726518630981, + "learning_rate": 4.450981474929672e-08, + "loss": 5.3496, + "step": 10174 + }, + { + "epoch": 0.9811957569913211, + "grad_norm": 1.7433459758758545, + "learning_rate": 4.405921779039313e-08, + "loss": 5.2013, + "step": 10175 + }, + { + "epoch": 0.9812921890067502, + "grad_norm": 2.7590880393981934, + "learning_rate": 4.361091125701655e-08, + "loss": 5.0004, + "step": 10176 + }, + { + "epoch": 0.9813886210221794, + "grad_norm": 2.0143916606903076, + "learning_rate": 4.316489519031464e-08, + "loss": 5.0975, + "step": 10177 + }, + { + "epoch": 0.9814850530376085, + "grad_norm": 1.2427701950073242, + "learning_rate": 4.272116963122408e-08, + "loss": 5.3088, + "step": 10178 + }, + { + "epoch": 0.9815814850530377, + "grad_norm": 1.465028166770935, + "learning_rate": 4.227973462046508e-08, + "loss": 5.2574, + "step": 10179 + }, + { + "epoch": 0.9816779170684667, + "grad_norm": 1.207133173942566, + "learning_rate": 4.184059019855524e-08, + "loss": 5.2895, + "step": 10180 + }, + { + "epoch": 0.9817743490838958, + "grad_norm": 1.2045694589614868, + "learning_rate": 4.1403736405798424e-08, + "loss": 5.3849, + "step": 10181 + }, + { + "epoch": 0.981870781099325, + "grad_norm": 1.494881272315979, + "learning_rate": 4.0969173282287554e-08, + "loss": 5.3879, + "step": 10182 + }, + { + "epoch": 0.9819672131147541, + "grad_norm": 1.3481597900390625, + "learning_rate": 4.053690086790463e-08, + "loss": 5.108, + "step": 10183 + }, + { + "epoch": 0.9820636451301832, + "grad_norm": 1.1774020195007324, + "learning_rate": 4.0106919202326234e-08, + "loss": 5.459, + "step": 10184 + }, + { + "epoch": 0.9821600771456124, + "grad_norm": 1.2824493646621704, + "learning_rate": 3.967922832501247e-08, + "loss": 5.3447, + "step": 10185 + }, + { + "epoch": 0.9822565091610415, + "grad_norm": 1.358713150024414, + "learning_rate": 3.9253828275218064e-08, + "loss": 5.5817, + "step": 10186 + }, + { + "epoch": 0.9823529411764705, + "grad_norm": 1.8295749425888062, + "learning_rate": 3.8830719091986764e-08, + "loss": 5.1171, + "step": 10187 + }, + { + "epoch": 0.9824493731918997, + "grad_norm": 1.8957033157348633, + "learning_rate": 3.840990081415141e-08, + "loss": 5.1626, + "step": 10188 + }, + { + "epoch": 0.9825458052073288, + "grad_norm": 2.4495606422424316, + "learning_rate": 3.7991373480331105e-08, + "loss": 5.2281, + "step": 10189 + }, + { + "epoch": 0.982642237222758, + "grad_norm": 1.9000794887542725, + "learning_rate": 3.757513712893956e-08, + "loss": 5.0975, + "step": 10190 + }, + { + "epoch": 0.9827386692381871, + "grad_norm": 1.9340952634811401, + "learning_rate": 3.7161191798176784e-08, + "loss": 5.3162, + "step": 10191 + }, + { + "epoch": 0.9828351012536162, + "grad_norm": 2.0077402591705322, + "learning_rate": 3.674953752603738e-08, + "loss": 5.2423, + "step": 10192 + }, + { + "epoch": 0.9829315332690454, + "grad_norm": 2.7946765422821045, + "learning_rate": 3.634017435029946e-08, + "loss": 5.0935, + "step": 10193 + }, + { + "epoch": 0.9830279652844744, + "grad_norm": 1.869299054145813, + "learning_rate": 3.593310230853575e-08, + "loss": 5.1183, + "step": 10194 + }, + { + "epoch": 0.9831243972999035, + "grad_norm": 1.964680790901184, + "learning_rate": 3.552832143810525e-08, + "loss": 5.0508, + "step": 10195 + }, + { + "epoch": 0.9832208293153327, + "grad_norm": 1.979008436203003, + "learning_rate": 3.5125831776158804e-08, + "loss": 5.0788, + "step": 10196 + }, + { + "epoch": 0.9833172613307618, + "grad_norm": 1.6503841876983643, + "learning_rate": 3.4725633359639074e-08, + "loss": 5.1813, + "step": 10197 + }, + { + "epoch": 0.9834136933461909, + "grad_norm": 2.109868049621582, + "learning_rate": 3.432772622527225e-08, + "loss": 5.0091, + "step": 10198 + }, + { + "epoch": 0.9835101253616201, + "grad_norm": 1.8196877241134644, + "learning_rate": 3.3932110409579107e-08, + "loss": 5.0047, + "step": 10199 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 1.7051199674606323, + "learning_rate": 3.3538785948866723e-08, + "loss": 4.9436, + "step": 10200 + }, + { + "epoch": 0.9837029893924784, + "grad_norm": 1.457876205444336, + "learning_rate": 3.314775287923677e-08, + "loss": 5.0451, + "step": 10201 + }, + { + "epoch": 0.9837994214079074, + "grad_norm": 2.250014305114746, + "learning_rate": 3.275901123657721e-08, + "loss": 4.9355, + "step": 10202 + }, + { + "epoch": 0.9838958534233365, + "grad_norm": 2.9177327156066895, + "learning_rate": 3.237256105656783e-08, + "loss": 5.071, + "step": 10203 + }, + { + "epoch": 0.9839922854387657, + "grad_norm": 1.4382871389389038, + "learning_rate": 3.198840237467193e-08, + "loss": 5.0243, + "step": 10204 + }, + { + "epoch": 0.9840887174541948, + "grad_norm": 1.621718168258667, + "learning_rate": 3.16065352261502e-08, + "loss": 5.1176, + "step": 10205 + }, + { + "epoch": 0.9841851494696239, + "grad_norm": 1.7714933156967163, + "learning_rate": 3.122695964605238e-08, + "loss": 5.0495, + "step": 10206 + }, + { + "epoch": 0.9842815814850531, + "grad_norm": 1.9737739562988281, + "learning_rate": 3.0849675669208935e-08, + "loss": 4.9633, + "step": 10207 + }, + { + "epoch": 0.9843780135004822, + "grad_norm": 1.625383734703064, + "learning_rate": 3.047468333025327e-08, + "loss": 4.9266, + "step": 10208 + }, + { + "epoch": 0.9844744455159112, + "grad_norm": 1.826066493988037, + "learning_rate": 3.010198266359676e-08, + "loss": 4.959, + "step": 10209 + }, + { + "epoch": 0.9845708775313404, + "grad_norm": 1.8063597679138184, + "learning_rate": 2.9731573703448145e-08, + "loss": 4.7903, + "step": 10210 + }, + { + "epoch": 0.9846673095467695, + "grad_norm": 1.8213714361190796, + "learning_rate": 2.9363456483802455e-08, + "loss": 5.0388, + "step": 10211 + }, + { + "epoch": 0.9847637415621987, + "grad_norm": 1.515777587890625, + "learning_rate": 2.8997631038443772e-08, + "loss": 5.1059, + "step": 10212 + }, + { + "epoch": 0.9848601735776278, + "grad_norm": 1.8277373313903809, + "learning_rate": 2.8634097400948023e-08, + "loss": 4.949, + "step": 10213 + }, + { + "epoch": 0.9849566055930569, + "grad_norm": 1.4263640642166138, + "learning_rate": 2.8272855604680183e-08, + "loss": 5.1317, + "step": 10214 + }, + { + "epoch": 0.9850530376084861, + "grad_norm": 1.5433131456375122, + "learning_rate": 2.7913905682797058e-08, + "loss": 5.1885, + "step": 10215 + }, + { + "epoch": 0.9851494696239151, + "grad_norm": 1.6363126039505005, + "learning_rate": 2.7557247668236196e-08, + "loss": 5.1092, + "step": 10216 + }, + { + "epoch": 0.9852459016393442, + "grad_norm": 1.7445919513702393, + "learning_rate": 2.720288159373807e-08, + "loss": 5.2313, + "step": 10217 + }, + { + "epoch": 0.9853423336547734, + "grad_norm": 1.745779275894165, + "learning_rate": 2.6850807491823892e-08, + "loss": 4.8513, + "step": 10218 + }, + { + "epoch": 0.9854387656702025, + "grad_norm": 1.563155174255371, + "learning_rate": 2.650102539480115e-08, + "loss": 5.0242, + "step": 10219 + }, + { + "epoch": 0.9855351976856316, + "grad_norm": 2.0225579738616943, + "learning_rate": 2.6153535334780264e-08, + "loss": 5.157, + "step": 10220 + }, + { + "epoch": 0.9856316297010608, + "grad_norm": 1.4893535375595093, + "learning_rate": 2.5808337343649625e-08, + "loss": 5.0714, + "step": 10221 + }, + { + "epoch": 0.9857280617164899, + "grad_norm": 1.9063140153884888, + "learning_rate": 2.5465431453092216e-08, + "loss": 4.8188, + "step": 10222 + }, + { + "epoch": 0.985824493731919, + "grad_norm": 1.4307782649993896, + "learning_rate": 2.5124817694577306e-08, + "loss": 5.0535, + "step": 10223 + }, + { + "epoch": 0.9859209257473481, + "grad_norm": 1.643080234527588, + "learning_rate": 2.4786496099366008e-08, + "loss": 5.1937, + "step": 10224 + }, + { + "epoch": 0.9860173577627772, + "grad_norm": 2.4217851161956787, + "learning_rate": 2.445046669851403e-08, + "loss": 5.2275, + "step": 10225 + }, + { + "epoch": 0.9861137897782064, + "grad_norm": 1.5828465223312378, + "learning_rate": 2.4116729522857818e-08, + "loss": 5.3195, + "step": 10226 + }, + { + "epoch": 0.9862102217936355, + "grad_norm": 1.9691423177719116, + "learning_rate": 2.3785284603025647e-08, + "loss": 5.495, + "step": 10227 + }, + { + "epoch": 0.9863066538090646, + "grad_norm": 1.893185019493103, + "learning_rate": 2.345613196944041e-08, + "loss": 5.1288, + "step": 10228 + }, + { + "epoch": 0.9864030858244938, + "grad_norm": 1.6365846395492554, + "learning_rate": 2.3129271652311268e-08, + "loss": 5.1717, + "step": 10229 + }, + { + "epoch": 0.9864995178399228, + "grad_norm": 1.65410315990448, + "learning_rate": 2.2804703681636453e-08, + "loss": 5.278, + "step": 10230 + }, + { + "epoch": 0.9865959498553519, + "grad_norm": 1.8867475986480713, + "learning_rate": 2.2482428087203243e-08, + "loss": 5.0973, + "step": 10231 + }, + { + "epoch": 0.9866923818707811, + "grad_norm": 3.0146005153656006, + "learning_rate": 2.216244489859076e-08, + "loss": 5.0858, + "step": 10232 + }, + { + "epoch": 0.9867888138862102, + "grad_norm": 2.942876100540161, + "learning_rate": 2.1844754145164403e-08, + "loss": 5.073, + "step": 10233 + }, + { + "epoch": 0.9868852459016394, + "grad_norm": 1.9070345163345337, + "learning_rate": 2.1529355856084177e-08, + "loss": 5.1887, + "step": 10234 + }, + { + "epoch": 0.9869816779170685, + "grad_norm": 2.542656898498535, + "learning_rate": 2.121625006029637e-08, + "loss": 4.7816, + "step": 10235 + }, + { + "epoch": 0.9870781099324976, + "grad_norm": 1.852657675743103, + "learning_rate": 2.0905436786539112e-08, + "loss": 4.9206, + "step": 10236 + }, + { + "epoch": 0.9871745419479268, + "grad_norm": 1.8167680501937866, + "learning_rate": 2.0596916063334025e-08, + "loss": 4.8428, + "step": 10237 + }, + { + "epoch": 0.9872709739633558, + "grad_norm": 1.4592914581298828, + "learning_rate": 2.0290687919000128e-08, + "loss": 5.148, + "step": 10238 + }, + { + "epoch": 0.9873674059787849, + "grad_norm": 1.5188255310058594, + "learning_rate": 1.9986752381642714e-08, + "loss": 5.0568, + "step": 10239 + }, + { + "epoch": 0.9874638379942141, + "grad_norm": 1.5164687633514404, + "learning_rate": 1.9685109479156138e-08, + "loss": 4.8735, + "step": 10240 + }, + { + "epoch": 0.9875602700096432, + "grad_norm": 1.581488847732544, + "learning_rate": 1.9385759239221035e-08, + "loss": 5.0731, + "step": 10241 + }, + { + "epoch": 0.9876567020250723, + "grad_norm": 1.387882113456726, + "learning_rate": 1.90887016893182e-08, + "loss": 5.065, + "step": 10242 + }, + { + "epoch": 0.9877531340405015, + "grad_norm": 1.4103105068206787, + "learning_rate": 1.8793936856706383e-08, + "loss": 5.003, + "step": 10243 + }, + { + "epoch": 0.9878495660559306, + "grad_norm": 2.006037950515747, + "learning_rate": 1.8501464768441724e-08, + "loss": 5.0664, + "step": 10244 + }, + { + "epoch": 0.9879459980713597, + "grad_norm": 1.173508644104004, + "learning_rate": 1.8211285451363858e-08, + "loss": 5.0531, + "step": 10245 + }, + { + "epoch": 0.9880424300867888, + "grad_norm": 1.3495467901229858, + "learning_rate": 1.792339893210704e-08, + "loss": 5.1425, + "step": 10246 + }, + { + "epoch": 0.9881388621022179, + "grad_norm": 1.379981517791748, + "learning_rate": 1.7637805237094573e-08, + "loss": 5.0434, + "step": 10247 + }, + { + "epoch": 0.9882352941176471, + "grad_norm": 2.151252508163452, + "learning_rate": 1.735450439253328e-08, + "loss": 5.0563, + "step": 10248 + }, + { + "epoch": 0.9883317261330762, + "grad_norm": 1.8491114377975464, + "learning_rate": 1.7073496424427348e-08, + "loss": 5.044, + "step": 10249 + }, + { + "epoch": 0.9884281581485053, + "grad_norm": 2.0495822429656982, + "learning_rate": 1.679478135856727e-08, + "loss": 4.9448, + "step": 10250 + }, + { + "epoch": 0.9885245901639345, + "grad_norm": 1.7392712831497192, + "learning_rate": 1.6518359220535352e-08, + "loss": 5.0516, + "step": 10251 + }, + { + "epoch": 0.9886210221793635, + "grad_norm": 1.339551329612732, + "learning_rate": 1.624423003569464e-08, + "loss": 4.9485, + "step": 10252 + }, + { + "epoch": 0.9887174541947926, + "grad_norm": 1.4844669103622437, + "learning_rate": 1.5972393829211118e-08, + "loss": 4.8552, + "step": 10253 + }, + { + "epoch": 0.9888138862102218, + "grad_norm": 1.2545491456985474, + "learning_rate": 1.5702850626031497e-08, + "loss": 5.0136, + "step": 10254 + }, + { + "epoch": 0.9889103182256509, + "grad_norm": 1.4170689582824707, + "learning_rate": 1.543560045089154e-08, + "loss": 4.9401, + "step": 10255 + }, + { + "epoch": 0.9890067502410801, + "grad_norm": 2.171910285949707, + "learning_rate": 1.5170643328321633e-08, + "loss": 5.0361, + "step": 10256 + }, + { + "epoch": 0.9891031822565092, + "grad_norm": 1.6283208131790161, + "learning_rate": 1.490797928263843e-08, + "loss": 5.2986, + "step": 10257 + }, + { + "epoch": 0.9891996142719383, + "grad_norm": 1.481693983078003, + "learning_rate": 1.4647608337950425e-08, + "loss": 5.1816, + "step": 10258 + }, + { + "epoch": 0.9892960462873674, + "grad_norm": 1.8496779203414917, + "learning_rate": 1.4389530518152394e-08, + "loss": 5.159, + "step": 10259 + }, + { + "epoch": 0.9893924783027965, + "grad_norm": 3.483255624771118, + "learning_rate": 1.4133745846930945e-08, + "loss": 4.7823, + "step": 10260 + }, + { + "epoch": 0.9894889103182256, + "grad_norm": 2.098707675933838, + "learning_rate": 1.3880254347761745e-08, + "loss": 5.4047, + "step": 10261 + }, + { + "epoch": 0.9895853423336548, + "grad_norm": 1.9285451173782349, + "learning_rate": 1.3629056043909517e-08, + "loss": 5.4732, + "step": 10262 + }, + { + "epoch": 0.9896817743490839, + "grad_norm": 2.066678047180176, + "learning_rate": 1.3380150958430814e-08, + "loss": 5.5033, + "step": 10263 + }, + { + "epoch": 0.989778206364513, + "grad_norm": 1.263688564300537, + "learning_rate": 1.3133539114165705e-08, + "loss": 5.1097, + "step": 10264 + }, + { + "epoch": 0.9898746383799422, + "grad_norm": 2.592017889022827, + "learning_rate": 1.2889220533751634e-08, + "loss": 5.0726, + "step": 10265 + }, + { + "epoch": 0.9899710703953712, + "grad_norm": 1.4026459455490112, + "learning_rate": 1.2647195239612331e-08, + "loss": 5.0433, + "step": 10266 + }, + { + "epoch": 0.9900675024108004, + "grad_norm": 1.2927111387252808, + "learning_rate": 1.2407463253957807e-08, + "loss": 5.0662, + "step": 10267 + }, + { + "epoch": 0.9901639344262295, + "grad_norm": 1.258758306503296, + "learning_rate": 1.2170024598792684e-08, + "loss": 5.0598, + "step": 10268 + }, + { + "epoch": 0.9902603664416586, + "grad_norm": 1.3063730001449585, + "learning_rate": 1.1934879295905089e-08, + "loss": 5.0735, + "step": 10269 + }, + { + "epoch": 0.9903567984570878, + "grad_norm": 1.42191743850708, + "learning_rate": 1.170202736688053e-08, + "loss": 4.9433, + "step": 10270 + }, + { + "epoch": 0.9904532304725169, + "grad_norm": 1.4007868766784668, + "learning_rate": 1.1471468833088028e-08, + "loss": 5.06, + "step": 10271 + }, + { + "epoch": 0.990549662487946, + "grad_norm": 1.4695255756378174, + "learning_rate": 1.1243203715688432e-08, + "loss": 5.05, + "step": 10272 + }, + { + "epoch": 0.9906460945033752, + "grad_norm": 1.3072752952575684, + "learning_rate": 1.1017232035631653e-08, + "loss": 5.0276, + "step": 10273 + }, + { + "epoch": 0.9907425265188042, + "grad_norm": 1.7485498189926147, + "learning_rate": 1.0793553813656653e-08, + "loss": 5.0067, + "step": 10274 + }, + { + "epoch": 0.9908389585342333, + "grad_norm": 1.323870062828064, + "learning_rate": 1.0572169070294235e-08, + "loss": 5.0337, + "step": 10275 + }, + { + "epoch": 0.9909353905496625, + "grad_norm": 1.3906561136245728, + "learning_rate": 1.0353077825858703e-08, + "loss": 5.1341, + "step": 10276 + }, + { + "epoch": 0.9910318225650916, + "grad_norm": 1.3215006589889526, + "learning_rate": 1.0136280100461748e-08, + "loss": 5.0271, + "step": 10277 + }, + { + "epoch": 0.9911282545805208, + "grad_norm": 2.190155029296875, + "learning_rate": 9.921775913998565e-09, + "loss": 5.1354, + "step": 10278 + }, + { + "epoch": 0.9912246865959499, + "grad_norm": 1.4405395984649658, + "learning_rate": 9.709565286158962e-09, + "loss": 5.0115, + "step": 10279 + }, + { + "epoch": 0.991321118611379, + "grad_norm": 1.6569679975509644, + "learning_rate": 9.499648236416247e-09, + "loss": 5.0347, + "step": 10280 + }, + { + "epoch": 0.9914175506268081, + "grad_norm": 1.595481038093567, + "learning_rate": 9.292024784035569e-09, + "loss": 4.8942, + "step": 10281 + }, + { + "epoch": 0.9915139826422372, + "grad_norm": 1.3021200895309448, + "learning_rate": 9.08669494807668e-09, + "loss": 4.911, + "step": 10282 + }, + { + "epoch": 0.9916104146576663, + "grad_norm": 2.1678788661956787, + "learning_rate": 8.883658747380064e-09, + "loss": 4.9216, + "step": 10283 + }, + { + "epoch": 0.9917068466730955, + "grad_norm": 1.5826259851455688, + "learning_rate": 8.682916200583591e-09, + "loss": 5.1056, + "step": 10284 + }, + { + "epoch": 0.9918032786885246, + "grad_norm": 1.3139245510101318, + "learning_rate": 8.484467326111412e-09, + "loss": 5.0487, + "step": 10285 + }, + { + "epoch": 0.9918997107039537, + "grad_norm": 1.5951180458068848, + "learning_rate": 8.288312142173959e-09, + "loss": 5.0818, + "step": 10286 + }, + { + "epoch": 0.9919961427193829, + "grad_norm": 1.4365711212158203, + "learning_rate": 8.0944506667735e-09, + "loss": 5.1138, + "step": 10287 + }, + { + "epoch": 0.9920925747348119, + "grad_norm": 1.4159399271011353, + "learning_rate": 7.902882917706912e-09, + "loss": 5.0089, + "step": 10288 + }, + { + "epoch": 0.9921890067502411, + "grad_norm": 1.603958010673523, + "learning_rate": 7.713608912551796e-09, + "loss": 5.027, + "step": 10289 + }, + { + "epoch": 0.9922854387656702, + "grad_norm": 1.255707859992981, + "learning_rate": 7.526628668680369e-09, + "loss": 4.9907, + "step": 10290 + }, + { + "epoch": 0.9923818707810993, + "grad_norm": 1.727705955505371, + "learning_rate": 7.34194220325668e-09, + "loss": 5.0855, + "step": 10291 + }, + { + "epoch": 0.9924783027965285, + "grad_norm": 1.7978838682174683, + "learning_rate": 7.159549533228282e-09, + "loss": 5.1261, + "step": 10292 + }, + { + "epoch": 0.9925747348119576, + "grad_norm": 1.4646589756011963, + "learning_rate": 6.979450675334565e-09, + "loss": 5.009, + "step": 10293 + }, + { + "epoch": 0.9926711668273867, + "grad_norm": 1.3128061294555664, + "learning_rate": 6.801645646103971e-09, + "loss": 4.9711, + "step": 10294 + }, + { + "epoch": 0.9927675988428158, + "grad_norm": 1.4100922346115112, + "learning_rate": 6.626134461859556e-09, + "loss": 5.0106, + "step": 10295 + }, + { + "epoch": 0.9928640308582449, + "grad_norm": 1.4834256172180176, + "learning_rate": 6.452917138705106e-09, + "loss": 4.8749, + "step": 10296 + }, + { + "epoch": 0.992960462873674, + "grad_norm": 1.4115533828735352, + "learning_rate": 6.281993692539012e-09, + "loss": 5.0995, + "step": 10297 + }, + { + "epoch": 0.9930568948891032, + "grad_norm": 2.203300714492798, + "learning_rate": 6.113364139051503e-09, + "loss": 5.0666, + "step": 10298 + }, + { + "epoch": 0.9931533269045323, + "grad_norm": 1.271748661994934, + "learning_rate": 5.947028493713536e-09, + "loss": 5.0534, + "step": 10299 + }, + { + "epoch": 0.9932497589199615, + "grad_norm": 1.4901851415634155, + "learning_rate": 5.782986771799004e-09, + "loss": 4.9505, + "step": 10300 + }, + { + "epoch": 0.9933461909353906, + "grad_norm": 1.3237426280975342, + "learning_rate": 5.621238988356981e-09, + "loss": 5.0944, + "step": 10301 + }, + { + "epoch": 0.9934426229508196, + "grad_norm": 1.4361753463745117, + "learning_rate": 5.461785158233923e-09, + "loss": 5.1427, + "step": 10302 + }, + { + "epoch": 0.9935390549662488, + "grad_norm": 2.898420572280884, + "learning_rate": 5.3046252960653465e-09, + "loss": 5.2015, + "step": 10303 + }, + { + "epoch": 0.9936354869816779, + "grad_norm": 1.4698772430419922, + "learning_rate": 5.149759416273048e-09, + "loss": 5.1407, + "step": 10304 + }, + { + "epoch": 0.993731918997107, + "grad_norm": 1.1964484453201294, + "learning_rate": 4.997187533076208e-09, + "loss": 5.209, + "step": 10305 + }, + { + "epoch": 0.9938283510125362, + "grad_norm": 1.7254019975662231, + "learning_rate": 4.8469096604719656e-09, + "loss": 5.1003, + "step": 10306 + }, + { + "epoch": 0.9939247830279653, + "grad_norm": 1.9614300727844238, + "learning_rate": 4.698925812252064e-09, + "loss": 5.1463, + "step": 10307 + }, + { + "epoch": 0.9940212150433944, + "grad_norm": 2.4464118480682373, + "learning_rate": 4.5532360020028585e-09, + "loss": 5.1189, + "step": 10308 + }, + { + "epoch": 0.9941176470588236, + "grad_norm": 1.1943306922912598, + "learning_rate": 4.4098402430914345e-09, + "loss": 5.039, + "step": 10309 + }, + { + "epoch": 0.9942140790742526, + "grad_norm": 1.3268325328826904, + "learning_rate": 4.268738548682261e-09, + "loss": 5.0093, + "step": 10310 + }, + { + "epoch": 0.9943105110896818, + "grad_norm": 1.6657978296279907, + "learning_rate": 4.129930931723314e-09, + "loss": 5.0895, + "step": 10311 + }, + { + "epoch": 0.9944069431051109, + "grad_norm": 1.3964253664016724, + "learning_rate": 3.993417404954403e-09, + "loss": 5.0362, + "step": 10312 + }, + { + "epoch": 0.99450337512054, + "grad_norm": 1.598909854888916, + "learning_rate": 3.859197980904394e-09, + "loss": 4.944, + "step": 10313 + }, + { + "epoch": 0.9945998071359692, + "grad_norm": 1.7035458087921143, + "learning_rate": 3.7272726718912134e-09, + "loss": 5.1909, + "step": 10314 + }, + { + "epoch": 0.9946962391513983, + "grad_norm": 1.7374027967453003, + "learning_rate": 3.597641490024617e-09, + "loss": 4.8703, + "step": 10315 + }, + { + "epoch": 0.9947926711668273, + "grad_norm": 2.5345730781555176, + "learning_rate": 3.470304447200645e-09, + "loss": 4.8841, + "step": 10316 + }, + { + "epoch": 0.9948891031822565, + "grad_norm": 1.3201158046722412, + "learning_rate": 3.3452615551043952e-09, + "loss": 4.9987, + "step": 10317 + }, + { + "epoch": 0.9949855351976856, + "grad_norm": 1.3710408210754395, + "learning_rate": 3.2225128252155735e-09, + "loss": 4.9608, + "step": 10318 + }, + { + "epoch": 0.9950819672131147, + "grad_norm": 1.6850311756134033, + "learning_rate": 3.1020582688001678e-09, + "loss": 5.027, + "step": 10319 + }, + { + "epoch": 0.9951783992285439, + "grad_norm": 3.0176644325256348, + "learning_rate": 2.983897896910448e-09, + "loss": 5.1054, + "step": 10320 + }, + { + "epoch": 0.995274831243973, + "grad_norm": 1.388449788093567, + "learning_rate": 2.8680317203905185e-09, + "loss": 4.9937, + "step": 10321 + }, + { + "epoch": 0.9953712632594022, + "grad_norm": 1.325310230255127, + "learning_rate": 2.75445974987909e-09, + "loss": 4.9724, + "step": 10322 + }, + { + "epoch": 0.9954676952748313, + "grad_norm": 1.4748884439468384, + "learning_rate": 2.6431819957956074e-09, + "loss": 4.986, + "step": 10323 + }, + { + "epoch": 0.9955641272902603, + "grad_norm": 1.4162639379501343, + "learning_rate": 2.534198468354121e-09, + "loss": 4.92, + "step": 10324 + }, + { + "epoch": 0.9956605593056895, + "grad_norm": 1.341005802154541, + "learning_rate": 2.427509177554965e-09, + "loss": 5.1146, + "step": 10325 + }, + { + "epoch": 0.9957569913211186, + "grad_norm": 1.3244211673736572, + "learning_rate": 2.3231141331958585e-09, + "loss": 5.1225, + "step": 10326 + }, + { + "epoch": 0.9958534233365477, + "grad_norm": 1.7839714288711548, + "learning_rate": 2.2210133448496983e-09, + "loss": 4.9198, + "step": 10327 + }, + { + "epoch": 0.9959498553519769, + "grad_norm": 1.169614553451538, + "learning_rate": 2.1212068218950942e-09, + "loss": 4.9633, + "step": 10328 + }, + { + "epoch": 0.996046287367406, + "grad_norm": 1.6917768716812134, + "learning_rate": 2.02369457348861e-09, + "loss": 5.0102, + "step": 10329 + }, + { + "epoch": 0.996142719382835, + "grad_norm": 1.530044674873352, + "learning_rate": 1.9284766085786443e-09, + "loss": 4.8219, + "step": 10330 + }, + { + "epoch": 0.9962391513982642, + "grad_norm": 1.2911157608032227, + "learning_rate": 1.8355529359054268e-09, + "loss": 5.0651, + "step": 10331 + }, + { + "epoch": 0.9963355834136933, + "grad_norm": 1.1498888731002808, + "learning_rate": 1.7449235639982465e-09, + "loss": 5.0746, + "step": 10332 + }, + { + "epoch": 0.9964320154291225, + "grad_norm": 1.2129662036895752, + "learning_rate": 1.6565885011726734e-09, + "loss": 5.063, + "step": 10333 + }, + { + "epoch": 0.9965284474445516, + "grad_norm": 1.32682204246521, + "learning_rate": 1.570547755538887e-09, + "loss": 5.0419, + "step": 10334 + }, + { + "epoch": 0.9966248794599807, + "grad_norm": 1.4090055227279663, + "learning_rate": 1.4868013349933486e-09, + "loss": 5.0357, + "step": 10335 + }, + { + "epoch": 0.9967213114754099, + "grad_norm": 1.329856514930725, + "learning_rate": 1.4053492472188013e-09, + "loss": 5.0482, + "step": 10336 + }, + { + "epoch": 0.996817743490839, + "grad_norm": 1.4093222618103027, + "learning_rate": 1.3261914996953728e-09, + "loss": 5.0581, + "step": 10337 + }, + { + "epoch": 0.996914175506268, + "grad_norm": 1.4252300262451172, + "learning_rate": 1.2493280996839218e-09, + "loss": 5.1126, + "step": 10338 + }, + { + "epoch": 0.9970106075216972, + "grad_norm": 1.736867904663086, + "learning_rate": 1.1747590542399155e-09, + "loss": 5.1905, + "step": 10339 + }, + { + "epoch": 0.9971070395371263, + "grad_norm": 2.0189332962036133, + "learning_rate": 1.102484370207879e-09, + "loss": 5.0688, + "step": 10340 + }, + { + "epoch": 0.9972034715525554, + "grad_norm": 2.2381958961486816, + "learning_rate": 1.0325040542241704e-09, + "loss": 5.5034, + "step": 10341 + }, + { + "epoch": 0.9972999035679846, + "grad_norm": 1.7288577556610107, + "learning_rate": 9.648181127058786e-10, + "loss": 5.4658, + "step": 10342 + }, + { + "epoch": 0.9973963355834137, + "grad_norm": 2.0012686252593994, + "learning_rate": 8.994265518674772e-10, + "loss": 5.4203, + "step": 10343 + }, + { + "epoch": 0.9974927675988429, + "grad_norm": 2.2713727951049805, + "learning_rate": 8.36329377712497e-10, + "loss": 5.2068, + "step": 10344 + }, + { + "epoch": 0.997589199614272, + "grad_norm": 2.1104116439819336, + "learning_rate": 7.755265960307511e-10, + "loss": 5.3788, + "step": 10345 + }, + { + "epoch": 0.997685631629701, + "grad_norm": 2.337204933166504, + "learning_rate": 7.170182124011105e-10, + "loss": 4.9231, + "step": 10346 + }, + { + "epoch": 0.9977820636451302, + "grad_norm": 1.0990314483642578, + "learning_rate": 6.608042321942787e-10, + "loss": 5.0279, + "step": 10347 + }, + { + "epoch": 0.9978784956605593, + "grad_norm": 2.197108268737793, + "learning_rate": 6.068846605700174e-10, + "loss": 5.086, + "step": 10348 + }, + { + "epoch": 0.9979749276759884, + "grad_norm": 1.8724734783172607, + "learning_rate": 5.552595024743701e-10, + "loss": 5.1564, + "step": 10349 + }, + { + "epoch": 0.9980713596914176, + "grad_norm": 1.8960014581680298, + "learning_rate": 5.059287626507647e-10, + "loss": 5.2829, + "step": 10350 + }, + { + "epoch": 0.9981677917068467, + "grad_norm": 1.8822871446609497, + "learning_rate": 4.588924456233601e-10, + "loss": 5.1658, + "step": 10351 + }, + { + "epoch": 0.9982642237222757, + "grad_norm": 2.8869926929473877, + "learning_rate": 4.141505557081482e-10, + "loss": 5.2687, + "step": 10352 + }, + { + "epoch": 0.9983606557377049, + "grad_norm": 2.222513198852539, + "learning_rate": 3.7170309701295426e-10, + "loss": 5.0803, + "step": 10353 + }, + { + "epoch": 0.998457087753134, + "grad_norm": 1.335644006729126, + "learning_rate": 3.3155007343188547e-10, + "loss": 5.1448, + "step": 10354 + }, + { + "epoch": 0.9985535197685632, + "grad_norm": 1.4473686218261719, + "learning_rate": 2.936914886536579e-10, + "loss": 5.0826, + "step": 10355 + }, + { + "epoch": 0.9986499517839923, + "grad_norm": 2.1331801414489746, + "learning_rate": 2.5812734615049406e-10, + "loss": 5.1439, + "step": 10356 + }, + { + "epoch": 0.9987463837994214, + "grad_norm": 2.25042986869812, + "learning_rate": 2.2485764918644958e-10, + "loss": 5.4608, + "step": 10357 + }, + { + "epoch": 0.9988428158148506, + "grad_norm": 1.8790476322174072, + "learning_rate": 1.938824008146378e-10, + "loss": 5.2027, + "step": 10358 + }, + { + "epoch": 0.9989392478302797, + "grad_norm": 2.8649730682373047, + "learning_rate": 1.6520160388000528e-10, + "loss": 5.0602, + "step": 10359 + }, + { + "epoch": 0.9990356798457087, + "grad_norm": 2.1630783081054688, + "learning_rate": 1.3881526101100496e-10, + "loss": 5.1469, + "step": 10360 + }, + { + "epoch": 0.9991321118611379, + "grad_norm": 1.3714072704315186, + "learning_rate": 1.1472337463347416e-10, + "loss": 5.1098, + "step": 10361 + }, + { + "epoch": 0.999228543876567, + "grad_norm": 2.4715304374694824, + "learning_rate": 9.292594695675672e-11, + "loss": 4.9721, + "step": 10362 + }, + { + "epoch": 0.9993249758919961, + "grad_norm": 2.0427868366241455, + "learning_rate": 7.342297998202962e-11, + "loss": 4.9391, + "step": 10363 + }, + { + "epoch": 0.9994214079074253, + "grad_norm": 1.458808183670044, + "learning_rate": 5.621447549675196e-11, + "loss": 5.1276, + "step": 10364 + }, + { + "epoch": 0.9995178399228544, + "grad_norm": 1.346563458442688, + "learning_rate": 4.130043508576709e-11, + "loss": 4.9943, + "step": 10365 + }, + { + "epoch": 0.9996142719382836, + "grad_norm": 1.335033655166626, + "learning_rate": 2.8680860111873764e-11, + "loss": 5.0699, + "step": 10366 + }, + { + "epoch": 0.9997107039537126, + "grad_norm": 1.349319577217102, + "learning_rate": 1.8355751735255057e-11, + "loss": 4.9615, + "step": 10367 + }, + { + "epoch": 0.9998071359691417, + "grad_norm": 1.2834179401397705, + "learning_rate": 1.0325110902376089e-11, + "loss": 4.9672, + "step": 10368 + }, + { + "epoch": 0.9999035679845709, + "grad_norm": 1.5805972814559937, + "learning_rate": 4.5889383543107345e-12, + "loss": 4.9085, + "step": 10369 + }, + { + "epoch": 1.0, + "grad_norm": 4.3006510734558105, + "learning_rate": 1.14723461563937e-12, + "loss": 4.587, + "step": 10370 + } + ], + "logging_steps": 1, + "max_steps": 10370, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 12000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.431559782048727e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}