diff --git "a/checkpoint-50000/trainer_state.json" "b/checkpoint-50000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-50000/trainer_state.json" @@ -0,0 +1,350033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4360642584291221, + "eval_steps": 500, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.721285168582443e-06, + "grad_norm": 14.5625, + "learning_rate": 0.0005, + "loss": 3.9722, + "step": 1 + }, + { + "epoch": 1.7442570337164886e-05, + "grad_norm": 101.0, + "learning_rate": 0.0005, + "loss": 20.2026, + "step": 2 + }, + { + "epoch": 2.6163855505747325e-05, + "grad_norm": 23.375, + "learning_rate": 0.0005, + "loss": 3.5394, + "step": 3 + }, + { + "epoch": 3.488514067432977e-05, + "grad_norm": 14.75, + "learning_rate": 0.0005, + "loss": 3.3477, + "step": 4 + }, + { + "epoch": 4.360642584291221e-05, + "grad_norm": 8.375, + "learning_rate": 0.0005, + "loss": 2.493, + "step": 5 + }, + { + "epoch": 5.232771101149465e-05, + "grad_norm": 3.4375, + "learning_rate": 0.0005, + "loss": 1.94, + "step": 6 + }, + { + "epoch": 6.10489961800771e-05, + "grad_norm": 7.6875, + "learning_rate": 0.0005, + "loss": 2.168, + "step": 7 + }, + { + "epoch": 6.977028134865954e-05, + "grad_norm": 3.5, + "learning_rate": 0.0005, + "loss": 1.9331, + "step": 8 + }, + { + "epoch": 7.849156651724198e-05, + "grad_norm": 7.40625, + "learning_rate": 0.0005, + "loss": 2.0418, + "step": 9 + }, + { + "epoch": 8.721285168582442e-05, + "grad_norm": 5.96875, + "learning_rate": 0.0005, + "loss": 1.9854, + "step": 10 + }, + { + "epoch": 9.593413685440687e-05, + "grad_norm": 2.25, + "learning_rate": 0.0005, + "loss": 1.8546, + "step": 11 + }, + { + "epoch": 0.0001046554220229893, + "grad_norm": 1.703125, + "learning_rate": 0.0005, + "loss": 1.7533, + "step": 12 + }, + { + "epoch": 0.00011337670719157175, + "grad_norm": 3.703125, + "learning_rate": 0.0005, + "loss": 1.7708, + "step": 13 + }, + { + "epoch": 0.0001220979923601542, + "grad_norm": 2.671875, + "learning_rate": 0.0005, + "loss": 1.7223, + "step": 14 + }, + { + "epoch": 0.00013081927752873663, + "grad_norm": 2.65625, + "learning_rate": 0.0005, + "loss": 1.6401, + "step": 15 + }, + { + "epoch": 0.0001395405626973191, + "grad_norm": 2.234375, + "learning_rate": 0.0005, + "loss": 1.6347, + "step": 16 + }, + { + "epoch": 0.00014826184786590152, + "grad_norm": 0.75, + "learning_rate": 0.0005, + "loss": 1.6005, + "step": 17 + }, + { + "epoch": 0.00015698313303448395, + "grad_norm": 0.65234375, + "learning_rate": 0.0005, + "loss": 1.5638, + "step": 18 + }, + { + "epoch": 0.0001657044182030664, + "grad_norm": 0.91015625, + "learning_rate": 0.0005, + "loss": 1.5872, + "step": 19 + }, + { + "epoch": 0.00017442570337164885, + "grad_norm": 0.6953125, + "learning_rate": 0.0005, + "loss": 1.5424, + "step": 20 + }, + { + "epoch": 0.00018314698854023128, + "grad_norm": 0.408203125, + "learning_rate": 0.0005, + "loss": 1.519, + "step": 21 + }, + { + "epoch": 0.00019186827370881374, + "grad_norm": 0.5546875, + "learning_rate": 0.0005, + "loss": 1.4877, + "step": 22 + }, + { + "epoch": 0.00020058955887739617, + "grad_norm": 0.5390625, + "learning_rate": 0.0005, + "loss": 1.4981, + "step": 23 + }, + { + "epoch": 0.0002093108440459786, + "grad_norm": 0.515625, + "learning_rate": 0.0005, + "loss": 1.4835, + "step": 24 + }, + { + "epoch": 0.00021803212921456106, + "grad_norm": 0.3984375, + "learning_rate": 0.0005, + "loss": 1.4588, + "step": 25 + }, + { + "epoch": 0.0002267534143831435, + "grad_norm": 0.3828125, + "learning_rate": 0.0005, + "loss": 1.4621, + "step": 26 + }, + { + "epoch": 0.00023547469955172593, + "grad_norm": 0.359375, + "learning_rate": 0.0005, + "loss": 1.4704, + "step": 27 + }, + { + "epoch": 0.0002441959847203084, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.4381, + "step": 28 + }, + { + "epoch": 0.0002529172698888908, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.4278, + "step": 29 + }, + { + "epoch": 0.00026163855505747325, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.4313, + "step": 30 + }, + { + "epoch": 0.0002703598402260557, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.3899, + "step": 31 + }, + { + "epoch": 0.0002790811253946382, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.3984, + "step": 32 + }, + { + "epoch": 0.0002878024105632206, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.3972, + "step": 33 + }, + { + "epoch": 0.00029652369573180304, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.3988, + "step": 34 + }, + { + "epoch": 0.0003052449809003855, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.3888, + "step": 35 + }, + { + "epoch": 0.0003139662660689679, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.3863, + "step": 36 + }, + { + "epoch": 0.00032268755123755034, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.3768, + "step": 37 + }, + { + "epoch": 0.0003314088364061328, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.3691, + "step": 38 + }, + { + "epoch": 0.00034013012157471526, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.3795, + "step": 39 + }, + { + "epoch": 0.0003488514067432977, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.348, + "step": 40 + }, + { + "epoch": 0.0003575726919118801, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.3584, + "step": 41 + }, + { + "epoch": 0.00036629397708046256, + "grad_norm": 0.474609375, + "learning_rate": 0.0005, + "loss": 1.3255, + "step": 42 + }, + { + "epoch": 0.00037501526224904504, + "grad_norm": 0.59375, + "learning_rate": 0.0005, + "loss": 1.3539, + "step": 43 + }, + { + "epoch": 0.0003837365474176275, + "grad_norm": 0.486328125, + "learning_rate": 0.0005, + "loss": 1.3335, + "step": 44 + }, + { + "epoch": 0.0003924578325862099, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.332, + "step": 45 + }, + { + "epoch": 0.00040117911775479234, + "grad_norm": 0.5390625, + "learning_rate": 0.0005, + "loss": 1.3472, + "step": 46 + }, + { + "epoch": 0.0004099004029233748, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.3348, + "step": 47 + }, + { + "epoch": 0.0004186216880919572, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.334, + "step": 48 + }, + { + "epoch": 0.0004273429732605397, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.312, + "step": 49 + }, + { + "epoch": 0.00043606425842912213, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.2979, + "step": 50 + }, + { + "epoch": 0.00044478554359770456, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.3228, + "step": 51 + }, + { + "epoch": 0.000453506828766287, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.3429, + "step": 52 + }, + { + "epoch": 0.0004622281139348694, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.3086, + "step": 53 + }, + { + "epoch": 0.00047094939910345186, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.3015, + "step": 54 + }, + { + "epoch": 0.00047967068427203435, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.3197, + "step": 55 + }, + { + "epoch": 0.0004883919694406168, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.3038, + "step": 56 + }, + { + "epoch": 0.0004971132546091993, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.3111, + "step": 57 + }, + { + "epoch": 0.0005058345397777816, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.3016, + "step": 58 + }, + { + "epoch": 0.0005145558249463641, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.2902, + "step": 59 + }, + { + "epoch": 0.0005232771101149465, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.2874, + "step": 60 + }, + { + "epoch": 0.000531998395283529, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.284, + "step": 61 + }, + { + "epoch": 0.0005407196804521114, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.2728, + "step": 62 + }, + { + "epoch": 0.0005494409656206939, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.2857, + "step": 63 + }, + { + "epoch": 0.0005581622507892763, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.2966, + "step": 64 + }, + { + "epoch": 0.0005668835359578587, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.2771, + "step": 65 + }, + { + "epoch": 0.0005756048211264412, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.2941, + "step": 66 + }, + { + "epoch": 0.0005843261062950236, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.2793, + "step": 67 + }, + { + "epoch": 0.0005930473914636061, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.2849, + "step": 68 + }, + { + "epoch": 0.0006017686766321886, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.2623, + "step": 69 + }, + { + "epoch": 0.000610489961800771, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.2711, + "step": 70 + }, + { + "epoch": 0.0006192112469693534, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.2519, + "step": 71 + }, + { + "epoch": 0.0006279325321379358, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.2551, + "step": 72 + }, + { + "epoch": 0.0006366538173065183, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.2601, + "step": 73 + }, + { + "epoch": 0.0006453751024751007, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.2766, + "step": 74 + }, + { + "epoch": 0.0006540963876436832, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.2748, + "step": 75 + }, + { + "epoch": 0.0006628176728122657, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.2587, + "step": 76 + }, + { + "epoch": 0.000671538957980848, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.2459, + "step": 77 + }, + { + "epoch": 0.0006802602431494305, + "grad_norm": 0.453125, + "learning_rate": 0.0005, + "loss": 1.2582, + "step": 78 + }, + { + "epoch": 0.0006889815283180129, + "grad_norm": 0.57421875, + "learning_rate": 0.0005, + "loss": 1.2731, + "step": 79 + }, + { + "epoch": 0.0006977028134865954, + "grad_norm": 0.4765625, + "learning_rate": 0.0005, + "loss": 1.2773, + "step": 80 + }, + { + "epoch": 0.0007064240986551779, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.2562, + "step": 81 + }, + { + "epoch": 0.0007151453838237602, + "grad_norm": 0.41796875, + "learning_rate": 0.0005, + "loss": 1.273, + "step": 82 + }, + { + "epoch": 0.0007238666689923427, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.3138, + "step": 83 + }, + { + "epoch": 0.0007325879541609251, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.2606, + "step": 84 + }, + { + "epoch": 0.0007413092393295076, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.2741, + "step": 85 + }, + { + "epoch": 0.0007500305244980901, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.2348, + "step": 86 + }, + { + "epoch": 0.0007587518096666725, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.2314, + "step": 87 + }, + { + "epoch": 0.000767473094835255, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.2617, + "step": 88 + }, + { + "epoch": 0.0007761943800038373, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.2503, + "step": 89 + }, + { + "epoch": 0.0007849156651724198, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.2419, + "step": 90 + }, + { + "epoch": 0.0007936369503410022, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.2622, + "step": 91 + }, + { + "epoch": 0.0008023582355095847, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.2285, + "step": 92 + }, + { + "epoch": 0.0008110795206781672, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.237, + "step": 93 + }, + { + "epoch": 0.0008198008058467496, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.2546, + "step": 94 + }, + { + "epoch": 0.000828522091015332, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.2498, + "step": 95 + }, + { + "epoch": 0.0008372433761839144, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.2503, + "step": 96 + }, + { + "epoch": 0.0008459646613524969, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.2329, + "step": 97 + }, + { + "epoch": 0.0008546859465210794, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.2358, + "step": 98 + }, + { + "epoch": 0.0008634072316896618, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.2435, + "step": 99 + }, + { + "epoch": 0.0008721285168582443, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.2461, + "step": 100 + }, + { + "epoch": 0.0008808498020268266, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.2339, + "step": 101 + }, + { + "epoch": 0.0008895710871954091, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.2229, + "step": 102 + }, + { + "epoch": 0.0008982923723639916, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.232, + "step": 103 + }, + { + "epoch": 0.000907013657532574, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.2306, + "step": 104 + }, + { + "epoch": 0.0009157349427011565, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.2317, + "step": 105 + }, + { + "epoch": 0.0009244562278697389, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.221, + "step": 106 + }, + { + "epoch": 0.0009331775130383213, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.2299, + "step": 107 + }, + { + "epoch": 0.0009418987982069037, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.2271, + "step": 108 + }, + { + "epoch": 0.0009506200833754862, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.2063, + "step": 109 + }, + { + "epoch": 0.0009593413685440687, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.2447, + "step": 110 + }, + { + "epoch": 0.0009680626537126511, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.213, + "step": 111 + }, + { + "epoch": 0.0009767839388812336, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 1.2421, + "step": 112 + }, + { + "epoch": 0.000985505224049816, + "grad_norm": 0.63671875, + "learning_rate": 0.0005, + "loss": 1.211, + "step": 113 + }, + { + "epoch": 0.0009942265092183985, + "grad_norm": 0.92578125, + "learning_rate": 0.0005, + "loss": 1.2456, + "step": 114 + }, + { + "epoch": 0.0010029477943869808, + "grad_norm": 0.6015625, + "learning_rate": 0.0005, + "loss": 1.2055, + "step": 115 + }, + { + "epoch": 0.0010116690795555633, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.2179, + "step": 116 + }, + { + "epoch": 0.0010203903647241458, + "grad_norm": 0.421875, + "learning_rate": 0.0005, + "loss": 1.2541, + "step": 117 + }, + { + "epoch": 0.0010291116498927283, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.2211, + "step": 118 + }, + { + "epoch": 0.0010378329350613105, + "grad_norm": 0.42578125, + "learning_rate": 0.0005, + "loss": 1.2194, + "step": 119 + }, + { + "epoch": 0.001046554220229893, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.2276, + "step": 120 + }, + { + "epoch": 0.0010552755053984755, + "grad_norm": 0.421875, + "learning_rate": 0.0005, + "loss": 1.2156, + "step": 121 + }, + { + "epoch": 0.001063996790567058, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.2117, + "step": 122 + }, + { + "epoch": 0.0010727180757356405, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.2348, + "step": 123 + }, + { + "epoch": 0.0010814393609042228, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.2177, + "step": 124 + }, + { + "epoch": 0.0010901606460728052, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.2073, + "step": 125 + }, + { + "epoch": 0.0010988819312413877, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.208, + "step": 126 + }, + { + "epoch": 0.0011076032164099702, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.2232, + "step": 127 + }, + { + "epoch": 0.0011163245015785527, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.2134, + "step": 128 + }, + { + "epoch": 0.001125045786747135, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.1867, + "step": 129 + }, + { + "epoch": 0.0011337670719157175, + "grad_norm": 0.421875, + "learning_rate": 0.0005, + "loss": 1.2316, + "step": 130 + }, + { + "epoch": 0.0011424883570843, + "grad_norm": 0.37890625, + "learning_rate": 0.0005, + "loss": 1.2083, + "step": 131 + }, + { + "epoch": 0.0011512096422528824, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.2171, + "step": 132 + }, + { + "epoch": 0.001159930927421465, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.2155, + "step": 133 + }, + { + "epoch": 0.0011686522125900472, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.1894, + "step": 134 + }, + { + "epoch": 0.0011773734977586297, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.1817, + "step": 135 + }, + { + "epoch": 0.0011860947829272122, + "grad_norm": 0.357421875, + "learning_rate": 0.0005, + "loss": 1.2191, + "step": 136 + }, + { + "epoch": 0.0011948160680957946, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.2255, + "step": 137 + }, + { + "epoch": 0.0012035373532643771, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.1969, + "step": 138 + }, + { + "epoch": 0.0012122586384329594, + "grad_norm": 0.3671875, + "learning_rate": 0.0005, + "loss": 1.2077, + "step": 139 + }, + { + "epoch": 0.001220979923601542, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.1968, + "step": 140 + }, + { + "epoch": 0.0012297012087701244, + "grad_norm": 0.369140625, + "learning_rate": 0.0005, + "loss": 1.2042, + "step": 141 + }, + { + "epoch": 0.0012384224939387069, + "grad_norm": 0.51953125, + "learning_rate": 0.0005, + "loss": 1.2144, + "step": 142 + }, + { + "epoch": 0.0012471437791072894, + "grad_norm": 0.75, + "learning_rate": 0.0005, + "loss": 1.2173, + "step": 143 + }, + { + "epoch": 0.0012558650642758716, + "grad_norm": 1.0859375, + "learning_rate": 0.0005, + "loss": 1.247, + "step": 144 + }, + { + "epoch": 0.0012645863494444541, + "grad_norm": 0.76953125, + "learning_rate": 0.0005, + "loss": 1.2344, + "step": 145 + }, + { + "epoch": 0.0012733076346130366, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.2059, + "step": 146 + }, + { + "epoch": 0.001282028919781619, + "grad_norm": 0.6484375, + "learning_rate": 0.0005, + "loss": 1.2061, + "step": 147 + }, + { + "epoch": 0.0012907502049502014, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.219, + "step": 148 + }, + { + "epoch": 0.0012994714901187838, + "grad_norm": 0.431640625, + "learning_rate": 0.0005, + "loss": 1.2223, + "step": 149 + }, + { + "epoch": 0.0013081927752873663, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.1954, + "step": 150 + }, + { + "epoch": 0.0013169140604559488, + "grad_norm": 0.41796875, + "learning_rate": 0.0005, + "loss": 1.2337, + "step": 151 + }, + { + "epoch": 0.0013256353456245313, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.2033, + "step": 152 + }, + { + "epoch": 0.0013343566307931136, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.1913, + "step": 153 + }, + { + "epoch": 0.001343077915961696, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.1934, + "step": 154 + }, + { + "epoch": 0.0013517992011302785, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.2172, + "step": 155 + }, + { + "epoch": 0.001360520486298861, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.2045, + "step": 156 + }, + { + "epoch": 0.0013692417714674435, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.1955, + "step": 157 + }, + { + "epoch": 0.0013779630566360258, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.1893, + "step": 158 + }, + { + "epoch": 0.0013866843418046083, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.1999, + "step": 159 + }, + { + "epoch": 0.0013954056269731908, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.22, + "step": 160 + }, + { + "epoch": 0.0014041269121417733, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.2001, + "step": 161 + }, + { + "epoch": 0.0014128481973103557, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.1966, + "step": 162 + }, + { + "epoch": 0.001421569482478938, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.1969, + "step": 163 + }, + { + "epoch": 0.0014302907676475205, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.2097, + "step": 164 + }, + { + "epoch": 0.001439012052816103, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.1965, + "step": 165 + }, + { + "epoch": 0.0014477333379846855, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.1989, + "step": 166 + }, + { + "epoch": 0.001456454623153268, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.2195, + "step": 167 + }, + { + "epoch": 0.0014651759083218502, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.2037, + "step": 168 + }, + { + "epoch": 0.0014738971934904327, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.2012, + "step": 169 + }, + { + "epoch": 0.0014826184786590152, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.1961, + "step": 170 + }, + { + "epoch": 0.0014913397638275977, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.1808, + "step": 171 + }, + { + "epoch": 0.0015000610489961802, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.1988, + "step": 172 + }, + { + "epoch": 0.0015087823341647624, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.1863, + "step": 173 + }, + { + "epoch": 0.001517503619333345, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1854, + "step": 174 + }, + { + "epoch": 0.0015262249045019274, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.1814, + "step": 175 + }, + { + "epoch": 0.00153494618967051, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.1765, + "step": 176 + }, + { + "epoch": 0.0015436674748390924, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.2048, + "step": 177 + }, + { + "epoch": 0.0015523887600076747, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1807, + "step": 178 + }, + { + "epoch": 0.0015611100451762572, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.1956, + "step": 179 + }, + { + "epoch": 0.0015698313303448396, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.1908, + "step": 180 + }, + { + "epoch": 0.0015785526155134221, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.1766, + "step": 181 + }, + { + "epoch": 0.0015872739006820044, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.1879, + "step": 182 + }, + { + "epoch": 0.0015959951858505869, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.1975, + "step": 183 + }, + { + "epoch": 0.0016047164710191694, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.201, + "step": 184 + }, + { + "epoch": 0.0016134377561877519, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.1837, + "step": 185 + }, + { + "epoch": 0.0016221590413563343, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.1906, + "step": 186 + }, + { + "epoch": 0.0016308803265249166, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.1835, + "step": 187 + }, + { + "epoch": 0.001639601611693499, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.1881, + "step": 188 + }, + { + "epoch": 0.0016483228968620816, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.1746, + "step": 189 + }, + { + "epoch": 0.001657044182030664, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.197, + "step": 190 + }, + { + "epoch": 0.0016657654671992466, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.1827, + "step": 191 + }, + { + "epoch": 0.0016744867523678288, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.2052, + "step": 192 + }, + { + "epoch": 0.0016832080375364113, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.1738, + "step": 193 + }, + { + "epoch": 0.0016919293227049938, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.1735, + "step": 194 + }, + { + "epoch": 0.0017006506078735763, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.1891, + "step": 195 + }, + { + "epoch": 0.0017093718930421588, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.19, + "step": 196 + }, + { + "epoch": 0.001718093178210741, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.178, + "step": 197 + }, + { + "epoch": 0.0017268144633793235, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.1935, + "step": 198 + }, + { + "epoch": 0.001735535748547906, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.1813, + "step": 199 + }, + { + "epoch": 0.0017442570337164885, + "grad_norm": 0.376953125, + "learning_rate": 0.0005, + "loss": 1.2007, + "step": 200 + }, + { + "epoch": 0.001752978318885071, + "grad_norm": 0.70703125, + "learning_rate": 0.0005, + "loss": 1.1684, + "step": 201 + }, + { + "epoch": 0.0017616996040536533, + "grad_norm": 1.2265625, + "learning_rate": 0.0005, + "loss": 1.2006, + "step": 202 + }, + { + "epoch": 0.0017704208892222358, + "grad_norm": 0.6796875, + "learning_rate": 0.0005, + "loss": 1.1958, + "step": 203 + }, + { + "epoch": 0.0017791421743908182, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.1743, + "step": 204 + }, + { + "epoch": 0.0017878634595594007, + "grad_norm": 0.88671875, + "learning_rate": 0.0005, + "loss": 1.1959, + "step": 205 + }, + { + "epoch": 0.0017965847447279832, + "grad_norm": 0.7890625, + "learning_rate": 0.0005, + "loss": 1.1849, + "step": 206 + }, + { + "epoch": 0.0018053060298965655, + "grad_norm": 0.443359375, + "learning_rate": 0.0005, + "loss": 1.1958, + "step": 207 + }, + { + "epoch": 0.001814027315065148, + "grad_norm": 0.72265625, + "learning_rate": 0.0005, + "loss": 1.1987, + "step": 208 + }, + { + "epoch": 0.0018227486002337305, + "grad_norm": 0.35546875, + "learning_rate": 0.0005, + "loss": 1.191, + "step": 209 + }, + { + "epoch": 0.001831469885402313, + "grad_norm": 0.375, + "learning_rate": 0.0005, + "loss": 1.1875, + "step": 210 + }, + { + "epoch": 0.0018401911705708954, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.1838, + "step": 211 + }, + { + "epoch": 0.0018489124557394777, + "grad_norm": 0.337890625, + "learning_rate": 0.0005, + "loss": 1.193, + "step": 212 + }, + { + "epoch": 0.0018576337409080602, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.182, + "step": 213 + }, + { + "epoch": 0.0018663550260766427, + "grad_norm": 0.326171875, + "learning_rate": 0.0005, + "loss": 1.1844, + "step": 214 + }, + { + "epoch": 0.0018750763112452252, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.1974, + "step": 215 + }, + { + "epoch": 0.0018837975964138074, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.2024, + "step": 216 + }, + { + "epoch": 0.00189251888158239, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.1885, + "step": 217 + }, + { + "epoch": 0.0019012401667509724, + "grad_norm": 0.357421875, + "learning_rate": 0.0005, + "loss": 1.1823, + "step": 218 + }, + { + "epoch": 0.001909961451919555, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.1794, + "step": 219 + }, + { + "epoch": 0.0019186827370881374, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.1819, + "step": 220 + }, + { + "epoch": 0.0019274040222567197, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.1792, + "step": 221 + }, + { + "epoch": 0.0019361253074253021, + "grad_norm": 0.37890625, + "learning_rate": 0.0005, + "loss": 1.1857, + "step": 222 + }, + { + "epoch": 0.0019448465925938846, + "grad_norm": 0.416015625, + "learning_rate": 0.0005, + "loss": 1.1836, + "step": 223 + }, + { + "epoch": 0.001953567877762467, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.2043, + "step": 224 + }, + { + "epoch": 0.0019622891629310496, + "grad_norm": 0.37109375, + "learning_rate": 0.0005, + "loss": 1.1707, + "step": 225 + }, + { + "epoch": 0.001971010448099632, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.1961, + "step": 226 + }, + { + "epoch": 0.0019797317332682146, + "grad_norm": 0.42578125, + "learning_rate": 0.0005, + "loss": 1.1903, + "step": 227 + }, + { + "epoch": 0.001988453018436797, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.1698, + "step": 228 + }, + { + "epoch": 0.001997174303605379, + "grad_norm": 0.337890625, + "learning_rate": 0.0005, + "loss": 1.1762, + "step": 229 + }, + { + "epoch": 0.0020058955887739616, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.1742, + "step": 230 + }, + { + "epoch": 0.002014616873942544, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.1854, + "step": 231 + }, + { + "epoch": 0.0020233381591111266, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.1538, + "step": 232 + }, + { + "epoch": 0.002032059444279709, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.1738, + "step": 233 + }, + { + "epoch": 0.0020407807294482916, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.1734, + "step": 234 + }, + { + "epoch": 0.002049502014616874, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.1713, + "step": 235 + }, + { + "epoch": 0.0020582232997854565, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.174, + "step": 236 + }, + { + "epoch": 0.002066944584954039, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.1746, + "step": 237 + }, + { + "epoch": 0.002075665870122621, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.1773, + "step": 238 + }, + { + "epoch": 0.0020843871552912036, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.1631, + "step": 239 + }, + { + "epoch": 0.002093108440459786, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.1679, + "step": 240 + }, + { + "epoch": 0.0021018297256283685, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.1751, + "step": 241 + }, + { + "epoch": 0.002110551010796951, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.1572, + "step": 242 + }, + { + "epoch": 0.0021192722959655335, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.1734, + "step": 243 + }, + { + "epoch": 0.002127993581134116, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.1732, + "step": 244 + }, + { + "epoch": 0.0021367148663026985, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.152, + "step": 245 + }, + { + "epoch": 0.002145436151471281, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.1552, + "step": 246 + }, + { + "epoch": 0.0021541574366398634, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.1501, + "step": 247 + }, + { + "epoch": 0.0021628787218084455, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.1804, + "step": 248 + }, + { + "epoch": 0.002171600006977028, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1595, + "step": 249 + }, + { + "epoch": 0.0021803212921456105, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.1614, + "step": 250 + }, + { + "epoch": 0.002189042577314193, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.1613, + "step": 251 + }, + { + "epoch": 0.0021977638624827755, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.1684, + "step": 252 + }, + { + "epoch": 0.002206485147651358, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.1649, + "step": 253 + }, + { + "epoch": 0.0022152064328199404, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.1734, + "step": 254 + }, + { + "epoch": 0.002223927717988523, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.1706, + "step": 255 + }, + { + "epoch": 0.0022326490031571054, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.1596, + "step": 256 + }, + { + "epoch": 0.002241370288325688, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.1873, + "step": 257 + }, + { + "epoch": 0.00225009157349427, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.1647, + "step": 258 + }, + { + "epoch": 0.0022588128586628524, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.1598, + "step": 259 + }, + { + "epoch": 0.002267534143831435, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.1587, + "step": 260 + }, + { + "epoch": 0.0022762554290000174, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.1596, + "step": 261 + }, + { + "epoch": 0.0022849767141686, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.1507, + "step": 262 + }, + { + "epoch": 0.0022936979993371824, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.1788, + "step": 263 + }, + { + "epoch": 0.002302419284505765, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.1604, + "step": 264 + }, + { + "epoch": 0.0023111405696743473, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.1698, + "step": 265 + }, + { + "epoch": 0.00231986185484293, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.1698, + "step": 266 + }, + { + "epoch": 0.002328583140011512, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.1873, + "step": 267 + }, + { + "epoch": 0.0023373044251800944, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.1744, + "step": 268 + }, + { + "epoch": 0.002346025710348677, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.1679, + "step": 269 + }, + { + "epoch": 0.0023547469955172593, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.1799, + "step": 270 + }, + { + "epoch": 0.002363468280685842, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.1567, + "step": 271 + }, + { + "epoch": 0.0023721895658544243, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.1651, + "step": 272 + }, + { + "epoch": 0.002380910851023007, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.1802, + "step": 273 + }, + { + "epoch": 0.0023896321361915893, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.171, + "step": 274 + }, + { + "epoch": 0.002398353421360172, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.1791, + "step": 275 + }, + { + "epoch": 0.0024070747065287543, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.1742, + "step": 276 + }, + { + "epoch": 0.0024157959916973363, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.1618, + "step": 277 + }, + { + "epoch": 0.002424517276865919, + "grad_norm": 0.42578125, + "learning_rate": 0.0005, + "loss": 1.1793, + "step": 278 + }, + { + "epoch": 0.0024332385620345013, + "grad_norm": 0.65234375, + "learning_rate": 0.0005, + "loss": 1.1812, + "step": 279 + }, + { + "epoch": 0.002441959847203084, + "grad_norm": 0.8203125, + "learning_rate": 0.0005, + "loss": 1.1855, + "step": 280 + }, + { + "epoch": 0.0024506811323716663, + "grad_norm": 1.1953125, + "learning_rate": 0.0005, + "loss": 1.2026, + "step": 281 + }, + { + "epoch": 0.0024594024175402488, + "grad_norm": 0.8671875, + "learning_rate": 0.0005, + "loss": 1.1677, + "step": 282 + }, + { + "epoch": 0.0024681237027088312, + "grad_norm": 0.72265625, + "learning_rate": 0.0005, + "loss": 1.1726, + "step": 283 + }, + { + "epoch": 0.0024768449878774137, + "grad_norm": 1.046875, + "learning_rate": 0.0005, + "loss": 1.1909, + "step": 284 + }, + { + "epoch": 0.0024855662730459962, + "grad_norm": 0.5859375, + "learning_rate": 0.0005, + "loss": 1.1671, + "step": 285 + }, + { + "epoch": 0.0024942875582145787, + "grad_norm": 0.578125, + "learning_rate": 0.0005, + "loss": 1.1834, + "step": 286 + }, + { + "epoch": 0.0025030088433831608, + "grad_norm": 0.5703125, + "learning_rate": 0.0005, + "loss": 1.181, + "step": 287 + }, + { + "epoch": 0.0025117301285517432, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.1688, + "step": 288 + }, + { + "epoch": 0.0025204514137203257, + "grad_norm": 0.490234375, + "learning_rate": 0.0005, + "loss": 1.1666, + "step": 289 + }, + { + "epoch": 0.0025291726988889082, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.1802, + "step": 290 + }, + { + "epoch": 0.0025378939840574907, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.1767, + "step": 291 + }, + { + "epoch": 0.002546615269226073, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.1759, + "step": 292 + }, + { + "epoch": 0.0025553365543946557, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.1553, + "step": 293 + }, + { + "epoch": 0.002564057839563238, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.1758, + "step": 294 + }, + { + "epoch": 0.0025727791247318207, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.1605, + "step": 295 + }, + { + "epoch": 0.0025815004099004027, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.1838, + "step": 296 + }, + { + "epoch": 0.002590221695068985, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.156, + "step": 297 + }, + { + "epoch": 0.0025989429802375677, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.1646, + "step": 298 + }, + { + "epoch": 0.00260766426540615, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.1788, + "step": 299 + }, + { + "epoch": 0.0026163855505747327, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.1548, + "step": 300 + }, + { + "epoch": 0.002625106835743315, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.1397, + "step": 301 + }, + { + "epoch": 0.0026338281209118976, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.1657, + "step": 302 + }, + { + "epoch": 0.00264254940608048, + "grad_norm": 0.498046875, + "learning_rate": 0.0005, + "loss": 1.162, + "step": 303 + }, + { + "epoch": 0.0026512706912490626, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.1528, + "step": 304 + }, + { + "epoch": 0.002659991976417645, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.162, + "step": 305 + }, + { + "epoch": 0.002668713261586227, + "grad_norm": 0.423828125, + "learning_rate": 0.0005, + "loss": 1.1643, + "step": 306 + }, + { + "epoch": 0.0026774345467548096, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.1481, + "step": 307 + }, + { + "epoch": 0.002686155831923392, + "grad_norm": 0.375, + "learning_rate": 0.0005, + "loss": 1.164, + "step": 308 + }, + { + "epoch": 0.0026948771170919746, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.1648, + "step": 309 + }, + { + "epoch": 0.002703598402260557, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.1731, + "step": 310 + }, + { + "epoch": 0.0027123196874291396, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.1678, + "step": 311 + }, + { + "epoch": 0.002721040972597722, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.1557, + "step": 312 + }, + { + "epoch": 0.0027297622577663046, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.1597, + "step": 313 + }, + { + "epoch": 0.002738483542934887, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.136, + "step": 314 + }, + { + "epoch": 0.0027472048281034695, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.1458, + "step": 315 + }, + { + "epoch": 0.0027559261132720516, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.1634, + "step": 316 + }, + { + "epoch": 0.002764647398440634, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1565, + "step": 317 + }, + { + "epoch": 0.0027733686836092166, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.1398, + "step": 318 + }, + { + "epoch": 0.002782089968777799, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.1377, + "step": 319 + }, + { + "epoch": 0.0027908112539463815, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.1463, + "step": 320 + }, + { + "epoch": 0.002799532539114964, + "grad_norm": 0.396484375, + "learning_rate": 0.0005, + "loss": 1.1645, + "step": 321 + }, + { + "epoch": 0.0028082538242835465, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.1625, + "step": 322 + }, + { + "epoch": 0.002816975109452129, + "grad_norm": 0.5546875, + "learning_rate": 0.0005, + "loss": 1.1428, + "step": 323 + }, + { + "epoch": 0.0028256963946207115, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.1652, + "step": 324 + }, + { + "epoch": 0.002834417679789294, + "grad_norm": 0.431640625, + "learning_rate": 0.0005, + "loss": 1.1764, + "step": 325 + }, + { + "epoch": 0.002843138964957876, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.1712, + "step": 326 + }, + { + "epoch": 0.0028518602501264585, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.1616, + "step": 327 + }, + { + "epoch": 0.002860581535295041, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.1416, + "step": 328 + }, + { + "epoch": 0.0028693028204636235, + "grad_norm": 0.40234375, + "learning_rate": 0.0005, + "loss": 1.1308, + "step": 329 + }, + { + "epoch": 0.002878024105632206, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.1759, + "step": 330 + }, + { + "epoch": 0.0028867453908007885, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.143, + "step": 331 + }, + { + "epoch": 0.002895466675969371, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.1413, + "step": 332 + }, + { + "epoch": 0.0029041879611379534, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.1199, + "step": 333 + }, + { + "epoch": 0.002912909246306536, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.1332, + "step": 334 + }, + { + "epoch": 0.002921630531475118, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.1428, + "step": 335 + }, + { + "epoch": 0.0029303518166437005, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.1571, + "step": 336 + }, + { + "epoch": 0.002939073101812283, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.1585, + "step": 337 + }, + { + "epoch": 0.0029477943869808654, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.1585, + "step": 338 + }, + { + "epoch": 0.002956515672149448, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.1663, + "step": 339 + }, + { + "epoch": 0.0029652369573180304, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.1557, + "step": 340 + }, + { + "epoch": 0.002973958242486613, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.1433, + "step": 341 + }, + { + "epoch": 0.0029826795276551954, + "grad_norm": 0.34375, + "learning_rate": 0.0005, + "loss": 1.154, + "step": 342 + }, + { + "epoch": 0.002991400812823778, + "grad_norm": 0.73828125, + "learning_rate": 0.0005, + "loss": 1.1501, + "step": 343 + }, + { + "epoch": 0.0030001220979923604, + "grad_norm": 0.84375, + "learning_rate": 0.0005, + "loss": 1.1507, + "step": 344 + }, + { + "epoch": 0.0030088433831609424, + "grad_norm": 0.66015625, + "learning_rate": 0.0005, + "loss": 1.1728, + "step": 345 + }, + { + "epoch": 0.003017564668329525, + "grad_norm": 0.37109375, + "learning_rate": 0.0005, + "loss": 1.1687, + "step": 346 + }, + { + "epoch": 0.0030262859534981074, + "grad_norm": 0.70703125, + "learning_rate": 0.0005, + "loss": 1.1539, + "step": 347 + }, + { + "epoch": 0.00303500723866669, + "grad_norm": 0.5546875, + "learning_rate": 0.0005, + "loss": 1.1481, + "step": 348 + }, + { + "epoch": 0.0030437285238352724, + "grad_norm": 0.625, + "learning_rate": 0.0005, + "loss": 1.1662, + "step": 349 + }, + { + "epoch": 0.003052449809003855, + "grad_norm": 0.8671875, + "learning_rate": 0.0005, + "loss": 1.1541, + "step": 350 + }, + { + "epoch": 0.0030611710941724373, + "grad_norm": 0.58984375, + "learning_rate": 0.0005, + "loss": 1.1755, + "step": 351 + }, + { + "epoch": 0.00306989237934102, + "grad_norm": 0.52734375, + "learning_rate": 0.0005, + "loss": 1.1598, + "step": 352 + }, + { + "epoch": 0.0030786136645096023, + "grad_norm": 0.58984375, + "learning_rate": 0.0005, + "loss": 1.1578, + "step": 353 + }, + { + "epoch": 0.003087334949678185, + "grad_norm": 0.5078125, + "learning_rate": 0.0005, + "loss": 1.1668, + "step": 354 + }, + { + "epoch": 0.003096056234846767, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.1436, + "step": 355 + }, + { + "epoch": 0.0031047775200153493, + "grad_norm": 0.55078125, + "learning_rate": 0.0005, + "loss": 1.1769, + "step": 356 + }, + { + "epoch": 0.003113498805183932, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.1614, + "step": 357 + }, + { + "epoch": 0.0031222200903525143, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.1507, + "step": 358 + }, + { + "epoch": 0.003130941375521097, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.1652, + "step": 359 + }, + { + "epoch": 0.0031396626606896793, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.1542, + "step": 360 + }, + { + "epoch": 0.0031483839458582618, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.146, + "step": 361 + }, + { + "epoch": 0.0031571052310268443, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.1536, + "step": 362 + }, + { + "epoch": 0.0031658265161954267, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.1465, + "step": 363 + }, + { + "epoch": 0.003174547801364009, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.1741, + "step": 364 + }, + { + "epoch": 0.0031832690865325913, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.1433, + "step": 365 + }, + { + "epoch": 0.0031919903717011738, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.1516, + "step": 366 + }, + { + "epoch": 0.0032007116568697563, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.1593, + "step": 367 + }, + { + "epoch": 0.0032094329420383387, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.1483, + "step": 368 + }, + { + "epoch": 0.0032181542272069212, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.1391, + "step": 369 + }, + { + "epoch": 0.0032268755123755037, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.1386, + "step": 370 + }, + { + "epoch": 0.003235596797544086, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.1318, + "step": 371 + }, + { + "epoch": 0.0032443180827126687, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.1463, + "step": 372 + }, + { + "epoch": 0.003253039367881251, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.1397, + "step": 373 + }, + { + "epoch": 0.0032617606530498332, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.1583, + "step": 374 + }, + { + "epoch": 0.0032704819382184157, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.1518, + "step": 375 + }, + { + "epoch": 0.003279203223386998, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.1538, + "step": 376 + }, + { + "epoch": 0.0032879245085555807, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.1394, + "step": 377 + }, + { + "epoch": 0.003296645793724163, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1456, + "step": 378 + }, + { + "epoch": 0.0033053670788927457, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.1436, + "step": 379 + }, + { + "epoch": 0.003314088364061328, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.1571, + "step": 380 + }, + { + "epoch": 0.0033228096492299106, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.1592, + "step": 381 + }, + { + "epoch": 0.003331530934398493, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.1559, + "step": 382 + }, + { + "epoch": 0.0033402522195670756, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.166, + "step": 383 + }, + { + "epoch": 0.0033489735047356577, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.146, + "step": 384 + }, + { + "epoch": 0.00335769478990424, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.1466, + "step": 385 + }, + { + "epoch": 0.0033664160750728226, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.1209, + "step": 386 + }, + { + "epoch": 0.003375137360241405, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.1284, + "step": 387 + }, + { + "epoch": 0.0033838586454099876, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.1414, + "step": 388 + }, + { + "epoch": 0.00339257993057857, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.1556, + "step": 389 + }, + { + "epoch": 0.0034013012157471526, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.1346, + "step": 390 + }, + { + "epoch": 0.003410022500915735, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.1589, + "step": 391 + }, + { + "epoch": 0.0034187437860843176, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.1349, + "step": 392 + }, + { + "epoch": 0.0034274650712528996, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.1554, + "step": 393 + }, + { + "epoch": 0.003436186356421482, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.1441, + "step": 394 + }, + { + "epoch": 0.0034449076415900646, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.162, + "step": 395 + }, + { + "epoch": 0.003453628926758647, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.1335, + "step": 396 + }, + { + "epoch": 0.0034623502119272296, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.1424, + "step": 397 + }, + { + "epoch": 0.003471071497095812, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.139, + "step": 398 + }, + { + "epoch": 0.0034797927822643945, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.1717, + "step": 399 + }, + { + "epoch": 0.003488514067432977, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.1301, + "step": 400 + }, + { + "epoch": 0.0034972353526015595, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.15, + "step": 401 + }, + { + "epoch": 0.003505956637770142, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.1425, + "step": 402 + }, + { + "epoch": 0.003514677922938724, + "grad_norm": 0.486328125, + "learning_rate": 0.0005, + "loss": 1.1429, + "step": 403 + }, + { + "epoch": 0.0035233992081073065, + "grad_norm": 0.55078125, + "learning_rate": 0.0005, + "loss": 1.1604, + "step": 404 + }, + { + "epoch": 0.003532120493275889, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.1359, + "step": 405 + }, + { + "epoch": 0.0035408417784444715, + "grad_norm": 0.345703125, + "learning_rate": 0.0005, + "loss": 1.1402, + "step": 406 + }, + { + "epoch": 0.003549563063613054, + "grad_norm": 0.55078125, + "learning_rate": 0.0005, + "loss": 1.1458, + "step": 407 + }, + { + "epoch": 0.0035582843487816365, + "grad_norm": 0.50390625, + "learning_rate": 0.0005, + "loss": 1.1566, + "step": 408 + }, + { + "epoch": 0.003567005633950219, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.1342, + "step": 409 + }, + { + "epoch": 0.0035757269191188015, + "grad_norm": 0.361328125, + "learning_rate": 0.0005, + "loss": 1.1352, + "step": 410 + }, + { + "epoch": 0.003584448204287384, + "grad_norm": 0.494140625, + "learning_rate": 0.0005, + "loss": 1.1258, + "step": 411 + }, + { + "epoch": 0.0035931694894559664, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.143, + "step": 412 + }, + { + "epoch": 0.0036018907746245485, + "grad_norm": 0.66015625, + "learning_rate": 0.0005, + "loss": 1.1776, + "step": 413 + }, + { + "epoch": 0.003610612059793131, + "grad_norm": 0.671875, + "learning_rate": 0.0005, + "loss": 1.1675, + "step": 414 + }, + { + "epoch": 0.0036193333449617135, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.1444, + "step": 415 + }, + { + "epoch": 0.003628054630130296, + "grad_norm": 0.49609375, + "learning_rate": 0.0005, + "loss": 1.1306, + "step": 416 + }, + { + "epoch": 0.0036367759152988784, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.1498, + "step": 417 + }, + { + "epoch": 0.003645497200467461, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.1512, + "step": 418 + }, + { + "epoch": 0.0036542184856360434, + "grad_norm": 0.42578125, + "learning_rate": 0.0005, + "loss": 1.1593, + "step": 419 + }, + { + "epoch": 0.003662939770804626, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.1464, + "step": 420 + }, + { + "epoch": 0.0036716610559732084, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.1589, + "step": 421 + }, + { + "epoch": 0.003680382341141791, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.1272, + "step": 422 + }, + { + "epoch": 0.003689103626310373, + "grad_norm": 0.376953125, + "learning_rate": 0.0005, + "loss": 1.1544, + "step": 423 + }, + { + "epoch": 0.0036978249114789554, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.1263, + "step": 424 + }, + { + "epoch": 0.003706546196647538, + "grad_norm": 0.39453125, + "learning_rate": 0.0005, + "loss": 1.1324, + "step": 425 + }, + { + "epoch": 0.0037152674818161204, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.1151, + "step": 426 + }, + { + "epoch": 0.003723988766984703, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.1287, + "step": 427 + }, + { + "epoch": 0.0037327100521532854, + "grad_norm": 0.421875, + "learning_rate": 0.0005, + "loss": 1.1481, + "step": 428 + }, + { + "epoch": 0.003741431337321868, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.1501, + "step": 429 + }, + { + "epoch": 0.0037501526224904503, + "grad_norm": 0.44921875, + "learning_rate": 0.0005, + "loss": 1.1362, + "step": 430 + }, + { + "epoch": 0.003758873907659033, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.1326, + "step": 431 + }, + { + "epoch": 0.003767595192827615, + "grad_norm": 0.369140625, + "learning_rate": 0.0005, + "loss": 1.1396, + "step": 432 + }, + { + "epoch": 0.0037763164779961974, + "grad_norm": 0.4921875, + "learning_rate": 0.0005, + "loss": 1.1366, + "step": 433 + }, + { + "epoch": 0.00378503776316478, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.1514, + "step": 434 + }, + { + "epoch": 0.0037937590483333623, + "grad_norm": 0.375, + "learning_rate": 0.0005, + "loss": 1.1195, + "step": 435 + }, + { + "epoch": 0.003802480333501945, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.1287, + "step": 436 + }, + { + "epoch": 0.0038112016186705273, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.1565, + "step": 437 + }, + { + "epoch": 0.00381992290383911, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.1513, + "step": 438 + }, + { + "epoch": 0.0038286441890076923, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.1365, + "step": 439 + }, + { + "epoch": 0.0038373654741762748, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.1538, + "step": 440 + }, + { + "epoch": 0.0038460867593448573, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.1528, + "step": 441 + }, + { + "epoch": 0.0038548080445134393, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.1558, + "step": 442 + }, + { + "epoch": 0.003863529329682022, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.1553, + "step": 443 + }, + { + "epoch": 0.0038722506148506043, + "grad_norm": 0.396484375, + "learning_rate": 0.0005, + "loss": 1.1371, + "step": 444 + }, + { + "epoch": 0.0038809719000191868, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.1432, + "step": 445 + }, + { + "epoch": 0.0038896931851877693, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.1391, + "step": 446 + }, + { + "epoch": 0.0038984144703563517, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.1535, + "step": 447 + }, + { + "epoch": 0.003907135755524934, + "grad_norm": 0.3828125, + "learning_rate": 0.0005, + "loss": 1.1461, + "step": 448 + }, + { + "epoch": 0.003915857040693516, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.1393, + "step": 449 + }, + { + "epoch": 0.003924578325862099, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.154, + "step": 450 + }, + { + "epoch": 0.003933299611030681, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.1317, + "step": 451 + }, + { + "epoch": 0.003942020896199264, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.1402, + "step": 452 + }, + { + "epoch": 0.003950742181367846, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1156, + "step": 453 + }, + { + "epoch": 0.003959463466536429, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.1493, + "step": 454 + }, + { + "epoch": 0.003968184751705011, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.1405, + "step": 455 + }, + { + "epoch": 0.003976906036873594, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.1243, + "step": 456 + }, + { + "epoch": 0.003985627322042176, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.1431, + "step": 457 + }, + { + "epoch": 0.003994348607210758, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.1293, + "step": 458 + }, + { + "epoch": 0.004003069892379341, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.1416, + "step": 459 + }, + { + "epoch": 0.004011791177547923, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.1398, + "step": 460 + }, + { + "epoch": 0.004020512462716506, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.1379, + "step": 461 + }, + { + "epoch": 0.004029233747885088, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.1343, + "step": 462 + }, + { + "epoch": 0.004037955033053671, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.1422, + "step": 463 + }, + { + "epoch": 0.004046676318222253, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.1325, + "step": 464 + }, + { + "epoch": 0.004055397603390836, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.1473, + "step": 465 + }, + { + "epoch": 0.004064118888559418, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.1485, + "step": 466 + }, + { + "epoch": 0.004072840173728, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.1343, + "step": 467 + }, + { + "epoch": 0.004081561458896583, + "grad_norm": 0.59375, + "learning_rate": 0.0005, + "loss": 1.1255, + "step": 468 + }, + { + "epoch": 0.004090282744065165, + "grad_norm": 0.6875, + "learning_rate": 0.0005, + "loss": 1.145, + "step": 469 + }, + { + "epoch": 0.004099004029233748, + "grad_norm": 0.46875, + "learning_rate": 0.0005, + "loss": 1.1564, + "step": 470 + }, + { + "epoch": 0.00410772531440233, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.1508, + "step": 471 + }, + { + "epoch": 0.004116446599570913, + "grad_norm": 0.423828125, + "learning_rate": 0.0005, + "loss": 1.1349, + "step": 472 + }, + { + "epoch": 0.004125167884739495, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.1621, + "step": 473 + }, + { + "epoch": 0.004133889169908078, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.1097, + "step": 474 + }, + { + "epoch": 0.00414261045507666, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.1542, + "step": 475 + }, + { + "epoch": 0.004151331740245242, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.1575, + "step": 476 + }, + { + "epoch": 0.004160053025413825, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.1354, + "step": 477 + }, + { + "epoch": 0.004168774310582407, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.1421, + "step": 478 + }, + { + "epoch": 0.00417749559575099, + "grad_norm": 0.439453125, + "learning_rate": 0.0005, + "loss": 1.1231, + "step": 479 + }, + { + "epoch": 0.004186216880919572, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.1303, + "step": 480 + }, + { + "epoch": 0.004194938166088155, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.1507, + "step": 481 + }, + { + "epoch": 0.004203659451256737, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.1467, + "step": 482 + }, + { + "epoch": 0.00421238073642532, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.1195, + "step": 483 + }, + { + "epoch": 0.004221102021593902, + "grad_norm": 0.392578125, + "learning_rate": 0.0005, + "loss": 1.1241, + "step": 484 + }, + { + "epoch": 0.004229823306762485, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.1422, + "step": 485 + }, + { + "epoch": 0.004238544591931067, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.1421, + "step": 486 + }, + { + "epoch": 0.004247265877099649, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.1268, + "step": 487 + }, + { + "epoch": 0.004255987162268232, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.1442, + "step": 488 + }, + { + "epoch": 0.004264708447436814, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.1334, + "step": 489 + }, + { + "epoch": 0.004273429732605397, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.142, + "step": 490 + }, + { + "epoch": 0.004282151017773979, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.1415, + "step": 491 + }, + { + "epoch": 0.004290872302942562, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.1322, + "step": 492 + }, + { + "epoch": 0.004299593588111144, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.1355, + "step": 493 + }, + { + "epoch": 0.004308314873279727, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.1577, + "step": 494 + }, + { + "epoch": 0.004317036158448309, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.131, + "step": 495 + }, + { + "epoch": 0.004325757443616891, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.144, + "step": 496 + }, + { + "epoch": 0.004334478728785474, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1463, + "step": 497 + }, + { + "epoch": 0.004343200013954056, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.1358, + "step": 498 + }, + { + "epoch": 0.004351921299122639, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.1464, + "step": 499 + }, + { + "epoch": 0.004360642584291221, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1234, + "step": 500 + }, + { + "epoch": 0.004369363869459804, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.1298, + "step": 501 + }, + { + "epoch": 0.004378085154628386, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.1346, + "step": 502 + }, + { + "epoch": 0.004386806439796969, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.1354, + "step": 503 + }, + { + "epoch": 0.004395527724965551, + "grad_norm": 0.49609375, + "learning_rate": 0.0005, + "loss": 1.1369, + "step": 504 + }, + { + "epoch": 0.004404249010134133, + "grad_norm": 0.53125, + "learning_rate": 0.0005, + "loss": 1.1504, + "step": 505 + }, + { + "epoch": 0.004412970295302716, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.1082, + "step": 506 + }, + { + "epoch": 0.004421691580471298, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.1293, + "step": 507 + }, + { + "epoch": 0.004430412865639881, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.147, + "step": 508 + }, + { + "epoch": 0.004439134150808463, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.1063, + "step": 509 + }, + { + "epoch": 0.004447855435977046, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.1415, + "step": 510 + }, + { + "epoch": 0.004456576721145628, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.148, + "step": 511 + }, + { + "epoch": 0.004465298006314211, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.1347, + "step": 512 + }, + { + "epoch": 0.004474019291482793, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.1489, + "step": 513 + }, + { + "epoch": 0.004482740576651376, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.1608, + "step": 514 + }, + { + "epoch": 0.004491461861819958, + "grad_norm": 0.5390625, + "learning_rate": 0.0005, + "loss": 1.1383, + "step": 515 + }, + { + "epoch": 0.00450018314698854, + "grad_norm": 0.361328125, + "learning_rate": 0.0005, + "loss": 1.1507, + "step": 516 + }, + { + "epoch": 0.004508904432157123, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.1233, + "step": 517 + }, + { + "epoch": 0.004517625717325705, + "grad_norm": 0.65625, + "learning_rate": 0.0005, + "loss": 1.1428, + "step": 518 + }, + { + "epoch": 0.004526347002494288, + "grad_norm": 0.59375, + "learning_rate": 0.0005, + "loss": 1.1261, + "step": 519 + }, + { + "epoch": 0.00453506828766287, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.1333, + "step": 520 + }, + { + "epoch": 0.004543789572831453, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.1374, + "step": 521 + }, + { + "epoch": 0.004552510858000035, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.1323, + "step": 522 + }, + { + "epoch": 0.004561232143168618, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.1326, + "step": 523 + }, + { + "epoch": 0.0045699534283372, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.125, + "step": 524 + }, + { + "epoch": 0.004578674713505782, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.1503, + "step": 525 + }, + { + "epoch": 0.004587395998674365, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.1222, + "step": 526 + }, + { + "epoch": 0.004596117283842947, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.1366, + "step": 527 + }, + { + "epoch": 0.00460483856901153, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.1251, + "step": 528 + }, + { + "epoch": 0.004613559854180112, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.1291, + "step": 529 + }, + { + "epoch": 0.004622281139348695, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.125, + "step": 530 + }, + { + "epoch": 0.004631002424517277, + "grad_norm": 0.373046875, + "learning_rate": 0.0005, + "loss": 1.1599, + "step": 531 + }, + { + "epoch": 0.00463972370968586, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.144, + "step": 532 + }, + { + "epoch": 0.004648444994854442, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.159, + "step": 533 + }, + { + "epoch": 0.004657166280023024, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.1187, + "step": 534 + }, + { + "epoch": 0.004665887565191607, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.1319, + "step": 535 + }, + { + "epoch": 0.004674608850360189, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.1113, + "step": 536 + }, + { + "epoch": 0.004683330135528772, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.1321, + "step": 537 + }, + { + "epoch": 0.004692051420697354, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.1399, + "step": 538 + }, + { + "epoch": 0.004700772705865937, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.1191, + "step": 539 + }, + { + "epoch": 0.004709493991034519, + "grad_norm": 0.326171875, + "learning_rate": 0.0005, + "loss": 1.1323, + "step": 540 + }, + { + "epoch": 0.004718215276203102, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.1203, + "step": 541 + }, + { + "epoch": 0.004726936561371684, + "grad_norm": 0.3671875, + "learning_rate": 0.0005, + "loss": 1.1483, + "step": 542 + }, + { + "epoch": 0.004735657846540267, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.138, + "step": 543 + }, + { + "epoch": 0.004744379131708849, + "grad_norm": 0.375, + "learning_rate": 0.0005, + "loss": 1.1346, + "step": 544 + }, + { + "epoch": 0.004753100416877431, + "grad_norm": 0.4375, + "learning_rate": 0.0005, + "loss": 1.1233, + "step": 545 + }, + { + "epoch": 0.004761821702046014, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.121, + "step": 546 + }, + { + "epoch": 0.004770542987214596, + "grad_norm": 0.361328125, + "learning_rate": 0.0005, + "loss": 1.126, + "step": 547 + }, + { + "epoch": 0.004779264272383179, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.116, + "step": 548 + }, + { + "epoch": 0.004787985557551761, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.137, + "step": 549 + }, + { + "epoch": 0.004796706842720344, + "grad_norm": 0.34375, + "learning_rate": 0.0005, + "loss": 1.1171, + "step": 550 + }, + { + "epoch": 0.004805428127888926, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.1375, + "step": 551 + }, + { + "epoch": 0.0048141494130575085, + "grad_norm": 0.43359375, + "learning_rate": 0.0005, + "loss": 1.1382, + "step": 552 + }, + { + "epoch": 0.004822870698226091, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.1351, + "step": 553 + }, + { + "epoch": 0.004831591983394673, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1476, + "step": 554 + }, + { + "epoch": 0.004840313268563256, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.1492, + "step": 555 + }, + { + "epoch": 0.004849034553731838, + "grad_norm": 0.37890625, + "learning_rate": 0.0005, + "loss": 1.1366, + "step": 556 + }, + { + "epoch": 0.0048577558389004205, + "grad_norm": 0.37890625, + "learning_rate": 0.0005, + "loss": 1.1236, + "step": 557 + }, + { + "epoch": 0.004866477124069003, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.1346, + "step": 558 + }, + { + "epoch": 0.0048751984092375855, + "grad_norm": 0.3828125, + "learning_rate": 0.0005, + "loss": 1.139, + "step": 559 + }, + { + "epoch": 0.004883919694406168, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.1334, + "step": 560 + }, + { + "epoch": 0.0048926409795747505, + "grad_norm": 0.4453125, + "learning_rate": 0.0005, + "loss": 1.1287, + "step": 561 + }, + { + "epoch": 0.0049013622647433325, + "grad_norm": 0.62109375, + "learning_rate": 0.0005, + "loss": 1.1398, + "step": 562 + }, + { + "epoch": 0.004910083549911915, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.1617, + "step": 563 + }, + { + "epoch": 0.0049188048350804975, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.1291, + "step": 564 + }, + { + "epoch": 0.00492752612024908, + "grad_norm": 0.486328125, + "learning_rate": 0.0005, + "loss": 1.1329, + "step": 565 + }, + { + "epoch": 0.0049362474054176625, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.1327, + "step": 566 + }, + { + "epoch": 0.0049449686905862445, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.1424, + "step": 567 + }, + { + "epoch": 0.0049536899757548275, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.1453, + "step": 568 + }, + { + "epoch": 0.0049624112609234095, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.1365, + "step": 569 + }, + { + "epoch": 0.0049711325460919924, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.1318, + "step": 570 + }, + { + "epoch": 0.0049798538312605745, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.1266, + "step": 571 + }, + { + "epoch": 0.004988575116429157, + "grad_norm": 0.455078125, + "learning_rate": 0.0005, + "loss": 1.1253, + "step": 572 + }, + { + "epoch": 0.0049972964015977395, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.1169, + "step": 573 + }, + { + "epoch": 0.0050060176867663215, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.1285, + "step": 574 + }, + { + "epoch": 0.0050147389719349044, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.1299, + "step": 575 + }, + { + "epoch": 0.0050234602571034865, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.1492, + "step": 576 + }, + { + "epoch": 0.005032181542272069, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.125, + "step": 577 + }, + { + "epoch": 0.0050409028274406515, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.113, + "step": 578 + }, + { + "epoch": 0.005049624112609234, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.1278, + "step": 579 + }, + { + "epoch": 0.0050583453977778164, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.1244, + "step": 580 + }, + { + "epoch": 0.005067066682946399, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.1235, + "step": 581 + }, + { + "epoch": 0.005075787968114981, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 582 + }, + { + "epoch": 0.0050845092532835635, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 583 + }, + { + "epoch": 0.005093230538452146, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.1407, + "step": 584 + }, + { + "epoch": 0.0051019518236207284, + "grad_norm": 0.4609375, + "learning_rate": 0.0005, + "loss": 1.119, + "step": 585 + }, + { + "epoch": 0.005110673108789311, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.1115, + "step": 586 + }, + { + "epoch": 0.005119394393957893, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.1211, + "step": 587 + }, + { + "epoch": 0.005128115679126476, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.1276, + "step": 588 + }, + { + "epoch": 0.005136836964295058, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 1.1233, + "step": 589 + }, + { + "epoch": 0.005145558249463641, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.1256, + "step": 590 + }, + { + "epoch": 0.005154279534632223, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.1272, + "step": 591 + }, + { + "epoch": 0.005163000819800805, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1216, + "step": 592 + }, + { + "epoch": 0.005171722104969388, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.1227, + "step": 593 + }, + { + "epoch": 0.00518044339013797, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.1056, + "step": 594 + }, + { + "epoch": 0.005189164675306553, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.1134, + "step": 595 + }, + { + "epoch": 0.005197885960475135, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 596 + }, + { + "epoch": 0.005206607245643718, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.1309, + "step": 597 + }, + { + "epoch": 0.0052153285308123, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.1441, + "step": 598 + }, + { + "epoch": 0.005224049815980883, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.1002, + "step": 599 + }, + { + "epoch": 0.005232771101149465, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.1246, + "step": 600 + }, + { + "epoch": 0.005241492386318048, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.1176, + "step": 601 + }, + { + "epoch": 0.00525021367148663, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.1407, + "step": 602 + }, + { + "epoch": 0.005258934956655212, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.1125, + "step": 603 + }, + { + "epoch": 0.005267656241823795, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.1211, + "step": 604 + }, + { + "epoch": 0.005276377526992377, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.115, + "step": 605 + }, + { + "epoch": 0.00528509881216096, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.1235, + "step": 606 + }, + { + "epoch": 0.005293820097329542, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.1416, + "step": 607 + }, + { + "epoch": 0.005302541382498125, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1277, + "step": 608 + }, + { + "epoch": 0.005311262667666707, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.1233, + "step": 609 + }, + { + "epoch": 0.00531998395283529, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.1444, + "step": 610 + }, + { + "epoch": 0.005328705238003872, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.1111, + "step": 611 + }, + { + "epoch": 0.005337426523172454, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.1391, + "step": 612 + }, + { + "epoch": 0.005346147808341037, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.1228, + "step": 613 + }, + { + "epoch": 0.005354869093509619, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1377, + "step": 614 + }, + { + "epoch": 0.005363590378678202, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1259, + "step": 615 + }, + { + "epoch": 0.005372311663846784, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0979, + "step": 616 + }, + { + "epoch": 0.005381032949015367, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.1348, + "step": 617 + }, + { + "epoch": 0.005389754234183949, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.1059, + "step": 618 + }, + { + "epoch": 0.005398475519352532, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.1285, + "step": 619 + }, + { + "epoch": 0.005407196804521114, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.1367, + "step": 620 + }, + { + "epoch": 0.005415918089689696, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.1152, + "step": 621 + }, + { + "epoch": 0.005424639374858279, + "grad_norm": 0.404296875, + "learning_rate": 0.0005, + "loss": 1.1393, + "step": 622 + }, + { + "epoch": 0.005433360660026861, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.132, + "step": 623 + }, + { + "epoch": 0.005442081945195444, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.113, + "step": 624 + }, + { + "epoch": 0.005450803230364026, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.1186, + "step": 625 + }, + { + "epoch": 0.005459524515532609, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.1212, + "step": 626 + }, + { + "epoch": 0.005468245800701191, + "grad_norm": 0.341796875, + "learning_rate": 0.0005, + "loss": 1.1239, + "step": 627 + }, + { + "epoch": 0.005476967085869774, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.1384, + "step": 628 + }, + { + "epoch": 0.005485688371038356, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.1093, + "step": 629 + }, + { + "epoch": 0.005494409656206939, + "grad_norm": 0.57421875, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 630 + }, + { + "epoch": 0.005503130941375521, + "grad_norm": 0.63671875, + "learning_rate": 0.0005, + "loss": 1.1252, + "step": 631 + }, + { + "epoch": 0.005511852226544103, + "grad_norm": 0.41015625, + "learning_rate": 0.0005, + "loss": 1.1242, + "step": 632 + }, + { + "epoch": 0.005520573511712686, + "grad_norm": 0.337890625, + "learning_rate": 0.0005, + "loss": 1.1147, + "step": 633 + }, + { + "epoch": 0.005529294796881268, + "grad_norm": 0.87890625, + "learning_rate": 0.0005, + "loss": 1.1367, + "step": 634 + }, + { + "epoch": 0.005538016082049851, + "grad_norm": 0.76953125, + "learning_rate": 0.0005, + "loss": 1.1561, + "step": 635 + }, + { + "epoch": 0.005546737367218433, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.1432, + "step": 636 + }, + { + "epoch": 0.005555458652387016, + "grad_norm": 1.046875, + "learning_rate": 0.0005, + "loss": 1.1369, + "step": 637 + }, + { + "epoch": 0.005564179937555598, + "grad_norm": 0.7578125, + "learning_rate": 0.0005, + "loss": 1.137, + "step": 638 + }, + { + "epoch": 0.005572901222724181, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.1011, + "step": 639 + }, + { + "epoch": 0.005581622507892763, + "grad_norm": 0.81640625, + "learning_rate": 0.0005, + "loss": 1.1398, + "step": 640 + }, + { + "epoch": 0.005590343793061345, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.1145, + "step": 641 + }, + { + "epoch": 0.005599065078229928, + "grad_norm": 0.51171875, + "learning_rate": 0.0005, + "loss": 1.1358, + "step": 642 + }, + { + "epoch": 0.00560778636339851, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.1121, + "step": 643 + }, + { + "epoch": 0.005616507648567093, + "grad_norm": 0.44140625, + "learning_rate": 0.0005, + "loss": 1.11, + "step": 644 + }, + { + "epoch": 0.005625228933735675, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.1119, + "step": 645 + }, + { + "epoch": 0.005633950218904258, + "grad_norm": 0.39453125, + "learning_rate": 0.0005, + "loss": 1.1227, + "step": 646 + }, + { + "epoch": 0.00564267150407284, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.1259, + "step": 647 + }, + { + "epoch": 0.005651392789241423, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.1245, + "step": 648 + }, + { + "epoch": 0.005660114074410005, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.129, + "step": 649 + }, + { + "epoch": 0.005668835359578588, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.1201, + "step": 650 + }, + { + "epoch": 0.00567755664474717, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.1219, + "step": 651 + }, + { + "epoch": 0.005686277929915752, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.1438, + "step": 652 + }, + { + "epoch": 0.005694999215084335, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.1301, + "step": 653 + }, + { + "epoch": 0.005703720500252917, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.1274, + "step": 654 + }, + { + "epoch": 0.0057124417854215, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.1208, + "step": 655 + }, + { + "epoch": 0.005721163070590082, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.1415, + "step": 656 + }, + { + "epoch": 0.005729884355758665, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.1253, + "step": 657 + }, + { + "epoch": 0.005738605640927247, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.1361, + "step": 658 + }, + { + "epoch": 0.00574732692609583, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.1059, + "step": 659 + }, + { + "epoch": 0.005756048211264412, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.1312, + "step": 660 + }, + { + "epoch": 0.005764769496432994, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.1213, + "step": 661 + }, + { + "epoch": 0.005773490781601577, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.1201, + "step": 662 + }, + { + "epoch": 0.005782212066770159, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.1402, + "step": 663 + }, + { + "epoch": 0.005790933351938742, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.1216, + "step": 664 + }, + { + "epoch": 0.005799654637107324, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.1382, + "step": 665 + }, + { + "epoch": 0.005808375922275907, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.122, + "step": 666 + }, + { + "epoch": 0.005817097207444489, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.1222, + "step": 667 + }, + { + "epoch": 0.005825818492613072, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.133, + "step": 668 + }, + { + "epoch": 0.005834539777781654, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.1082, + "step": 669 + }, + { + "epoch": 0.005843261062950236, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.1333, + "step": 670 + }, + { + "epoch": 0.005851982348118819, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.131, + "step": 671 + }, + { + "epoch": 0.005860703633287401, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.13, + "step": 672 + }, + { + "epoch": 0.005869424918455984, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.1258, + "step": 673 + }, + { + "epoch": 0.005878146203624566, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.1251, + "step": 674 + }, + { + "epoch": 0.005886867488793149, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.124, + "step": 675 + }, + { + "epoch": 0.005895588773961731, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.1281, + "step": 676 + }, + { + "epoch": 0.005904310059130314, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.1421, + "step": 677 + }, + { + "epoch": 0.005913031344298896, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 678 + }, + { + "epoch": 0.005921752629467479, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.1419, + "step": 679 + }, + { + "epoch": 0.005930473914636061, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.1294, + "step": 680 + }, + { + "epoch": 0.005939195199804643, + "grad_norm": 0.54296875, + "learning_rate": 0.0005, + "loss": 1.1271, + "step": 681 + }, + { + "epoch": 0.005947916484973226, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.1039, + "step": 682 + }, + { + "epoch": 0.005956637770141808, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.1179, + "step": 683 + }, + { + "epoch": 0.005965359055310391, + "grad_norm": 0.337890625, + "learning_rate": 0.0005, + "loss": 1.1272, + "step": 684 + }, + { + "epoch": 0.005974080340478973, + "grad_norm": 0.345703125, + "learning_rate": 0.0005, + "loss": 1.1258, + "step": 685 + }, + { + "epoch": 0.005982801625647556, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.138, + "step": 686 + }, + { + "epoch": 0.005991522910816138, + "grad_norm": 0.4921875, + "learning_rate": 0.0005, + "loss": 1.1302, + "step": 687 + }, + { + "epoch": 0.006000244195984721, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.11, + "step": 688 + }, + { + "epoch": 0.006008965481153303, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.1458, + "step": 689 + }, + { + "epoch": 0.006017686766321885, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.1343, + "step": 690 + }, + { + "epoch": 0.006026408051490468, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.147, + "step": 691 + }, + { + "epoch": 0.00603512933665905, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.1356, + "step": 692 + }, + { + "epoch": 0.006043850621827633, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.115, + "step": 693 + }, + { + "epoch": 0.006052571906996215, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.1259, + "step": 694 + }, + { + "epoch": 0.006061293192164798, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.1139, + "step": 695 + }, + { + "epoch": 0.00607001447733338, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.1161, + "step": 696 + }, + { + "epoch": 0.006078735762501963, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.1326, + "step": 697 + }, + { + "epoch": 0.006087457047670545, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.1293, + "step": 698 + }, + { + "epoch": 0.006096178332839127, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.1251, + "step": 699 + }, + { + "epoch": 0.00610489961800771, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.1383, + "step": 700 + }, + { + "epoch": 0.006113620903176292, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.1113, + "step": 701 + }, + { + "epoch": 0.006122342188344875, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.1127, + "step": 702 + }, + { + "epoch": 0.006131063473513457, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.1439, + "step": 703 + }, + { + "epoch": 0.00613978475868204, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.138, + "step": 704 + }, + { + "epoch": 0.006148506043850622, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.1058, + "step": 705 + }, + { + "epoch": 0.006157227329019205, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 706 + }, + { + "epoch": 0.006165948614187787, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.1195, + "step": 707 + }, + { + "epoch": 0.00617466989935637, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.1311, + "step": 708 + }, + { + "epoch": 0.006183391184524952, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.1273, + "step": 709 + }, + { + "epoch": 0.006192112469693534, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.1153, + "step": 710 + }, + { + "epoch": 0.006200833754862117, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.1366, + "step": 711 + }, + { + "epoch": 0.006209555040030699, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.1397, + "step": 712 + }, + { + "epoch": 0.006218276325199282, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.1131, + "step": 713 + }, + { + "epoch": 0.006226997610367864, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.1278, + "step": 714 + }, + { + "epoch": 0.0062357188955364466, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.1021, + "step": 715 + }, + { + "epoch": 0.006244440180705029, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.1113, + "step": 716 + }, + { + "epoch": 0.0062531614658736115, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.1165, + "step": 717 + }, + { + "epoch": 0.006261882751042194, + "grad_norm": 0.51953125, + "learning_rate": 0.0005, + "loss": 1.1357, + "step": 718 + }, + { + "epoch": 0.006270604036210776, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0965, + "step": 719 + }, + { + "epoch": 0.0062793253213793586, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.1208, + "step": 720 + }, + { + "epoch": 0.006288046606547941, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.1088, + "step": 721 + }, + { + "epoch": 0.0062967678917165235, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.1075, + "step": 722 + }, + { + "epoch": 0.006305489176885106, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.1299, + "step": 723 + }, + { + "epoch": 0.0063142104620536885, + "grad_norm": 0.5625, + "learning_rate": 0.0005, + "loss": 1.0965, + "step": 724 + }, + { + "epoch": 0.0063229317472222706, + "grad_norm": 0.62109375, + "learning_rate": 0.0005, + "loss": 1.1277, + "step": 725 + }, + { + "epoch": 0.0063316530323908535, + "grad_norm": 0.5703125, + "learning_rate": 0.0005, + "loss": 1.115, + "step": 726 + }, + { + "epoch": 0.0063403743175594355, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.1196, + "step": 727 + }, + { + "epoch": 0.006349095602728018, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.1301, + "step": 728 + }, + { + "epoch": 0.0063578168878966005, + "grad_norm": 0.46875, + "learning_rate": 0.0005, + "loss": 1.1277, + "step": 729 + }, + { + "epoch": 0.0063665381730651826, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1201, + "step": 730 + }, + { + "epoch": 0.0063752594582337655, + "grad_norm": 0.46484375, + "learning_rate": 0.0005, + "loss": 1.123, + "step": 731 + }, + { + "epoch": 0.0063839807434023475, + "grad_norm": 0.6484375, + "learning_rate": 0.0005, + "loss": 1.1235, + "step": 732 + }, + { + "epoch": 0.0063927020285709305, + "grad_norm": 0.419921875, + "learning_rate": 0.0005, + "loss": 1.1229, + "step": 733 + }, + { + "epoch": 0.0064014233137395125, + "grad_norm": 0.4453125, + "learning_rate": 0.0005, + "loss": 1.1153, + "step": 734 + }, + { + "epoch": 0.006410144598908095, + "grad_norm": 0.94140625, + "learning_rate": 0.0005, + "loss": 1.1314, + "step": 735 + }, + { + "epoch": 0.0064188658840766775, + "grad_norm": 0.90625, + "learning_rate": 0.0005, + "loss": 1.1109, + "step": 736 + }, + { + "epoch": 0.00642758716924526, + "grad_norm": 0.4609375, + "learning_rate": 0.0005, + "loss": 1.1303, + "step": 737 + }, + { + "epoch": 0.0064363084544138425, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.1287, + "step": 738 + }, + { + "epoch": 0.0064450297395824245, + "grad_norm": 0.57421875, + "learning_rate": 0.0005, + "loss": 1.1247, + "step": 739 + }, + { + "epoch": 0.006453751024751007, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.1091, + "step": 740 + }, + { + "epoch": 0.0064624723099195895, + "grad_norm": 0.357421875, + "learning_rate": 0.0005, + "loss": 1.1211, + "step": 741 + }, + { + "epoch": 0.006471193595088172, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.1218, + "step": 742 + }, + { + "epoch": 0.0064799148802567545, + "grad_norm": 0.5546875, + "learning_rate": 0.0005, + "loss": 1.1199, + "step": 743 + }, + { + "epoch": 0.006488636165425337, + "grad_norm": 0.32421875, + "learning_rate": 0.0005, + "loss": 1.1443, + "step": 744 + }, + { + "epoch": 0.006497357450593919, + "grad_norm": 0.345703125, + "learning_rate": 0.0005, + "loss": 1.1405, + "step": 745 + }, + { + "epoch": 0.006506078735762502, + "grad_norm": 0.34375, + "learning_rate": 0.0005, + "loss": 1.1157, + "step": 746 + }, + { + "epoch": 0.006514800020931084, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.1387, + "step": 747 + }, + { + "epoch": 0.0065235213060996665, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.1009, + "step": 748 + }, + { + "epoch": 0.006532242591268249, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.1197, + "step": 749 + }, + { + "epoch": 0.006540963876436831, + "grad_norm": 0.4296875, + "learning_rate": 0.0005, + "loss": 1.128, + "step": 750 + }, + { + "epoch": 0.006549685161605414, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.1194, + "step": 751 + }, + { + "epoch": 0.006558406446773996, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.1208, + "step": 752 + }, + { + "epoch": 0.006567127731942579, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.1285, + "step": 753 + }, + { + "epoch": 0.006575849017111161, + "grad_norm": 0.388671875, + "learning_rate": 0.0005, + "loss": 1.1312, + "step": 754 + }, + { + "epoch": 0.006584570302279744, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.1206, + "step": 755 + }, + { + "epoch": 0.006593291587448326, + "grad_norm": 0.67578125, + "learning_rate": 0.0005, + "loss": 1.1178, + "step": 756 + }, + { + "epoch": 0.006602012872616908, + "grad_norm": 0.6328125, + "learning_rate": 0.0005, + "loss": 1.1178, + "step": 757 + }, + { + "epoch": 0.006610734157785491, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.1356, + "step": 758 + }, + { + "epoch": 0.006619455442954073, + "grad_norm": 0.341796875, + "learning_rate": 0.0005, + "loss": 1.1228, + "step": 759 + }, + { + "epoch": 0.006628176728122656, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.1323, + "step": 760 + }, + { + "epoch": 0.006636898013291238, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.1084, + "step": 761 + }, + { + "epoch": 0.006645619298459821, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.1156, + "step": 762 + }, + { + "epoch": 0.006654340583628403, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.1023, + "step": 763 + }, + { + "epoch": 0.006663061868796986, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.1034, + "step": 764 + }, + { + "epoch": 0.006671783153965568, + "grad_norm": 0.359375, + "learning_rate": 0.0005, + "loss": 1.1209, + "step": 765 + }, + { + "epoch": 0.006680504439134151, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.1324, + "step": 766 + }, + { + "epoch": 0.006689225724302733, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.1185, + "step": 767 + }, + { + "epoch": 0.006697947009471315, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.118, + "step": 768 + }, + { + "epoch": 0.006706668294639898, + "grad_norm": 0.453125, + "learning_rate": 0.0005, + "loss": 1.1237, + "step": 769 + }, + { + "epoch": 0.00671538957980848, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.1057, + "step": 770 + }, + { + "epoch": 0.006724110864977063, + "grad_norm": 0.361328125, + "learning_rate": 0.0005, + "loss": 1.1179, + "step": 771 + }, + { + "epoch": 0.006732832150145645, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.1106, + "step": 772 + }, + { + "epoch": 0.006741553435314228, + "grad_norm": 0.46875, + "learning_rate": 0.0005, + "loss": 1.1388, + "step": 773 + }, + { + "epoch": 0.00675027472048281, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.1295, + "step": 774 + }, + { + "epoch": 0.006758996005651393, + "grad_norm": 0.484375, + "learning_rate": 0.0005, + "loss": 1.1219, + "step": 775 + }, + { + "epoch": 0.006767717290819975, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.1172, + "step": 776 + }, + { + "epoch": 0.006776438575988557, + "grad_norm": 0.375, + "learning_rate": 0.0005, + "loss": 1.1264, + "step": 777 + }, + { + "epoch": 0.00678515986115714, + "grad_norm": 0.6015625, + "learning_rate": 0.0005, + "loss": 1.1195, + "step": 778 + }, + { + "epoch": 0.006793881146325722, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 779 + }, + { + "epoch": 0.006802602431494305, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.1182, + "step": 780 + }, + { + "epoch": 0.006811323716662887, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.1256, + "step": 781 + }, + { + "epoch": 0.00682004500183147, + "grad_norm": 0.43359375, + "learning_rate": 0.0005, + "loss": 1.1073, + "step": 782 + }, + { + "epoch": 0.006828766287000052, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.126, + "step": 783 + }, + { + "epoch": 0.006837487572168635, + "grad_norm": 0.38671875, + "learning_rate": 0.0005, + "loss": 1.0989, + "step": 784 + }, + { + "epoch": 0.006846208857337217, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.1224, + "step": 785 + }, + { + "epoch": 0.006854930142505799, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.1166, + "step": 786 + }, + { + "epoch": 0.006863651427674382, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.1315, + "step": 787 + }, + { + "epoch": 0.006872372712842964, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.1152, + "step": 788 + }, + { + "epoch": 0.006881093998011547, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.1339, + "step": 789 + }, + { + "epoch": 0.006889815283180129, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.1125, + "step": 790 + }, + { + "epoch": 0.006898536568348712, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.1264, + "step": 791 + }, + { + "epoch": 0.006907257853517294, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.1294, + "step": 792 + }, + { + "epoch": 0.006915979138685877, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.1122, + "step": 793 + }, + { + "epoch": 0.006924700423854459, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.1239, + "step": 794 + }, + { + "epoch": 0.006933421709023042, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.1027, + "step": 795 + }, + { + "epoch": 0.006942142994191624, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.1091, + "step": 796 + }, + { + "epoch": 0.006950864279360206, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.1272, + "step": 797 + }, + { + "epoch": 0.006959585564528789, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.1161, + "step": 798 + }, + { + "epoch": 0.006968306849697371, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.1088, + "step": 799 + }, + { + "epoch": 0.006977028134865954, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.1222, + "step": 800 + }, + { + "epoch": 0.006985749420034536, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.1111, + "step": 801 + }, + { + "epoch": 0.006994470705203119, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1049, + "step": 802 + }, + { + "epoch": 0.007003191990371701, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.1034, + "step": 803 + }, + { + "epoch": 0.007011913275540284, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 804 + }, + { + "epoch": 0.007020634560708866, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.1419, + "step": 805 + }, + { + "epoch": 0.007029355845877448, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.1066, + "step": 806 + }, + { + "epoch": 0.007038077131046031, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.1106, + "step": 807 + }, + { + "epoch": 0.007046798416214613, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.1015, + "step": 808 + }, + { + "epoch": 0.007055519701383196, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.1049, + "step": 809 + }, + { + "epoch": 0.007064240986551778, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.1036, + "step": 810 + }, + { + "epoch": 0.007072962271720361, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.1003, + "step": 811 + }, + { + "epoch": 0.007081683556888943, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.1198, + "step": 812 + }, + { + "epoch": 0.007090404842057526, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.1073, + "step": 813 + }, + { + "epoch": 0.007099126127226108, + "grad_norm": 0.37109375, + "learning_rate": 0.0005, + "loss": 1.1182, + "step": 814 + }, + { + "epoch": 0.00710784741239469, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 815 + }, + { + "epoch": 0.007116568697563273, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0985, + "step": 816 + }, + { + "epoch": 0.007125289982731855, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.1189, + "step": 817 + }, + { + "epoch": 0.007134011267900438, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.1186, + "step": 818 + }, + { + "epoch": 0.00714273255306902, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.147, + "step": 819 + }, + { + "epoch": 0.007151453838237603, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.1133, + "step": 820 + }, + { + "epoch": 0.007160175123406185, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.1239, + "step": 821 + }, + { + "epoch": 0.007168896408574768, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.1152, + "step": 822 + }, + { + "epoch": 0.00717761769374335, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.1023, + "step": 823 + }, + { + "epoch": 0.007186338978911933, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.1087, + "step": 824 + }, + { + "epoch": 0.007195060264080515, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.1056, + "step": 825 + }, + { + "epoch": 0.007203781549249097, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.1309, + "step": 826 + }, + { + "epoch": 0.00721250283441768, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.1109, + "step": 827 + }, + { + "epoch": 0.007221224119586262, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.1162, + "step": 828 + }, + { + "epoch": 0.007229945404754845, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.1197, + "step": 829 + }, + { + "epoch": 0.007238666689923427, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.1129, + "step": 830 + }, + { + "epoch": 0.00724738797509201, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.1274, + "step": 831 + }, + { + "epoch": 0.007256109260260592, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.1289, + "step": 832 + }, + { + "epoch": 0.007264830545429175, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.1125, + "step": 833 + }, + { + "epoch": 0.007273551830597757, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1152, + "step": 834 + }, + { + "epoch": 0.007282273115766339, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0927, + "step": 835 + }, + { + "epoch": 0.007290994400934922, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.1242, + "step": 836 + }, + { + "epoch": 0.007299715686103504, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.1238, + "step": 837 + }, + { + "epoch": 0.007308436971272087, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.1329, + "step": 838 + }, + { + "epoch": 0.007317158256440669, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0965, + "step": 839 + }, + { + "epoch": 0.007325879541609252, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.1155, + "step": 840 + }, + { + "epoch": 0.007334600826777834, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.1081, + "step": 841 + }, + { + "epoch": 0.007343322111946417, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.1123, + "step": 842 + }, + { + "epoch": 0.007352043397114999, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.127, + "step": 843 + }, + { + "epoch": 0.007360764682283582, + "grad_norm": 0.412109375, + "learning_rate": 0.0005, + "loss": 1.1099, + "step": 844 + }, + { + "epoch": 0.007369485967452164, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.1205, + "step": 845 + }, + { + "epoch": 0.007378207252620746, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0993, + "step": 846 + }, + { + "epoch": 0.007386928537789329, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.1124, + "step": 847 + }, + { + "epoch": 0.007395649822957911, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.1207, + "step": 848 + }, + { + "epoch": 0.007404371108126494, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.1206, + "step": 849 + }, + { + "epoch": 0.007413092393295076, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.1099, + "step": 850 + }, + { + "epoch": 0.007421813678463659, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0987, + "step": 851 + }, + { + "epoch": 0.007430534963632241, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0985, + "step": 852 + }, + { + "epoch": 0.007439256248800824, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.1238, + "step": 853 + }, + { + "epoch": 0.007447977533969406, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 854 + }, + { + "epoch": 0.007456698819137988, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.1103, + "step": 855 + }, + { + "epoch": 0.007465420104306571, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.1041, + "step": 856 + }, + { + "epoch": 0.007474141389475153, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.1126, + "step": 857 + }, + { + "epoch": 0.007482862674643736, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.1281, + "step": 858 + }, + { + "epoch": 0.007491583959812318, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.127, + "step": 859 + }, + { + "epoch": 0.007500305244980901, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.1133, + "step": 860 + }, + { + "epoch": 0.007509026530149483, + "grad_norm": 0.373046875, + "learning_rate": 0.0005, + "loss": 1.1273, + "step": 861 + }, + { + "epoch": 0.007517747815318066, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.1134, + "step": 862 + }, + { + "epoch": 0.007526469100486648, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0928, + "step": 863 + }, + { + "epoch": 0.00753519038565523, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.1201, + "step": 864 + }, + { + "epoch": 0.007543911670823813, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.1131, + "step": 865 + }, + { + "epoch": 0.007552632955992395, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0943, + "step": 866 + }, + { + "epoch": 0.007561354241160978, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.1056, + "step": 867 + }, + { + "epoch": 0.00757007552632956, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.1094, + "step": 868 + }, + { + "epoch": 0.007578796811498143, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.1196, + "step": 869 + }, + { + "epoch": 0.007587518096666725, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.1174, + "step": 870 + }, + { + "epoch": 0.007596239381835308, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.1206, + "step": 871 + }, + { + "epoch": 0.00760496066700389, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.1103, + "step": 872 + }, + { + "epoch": 0.0076136819521724726, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 873 + }, + { + "epoch": 0.007622403237341055, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.1109, + "step": 874 + }, + { + "epoch": 0.007631124522509637, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 875 + }, + { + "epoch": 0.00763984580767822, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.1077, + "step": 876 + }, + { + "epoch": 0.007648567092846802, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.1308, + "step": 877 + }, + { + "epoch": 0.0076572883780153846, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.1023, + "step": 878 + }, + { + "epoch": 0.007666009663183967, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0937, + "step": 879 + }, + { + "epoch": 0.0076747309483525495, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.123, + "step": 880 + }, + { + "epoch": 0.007683452233521132, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.1214, + "step": 881 + }, + { + "epoch": 0.0076921735186897145, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 882 + }, + { + "epoch": 0.0077008948038582966, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.1193, + "step": 883 + }, + { + "epoch": 0.007709616089026879, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.1171, + "step": 884 + }, + { + "epoch": 0.0077183373741954615, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.1097, + "step": 885 + }, + { + "epoch": 0.007727058659364044, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.112, + "step": 886 + }, + { + "epoch": 0.0077357799445326265, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.1159, + "step": 887 + }, + { + "epoch": 0.0077445012297012086, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.1128, + "step": 888 + }, + { + "epoch": 0.0077532225148697915, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.1204, + "step": 889 + }, + { + "epoch": 0.0077619438000383735, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.1065, + "step": 890 + }, + { + "epoch": 0.0077706650852069565, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0958, + "step": 891 + }, + { + "epoch": 0.0077793863703755385, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.1167, + "step": 892 + }, + { + "epoch": 0.0077881076555441206, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.1081, + "step": 893 + }, + { + "epoch": 0.0077968289407127035, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.1198, + "step": 894 + }, + { + "epoch": 0.0078055502258812855, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.1178, + "step": 895 + }, + { + "epoch": 0.007814271511049868, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.1007, + "step": 896 + }, + { + "epoch": 0.007822992796218451, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 897 + }, + { + "epoch": 0.007831714081387033, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.1054, + "step": 898 + }, + { + "epoch": 0.007840435366555615, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.1187, + "step": 899 + }, + { + "epoch": 0.007849156651724198, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.1169, + "step": 900 + }, + { + "epoch": 0.007857877936892781, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.1052, + "step": 901 + }, + { + "epoch": 0.007866599222061363, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.1033, + "step": 902 + }, + { + "epoch": 0.007875320507229945, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.1144, + "step": 903 + }, + { + "epoch": 0.007884041792398528, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0928, + "step": 904 + }, + { + "epoch": 0.00789276307756711, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 905 + }, + { + "epoch": 0.007901484362735692, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.1285, + "step": 906 + }, + { + "epoch": 0.007910205647904275, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0975, + "step": 907 + }, + { + "epoch": 0.007918926933072858, + "grad_norm": 0.365234375, + "learning_rate": 0.0005, + "loss": 1.1325, + "step": 908 + }, + { + "epoch": 0.00792764821824144, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.125, + "step": 909 + }, + { + "epoch": 0.007936369503410022, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.115, + "step": 910 + }, + { + "epoch": 0.007945090788578605, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.104, + "step": 911 + }, + { + "epoch": 0.007953812073747188, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.1012, + "step": 912 + }, + { + "epoch": 0.00796253335891577, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.1254, + "step": 913 + }, + { + "epoch": 0.007971254644084352, + "grad_norm": 0.431640625, + "learning_rate": 0.0005, + "loss": 1.1188, + "step": 914 + }, + { + "epoch": 0.007979975929252935, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.117, + "step": 915 + }, + { + "epoch": 0.007988697214421516, + "grad_norm": 0.490234375, + "learning_rate": 0.0005, + "loss": 1.1228, + "step": 916 + }, + { + "epoch": 0.0079974184995901, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.1034, + "step": 917 + }, + { + "epoch": 0.008006139784758682, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 918 + }, + { + "epoch": 0.008014861069927265, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.1255, + "step": 919 + }, + { + "epoch": 0.008023582355095846, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.1177, + "step": 920 + }, + { + "epoch": 0.00803230364026443, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.1123, + "step": 921 + }, + { + "epoch": 0.008041024925433012, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 922 + }, + { + "epoch": 0.008049746210601593, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.1235, + "step": 923 + }, + { + "epoch": 0.008058467495770176, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.1069, + "step": 924 + }, + { + "epoch": 0.00806718878093876, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.097, + "step": 925 + }, + { + "epoch": 0.008075910066107342, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 926 + }, + { + "epoch": 0.008084631351275923, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.1257, + "step": 927 + }, + { + "epoch": 0.008093352636444506, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.1089, + "step": 928 + }, + { + "epoch": 0.00810207392161309, + "grad_norm": 0.32421875, + "learning_rate": 0.0005, + "loss": 1.1084, + "step": 929 + }, + { + "epoch": 0.008110795206781672, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.1106, + "step": 930 + }, + { + "epoch": 0.008119516491950253, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.1082, + "step": 931 + }, + { + "epoch": 0.008128237777118836, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.1054, + "step": 932 + }, + { + "epoch": 0.00813695906228742, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.11, + "step": 933 + }, + { + "epoch": 0.008145680347456, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.1225, + "step": 934 + }, + { + "epoch": 0.008154401632624583, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.1135, + "step": 935 + }, + { + "epoch": 0.008163122917793166, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0956, + "step": 936 + }, + { + "epoch": 0.008171844202961749, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.1104, + "step": 937 + }, + { + "epoch": 0.00818056548813033, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0959, + "step": 938 + }, + { + "epoch": 0.008189286773298913, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.1376, + "step": 939 + }, + { + "epoch": 0.008198008058467496, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.1338, + "step": 940 + }, + { + "epoch": 0.008206729343636079, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.1157, + "step": 941 + }, + { + "epoch": 0.00821545062880466, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1123, + "step": 942 + }, + { + "epoch": 0.008224171913973243, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.1205, + "step": 943 + }, + { + "epoch": 0.008232893199141826, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.1064, + "step": 944 + }, + { + "epoch": 0.008241614484310407, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.136, + "step": 945 + }, + { + "epoch": 0.00825033576947899, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0908, + "step": 946 + }, + { + "epoch": 0.008259057054647573, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.097, + "step": 947 + }, + { + "epoch": 0.008267778339816156, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.1276, + "step": 948 + }, + { + "epoch": 0.008276499624984737, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.1234, + "step": 949 + }, + { + "epoch": 0.00828522091015332, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1144, + "step": 950 + }, + { + "epoch": 0.008293942195321903, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.1045, + "step": 951 + }, + { + "epoch": 0.008302663480490484, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.1053, + "step": 952 + }, + { + "epoch": 0.008311384765659067, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.117, + "step": 953 + }, + { + "epoch": 0.00832010605082765, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.1084, + "step": 954 + }, + { + "epoch": 0.008328827335996233, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.1322, + "step": 955 + }, + { + "epoch": 0.008337548621164814, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0927, + "step": 956 + }, + { + "epoch": 0.008346269906333397, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.1232, + "step": 957 + }, + { + "epoch": 0.00835499119150198, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.1052, + "step": 958 + }, + { + "epoch": 0.008363712476670563, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 959 + }, + { + "epoch": 0.008372433761839144, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.112, + "step": 960 + }, + { + "epoch": 0.008381155047007727, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.1076, + "step": 961 + }, + { + "epoch": 0.00838987633217631, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.1013, + "step": 962 + }, + { + "epoch": 0.008398597617344891, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.11, + "step": 963 + }, + { + "epoch": 0.008407318902513474, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.1051, + "step": 964 + }, + { + "epoch": 0.008416040187682057, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0935, + "step": 965 + }, + { + "epoch": 0.00842476147285064, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.1109, + "step": 966 + }, + { + "epoch": 0.008433482758019221, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.092, + "step": 967 + }, + { + "epoch": 0.008442204043187804, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.107, + "step": 968 + }, + { + "epoch": 0.008450925328356387, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0948, + "step": 969 + }, + { + "epoch": 0.00845964661352497, + "grad_norm": 0.44140625, + "learning_rate": 0.0005, + "loss": 1.135, + "step": 970 + }, + { + "epoch": 0.008468367898693551, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0965, + "step": 971 + }, + { + "epoch": 0.008477089183862134, + "grad_norm": 0.33984375, + "learning_rate": 0.0005, + "loss": 1.1372, + "step": 972 + }, + { + "epoch": 0.008485810469030717, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 973 + }, + { + "epoch": 0.008494531754199298, + "grad_norm": 0.423828125, + "learning_rate": 0.0005, + "loss": 1.1018, + "step": 974 + }, + { + "epoch": 0.008503253039367881, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.1268, + "step": 975 + }, + { + "epoch": 0.008511974324536464, + "grad_norm": 0.46875, + "learning_rate": 0.0005, + "loss": 1.1108, + "step": 976 + }, + { + "epoch": 0.008520695609705047, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.1124, + "step": 977 + }, + { + "epoch": 0.008529416894873628, + "grad_norm": 0.443359375, + "learning_rate": 0.0005, + "loss": 1.1141, + "step": 978 + }, + { + "epoch": 0.008538138180042211, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.1247, + "step": 979 + }, + { + "epoch": 0.008546859465210794, + "grad_norm": 0.390625, + "learning_rate": 0.0005, + "loss": 1.1237, + "step": 980 + }, + { + "epoch": 0.008555580750379375, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.1082, + "step": 981 + }, + { + "epoch": 0.008564302035547958, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.1089, + "step": 982 + }, + { + "epoch": 0.008573023320716541, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 983 + }, + { + "epoch": 0.008581744605885124, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.1096, + "step": 984 + }, + { + "epoch": 0.008590465891053705, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 985 + }, + { + "epoch": 0.008599187176222288, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.1092, + "step": 986 + }, + { + "epoch": 0.008607908461390871, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.1203, + "step": 987 + }, + { + "epoch": 0.008616629746559454, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.1069, + "step": 988 + }, + { + "epoch": 0.008625351031728035, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.1072, + "step": 989 + }, + { + "epoch": 0.008634072316896618, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.1053, + "step": 990 + }, + { + "epoch": 0.0086427936020652, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 991 + }, + { + "epoch": 0.008651514887233782, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.1209, + "step": 992 + }, + { + "epoch": 0.008660236172402365, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 993 + }, + { + "epoch": 0.008668957457570948, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.1074, + "step": 994 + }, + { + "epoch": 0.00867767874273953, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.1008, + "step": 995 + }, + { + "epoch": 0.008686400027908112, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0924, + "step": 996 + }, + { + "epoch": 0.008695121313076695, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.1178, + "step": 997 + }, + { + "epoch": 0.008703842598245278, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.1253, + "step": 998 + }, + { + "epoch": 0.00871256388341386, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 999 + }, + { + "epoch": 0.008721285168582442, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 1000 + }, + { + "epoch": 0.008730006453751025, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.1052, + "step": 1001 + }, + { + "epoch": 0.008738727738919608, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.1241, + "step": 1002 + }, + { + "epoch": 0.008747449024088189, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.1016, + "step": 1003 + }, + { + "epoch": 0.008756170309256772, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 1004 + }, + { + "epoch": 0.008764891594425355, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.1085, + "step": 1005 + }, + { + "epoch": 0.008773612879593938, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.1195, + "step": 1006 + }, + { + "epoch": 0.008782334164762519, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.1167, + "step": 1007 + }, + { + "epoch": 0.008791055449931102, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.092, + "step": 1008 + }, + { + "epoch": 0.008799776735099685, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.1029, + "step": 1009 + }, + { + "epoch": 0.008808498020268266, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.1248, + "step": 1010 + }, + { + "epoch": 0.008817219305436849, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.1178, + "step": 1011 + }, + { + "epoch": 0.008825940590605432, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.1022, + "step": 1012 + }, + { + "epoch": 0.008834661875774015, + "grad_norm": 0.376953125, + "learning_rate": 0.0005, + "loss": 1.1088, + "step": 1013 + }, + { + "epoch": 0.008843383160942596, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.1003, + "step": 1014 + }, + { + "epoch": 0.008852104446111179, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.1072, + "step": 1015 + }, + { + "epoch": 0.008860825731279762, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0942, + "step": 1016 + }, + { + "epoch": 0.008869547016448345, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.1115, + "step": 1017 + }, + { + "epoch": 0.008878268301616926, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.1104, + "step": 1018 + }, + { + "epoch": 0.008886989586785509, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 1019 + }, + { + "epoch": 0.008895710871954092, + "grad_norm": 0.416015625, + "learning_rate": 0.0005, + "loss": 1.1118, + "step": 1020 + }, + { + "epoch": 0.008904432157122673, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.1223, + "step": 1021 + }, + { + "epoch": 0.008913153442291256, + "grad_norm": 0.376953125, + "learning_rate": 0.0005, + "loss": 1.1029, + "step": 1022 + }, + { + "epoch": 0.008921874727459839, + "grad_norm": 0.392578125, + "learning_rate": 0.0005, + "loss": 1.107, + "step": 1023 + }, + { + "epoch": 0.008930596012628422, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.084, + "step": 1024 + }, + { + "epoch": 0.008939317297797003, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0998, + "step": 1025 + }, + { + "epoch": 0.008948038582965586, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0886, + "step": 1026 + }, + { + "epoch": 0.008956759868134169, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 1027 + }, + { + "epoch": 0.008965481153302752, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 1028 + }, + { + "epoch": 0.008974202438471333, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.1133, + "step": 1029 + }, + { + "epoch": 0.008982923723639916, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 1030 + }, + { + "epoch": 0.008991645008808499, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.127, + "step": 1031 + }, + { + "epoch": 0.00900036629397708, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.1054, + "step": 1032 + }, + { + "epoch": 0.009009087579145663, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.1059, + "step": 1033 + }, + { + "epoch": 0.009017808864314246, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0998, + "step": 1034 + }, + { + "epoch": 0.009026530149482829, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.1012, + "step": 1035 + }, + { + "epoch": 0.00903525143465141, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.096, + "step": 1036 + }, + { + "epoch": 0.009043972719819993, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.1062, + "step": 1037 + }, + { + "epoch": 0.009052694004988576, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.1104, + "step": 1038 + }, + { + "epoch": 0.009061415290157157, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.1246, + "step": 1039 + }, + { + "epoch": 0.00907013657532574, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 1040 + }, + { + "epoch": 0.009078857860494323, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 1041 + }, + { + "epoch": 0.009087579145662905, + "grad_norm": 0.37890625, + "learning_rate": 0.0005, + "loss": 1.0968, + "step": 1042 + }, + { + "epoch": 0.009096300430831487, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.1024, + "step": 1043 + }, + { + "epoch": 0.00910502171600007, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.115, + "step": 1044 + }, + { + "epoch": 0.009113743001168653, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 1045 + }, + { + "epoch": 0.009122464286337235, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.1042, + "step": 1046 + }, + { + "epoch": 0.009131185571505817, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1096, + "step": 1047 + }, + { + "epoch": 0.0091399068566744, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.1053, + "step": 1048 + }, + { + "epoch": 0.009148628141842982, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.1031, + "step": 1049 + }, + { + "epoch": 0.009157349427011564, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0966, + "step": 1050 + }, + { + "epoch": 0.009166070712180147, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0928, + "step": 1051 + }, + { + "epoch": 0.00917479199734873, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.096, + "step": 1052 + }, + { + "epoch": 0.009183513282517312, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0854, + "step": 1053 + }, + { + "epoch": 0.009192234567685894, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.1148, + "step": 1054 + }, + { + "epoch": 0.009200955852854477, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 1055 + }, + { + "epoch": 0.00920967713802306, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.1082, + "step": 1056 + }, + { + "epoch": 0.009218398423191642, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.1045, + "step": 1057 + }, + { + "epoch": 0.009227119708360224, + "grad_norm": 0.431640625, + "learning_rate": 0.0005, + "loss": 1.0921, + "step": 1058 + }, + { + "epoch": 0.009235840993528806, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.1049, + "step": 1059 + }, + { + "epoch": 0.00924456227869739, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 1060 + }, + { + "epoch": 0.00925328356386597, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.1159, + "step": 1061 + }, + { + "epoch": 0.009262004849034553, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.1103, + "step": 1062 + }, + { + "epoch": 0.009270726134203136, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.1141, + "step": 1063 + }, + { + "epoch": 0.00927944741937172, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.104, + "step": 1064 + }, + { + "epoch": 0.0092881687045403, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0962, + "step": 1065 + }, + { + "epoch": 0.009296889989708883, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1059, + "step": 1066 + }, + { + "epoch": 0.009305611274877466, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0978, + "step": 1067 + }, + { + "epoch": 0.009314332560046048, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.0999, + "step": 1068 + }, + { + "epoch": 0.00932305384521463, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.1015, + "step": 1069 + }, + { + "epoch": 0.009331775130383213, + "grad_norm": 0.58984375, + "learning_rate": 0.0005, + "loss": 1.1102, + "step": 1070 + }, + { + "epoch": 0.009340496415551796, + "grad_norm": 0.3828125, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 1071 + }, + { + "epoch": 0.009349217700720377, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0995, + "step": 1072 + }, + { + "epoch": 0.00935793898588896, + "grad_norm": 0.46484375, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 1073 + }, + { + "epoch": 0.009366660271057543, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.1292, + "step": 1074 + }, + { + "epoch": 0.009375381556226126, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.1074, + "step": 1075 + }, + { + "epoch": 0.009384102841394707, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.1166, + "step": 1076 + }, + { + "epoch": 0.00939282412656329, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.121, + "step": 1077 + }, + { + "epoch": 0.009401545411731873, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 1078 + }, + { + "epoch": 0.009410266696900454, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0927, + "step": 1079 + }, + { + "epoch": 0.009418987982069037, + "grad_norm": 0.458984375, + "learning_rate": 0.0005, + "loss": 1.119, + "step": 1080 + }, + { + "epoch": 0.00942770926723762, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.1102, + "step": 1081 + }, + { + "epoch": 0.009436430552406203, + "grad_norm": 0.4140625, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 1082 + }, + { + "epoch": 0.009445151837574784, + "grad_norm": 0.73046875, + "learning_rate": 0.0005, + "loss": 1.1014, + "step": 1083 + }, + { + "epoch": 0.009453873122743367, + "grad_norm": 0.5390625, + "learning_rate": 0.0005, + "loss": 1.1281, + "step": 1084 + }, + { + "epoch": 0.00946259440791195, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.1022, + "step": 1085 + }, + { + "epoch": 0.009471315693080533, + "grad_norm": 0.453125, + "learning_rate": 0.0005, + "loss": 1.1039, + "step": 1086 + }, + { + "epoch": 0.009480036978249114, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.1186, + "step": 1087 + }, + { + "epoch": 0.009488758263417697, + "grad_norm": 0.369140625, + "learning_rate": 0.0005, + "loss": 1.1059, + "step": 1088 + }, + { + "epoch": 0.00949747954858628, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.131, + "step": 1089 + }, + { + "epoch": 0.009506200833754861, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 1090 + }, + { + "epoch": 0.009514922118923444, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.1027, + "step": 1091 + }, + { + "epoch": 0.009523643404092027, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.1212, + "step": 1092 + }, + { + "epoch": 0.00953236468926061, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.1174, + "step": 1093 + }, + { + "epoch": 0.009541085974429191, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0992, + "step": 1094 + }, + { + "epoch": 0.009549807259597774, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0997, + "step": 1095 + }, + { + "epoch": 0.009558528544766357, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.1055, + "step": 1096 + }, + { + "epoch": 0.009567249829934938, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.1128, + "step": 1097 + }, + { + "epoch": 0.009575971115103521, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.1058, + "step": 1098 + }, + { + "epoch": 0.009584692400272104, + "grad_norm": 0.44921875, + "learning_rate": 0.0005, + "loss": 1.1081, + "step": 1099 + }, + { + "epoch": 0.009593413685440687, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.1132, + "step": 1100 + }, + { + "epoch": 0.009602134970609268, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0937, + "step": 1101 + }, + { + "epoch": 0.009610856255777851, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.1149, + "step": 1102 + }, + { + "epoch": 0.009619577540946434, + "grad_norm": 0.48046875, + "learning_rate": 0.0005, + "loss": 1.1221, + "step": 1103 + }, + { + "epoch": 0.009628298826115017, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0948, + "step": 1104 + }, + { + "epoch": 0.009637020111283598, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.1036, + "step": 1105 + }, + { + "epoch": 0.009645741396452181, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.1073, + "step": 1106 + }, + { + "epoch": 0.009654462681620764, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.1065, + "step": 1107 + }, + { + "epoch": 0.009663183966789345, + "grad_norm": 0.5, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 1108 + }, + { + "epoch": 0.009671905251957928, + "grad_norm": 0.38671875, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 1109 + }, + { + "epoch": 0.009680626537126511, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.1, + "step": 1110 + }, + { + "epoch": 0.009689347822295094, + "grad_norm": 0.83203125, + "learning_rate": 0.0005, + "loss": 1.1039, + "step": 1111 + }, + { + "epoch": 0.009698069107463675, + "grad_norm": 0.65625, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 1112 + }, + { + "epoch": 0.009706790392632258, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.1339, + "step": 1113 + }, + { + "epoch": 0.009715511677800841, + "grad_norm": 0.8984375, + "learning_rate": 0.0005, + "loss": 1.1119, + "step": 1114 + }, + { + "epoch": 0.009724232962969424, + "grad_norm": 0.71875, + "learning_rate": 0.0005, + "loss": 1.1323, + "step": 1115 + }, + { + "epoch": 0.009732954248138005, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.1007, + "step": 1116 + }, + { + "epoch": 0.009741675533306588, + "grad_norm": 0.71875, + "learning_rate": 0.0005, + "loss": 1.1136, + "step": 1117 + }, + { + "epoch": 0.009750396818475171, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.1078, + "step": 1118 + }, + { + "epoch": 0.009759118103643752, + "grad_norm": 0.486328125, + "learning_rate": 0.0005, + "loss": 1.0935, + "step": 1119 + }, + { + "epoch": 0.009767839388812335, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0911, + "step": 1120 + }, + { + "epoch": 0.009776560673980918, + "grad_norm": 0.455078125, + "learning_rate": 0.0005, + "loss": 1.1013, + "step": 1121 + }, + { + "epoch": 0.009785281959149501, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.1139, + "step": 1122 + }, + { + "epoch": 0.009794003244318082, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.0965, + "step": 1123 + }, + { + "epoch": 0.009802724529486665, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.1069, + "step": 1124 + }, + { + "epoch": 0.009811445814655248, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0935, + "step": 1125 + }, + { + "epoch": 0.00982016709982383, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.112, + "step": 1126 + }, + { + "epoch": 0.009828888384992412, + "grad_norm": 0.55078125, + "learning_rate": 0.0005, + "loss": 1.1332, + "step": 1127 + }, + { + "epoch": 0.009837609670160995, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.1246, + "step": 1128 + }, + { + "epoch": 0.009846330955329578, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.1231, + "step": 1129 + }, + { + "epoch": 0.00985505224049816, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.098, + "step": 1130 + }, + { + "epoch": 0.009863773525666742, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.088, + "step": 1131 + }, + { + "epoch": 0.009872494810835325, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.1014, + "step": 1132 + }, + { + "epoch": 0.009881216096003908, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.1085, + "step": 1133 + }, + { + "epoch": 0.009889937381172489, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.1056, + "step": 1134 + }, + { + "epoch": 0.009898658666341072, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.1079, + "step": 1135 + }, + { + "epoch": 0.009907379951509655, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.1047, + "step": 1136 + }, + { + "epoch": 0.009916101236678236, + "grad_norm": 0.49609375, + "learning_rate": 0.0005, + "loss": 1.1067, + "step": 1137 + }, + { + "epoch": 0.009924822521846819, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.1005, + "step": 1138 + }, + { + "epoch": 0.009933543807015402, + "grad_norm": 0.396484375, + "learning_rate": 0.0005, + "loss": 1.0999, + "step": 1139 + }, + { + "epoch": 0.009942265092183985, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0924, + "step": 1140 + }, + { + "epoch": 0.009950986377352566, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.1101, + "step": 1141 + }, + { + "epoch": 0.009959707662521149, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 1142 + }, + { + "epoch": 0.009968428947689732, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.096, + "step": 1143 + }, + { + "epoch": 0.009977150232858315, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.1008, + "step": 1144 + }, + { + "epoch": 0.009985871518026896, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.1098, + "step": 1145 + }, + { + "epoch": 0.009994592803195479, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.1077, + "step": 1146 + }, + { + "epoch": 0.010003314088364062, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.1133, + "step": 1147 + }, + { + "epoch": 0.010012035373532643, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.1323, + "step": 1148 + }, + { + "epoch": 0.010020756658701226, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 1149 + }, + { + "epoch": 0.010029477943869809, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 1150 + }, + { + "epoch": 0.010038199229038392, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.1106, + "step": 1151 + }, + { + "epoch": 0.010046920514206973, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.104, + "step": 1152 + }, + { + "epoch": 0.010055641799375556, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 1153 + }, + { + "epoch": 0.010064363084544139, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.1092, + "step": 1154 + }, + { + "epoch": 0.01007308436971272, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 1155 + }, + { + "epoch": 0.010081805654881303, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.1065, + "step": 1156 + }, + { + "epoch": 0.010090526940049886, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.1056, + "step": 1157 + }, + { + "epoch": 0.010099248225218469, + "grad_norm": 0.47265625, + "learning_rate": 0.0005, + "loss": 1.1057, + "step": 1158 + }, + { + "epoch": 0.01010796951038705, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.1122, + "step": 1159 + }, + { + "epoch": 0.010116690795555633, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 1160 + }, + { + "epoch": 0.010125412080724216, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.1063, + "step": 1161 + }, + { + "epoch": 0.010134133365892799, + "grad_norm": 0.439453125, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 1162 + }, + { + "epoch": 0.01014285465106138, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0956, + "step": 1163 + }, + { + "epoch": 0.010151575936229963, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.1091, + "step": 1164 + }, + { + "epoch": 0.010160297221398546, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0918, + "step": 1165 + }, + { + "epoch": 0.010169018506567127, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0992, + "step": 1166 + }, + { + "epoch": 0.01017773979173571, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.1019, + "step": 1167 + }, + { + "epoch": 0.010186461076904293, + "grad_norm": 0.39453125, + "learning_rate": 0.0005, + "loss": 1.0956, + "step": 1168 + }, + { + "epoch": 0.010195182362072876, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.122, + "step": 1169 + }, + { + "epoch": 0.010203903647241457, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.1114, + "step": 1170 + }, + { + "epoch": 0.01021262493241004, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1014, + "step": 1171 + }, + { + "epoch": 0.010221346217578623, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 1172 + }, + { + "epoch": 0.010230067502747206, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.1028, + "step": 1173 + }, + { + "epoch": 0.010238788787915787, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.1167, + "step": 1174 + }, + { + "epoch": 0.01024751007308437, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0922, + "step": 1175 + }, + { + "epoch": 0.010256231358252953, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 1176 + }, + { + "epoch": 0.010264952643421534, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0988, + "step": 1177 + }, + { + "epoch": 0.010273673928590117, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.1162, + "step": 1178 + }, + { + "epoch": 0.0102823952137587, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 1179 + }, + { + "epoch": 0.010291116498927283, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.1213, + "step": 1180 + }, + { + "epoch": 0.010299837784095864, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 1181 + }, + { + "epoch": 0.010308559069264447, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.1089, + "step": 1182 + }, + { + "epoch": 0.01031728035443303, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.102, + "step": 1183 + }, + { + "epoch": 0.01032600163960161, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.1008, + "step": 1184 + }, + { + "epoch": 0.010334722924770194, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.1084, + "step": 1185 + }, + { + "epoch": 0.010343444209938777, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.106, + "step": 1186 + }, + { + "epoch": 0.01035216549510736, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.1052, + "step": 1187 + }, + { + "epoch": 0.01036088678027594, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.1033, + "step": 1188 + }, + { + "epoch": 0.010369608065444524, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.1063, + "step": 1189 + }, + { + "epoch": 0.010378329350613107, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.14, + "step": 1190 + }, + { + "epoch": 0.01038705063578169, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 1191 + }, + { + "epoch": 0.01039577192095027, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.1093, + "step": 1192 + }, + { + "epoch": 0.010404493206118854, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 1193 + }, + { + "epoch": 0.010413214491287437, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.1036, + "step": 1194 + }, + { + "epoch": 0.010421935776456018, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.1093, + "step": 1195 + }, + { + "epoch": 0.0104306570616246, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 1196 + }, + { + "epoch": 0.010439378346793184, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 1197 + }, + { + "epoch": 0.010448099631961767, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0888, + "step": 1198 + }, + { + "epoch": 0.010456820917130348, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0859, + "step": 1199 + }, + { + "epoch": 0.01046554220229893, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0949, + "step": 1200 + }, + { + "epoch": 0.010474263487467514, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.1027, + "step": 1201 + }, + { + "epoch": 0.010482984772636096, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.1155, + "step": 1202 + }, + { + "epoch": 0.010491706057804678, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0914, + "step": 1203 + }, + { + "epoch": 0.01050042734297326, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 1204 + }, + { + "epoch": 0.010509148628141844, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.112, + "step": 1205 + }, + { + "epoch": 0.010517869913310425, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.1047, + "step": 1206 + }, + { + "epoch": 0.010526591198479008, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0953, + "step": 1207 + }, + { + "epoch": 0.01053531248364759, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 1208 + }, + { + "epoch": 0.010544033768816173, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.117, + "step": 1209 + }, + { + "epoch": 0.010552755053984755, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.1152, + "step": 1210 + }, + { + "epoch": 0.010561476339153338, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.1053, + "step": 1211 + }, + { + "epoch": 0.01057019762432192, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 1212 + }, + { + "epoch": 0.010578918909490502, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 1213 + }, + { + "epoch": 0.010587640194659085, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.1036, + "step": 1214 + }, + { + "epoch": 0.010596361479827668, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0914, + "step": 1215 + }, + { + "epoch": 0.01060508276499625, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.138, + "step": 1216 + }, + { + "epoch": 0.010613804050164832, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0966, + "step": 1217 + }, + { + "epoch": 0.010622525335333415, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.1027, + "step": 1218 + }, + { + "epoch": 0.010631246620501997, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.1084, + "step": 1219 + }, + { + "epoch": 0.01063996790567058, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.1054, + "step": 1220 + }, + { + "epoch": 0.010648689190839162, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.1006, + "step": 1221 + }, + { + "epoch": 0.010657410476007744, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.1057, + "step": 1222 + }, + { + "epoch": 0.010666131761176327, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 1223 + }, + { + "epoch": 0.010674853046344909, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0921, + "step": 1224 + }, + { + "epoch": 0.010683574331513492, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 1225 + }, + { + "epoch": 0.010692295616682074, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0881, + "step": 1226 + }, + { + "epoch": 0.010701016901850657, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.098, + "step": 1227 + }, + { + "epoch": 0.010709738187019239, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0806, + "step": 1228 + }, + { + "epoch": 0.010718459472187821, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 1229 + }, + { + "epoch": 0.010727180757356404, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.1153, + "step": 1230 + }, + { + "epoch": 0.010735902042524987, + "grad_norm": 0.388671875, + "learning_rate": 0.0005, + "loss": 1.0959, + "step": 1231 + }, + { + "epoch": 0.010744623327693568, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.1163, + "step": 1232 + }, + { + "epoch": 0.010753344612862151, + "grad_norm": 0.490234375, + "learning_rate": 0.0005, + "loss": 1.1206, + "step": 1233 + }, + { + "epoch": 0.010762065898030734, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.1051, + "step": 1234 + }, + { + "epoch": 0.010770787183199316, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 1235 + }, + { + "epoch": 0.010779508468367898, + "grad_norm": 0.3828125, + "learning_rate": 0.0005, + "loss": 1.1051, + "step": 1236 + }, + { + "epoch": 0.010788229753536481, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.1195, + "step": 1237 + }, + { + "epoch": 0.010796951038705064, + "grad_norm": 0.361328125, + "learning_rate": 0.0005, + "loss": 1.1002, + "step": 1238 + }, + { + "epoch": 0.010805672323873645, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0994, + "step": 1239 + }, + { + "epoch": 0.010814393609042228, + "grad_norm": 0.435546875, + "learning_rate": 0.0005, + "loss": 1.1129, + "step": 1240 + }, + { + "epoch": 0.010823114894210811, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0919, + "step": 1241 + }, + { + "epoch": 0.010831836179379392, + "grad_norm": 0.357421875, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 1242 + }, + { + "epoch": 0.010840557464547975, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.1011, + "step": 1243 + }, + { + "epoch": 0.010849278749716558, + "grad_norm": 0.375, + "learning_rate": 0.0005, + "loss": 1.0975, + "step": 1244 + }, + { + "epoch": 0.010858000034885141, + "grad_norm": 0.60546875, + "learning_rate": 0.0005, + "loss": 1.1113, + "step": 1245 + }, + { + "epoch": 0.010866721320053722, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0937, + "step": 1246 + }, + { + "epoch": 0.010875442605222305, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0866, + "step": 1247 + }, + { + "epoch": 0.010884163890390888, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 1248 + }, + { + "epoch": 0.010892885175559471, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0932, + "step": 1249 + }, + { + "epoch": 0.010901606460728052, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.1179, + "step": 1250 + }, + { + "epoch": 0.010910327745896635, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 1251 + }, + { + "epoch": 0.010919049031065218, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.1124, + "step": 1252 + }, + { + "epoch": 0.0109277703162338, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 1253 + }, + { + "epoch": 0.010936491601402382, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.1042, + "step": 1254 + }, + { + "epoch": 0.010945212886570965, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.1098, + "step": 1255 + }, + { + "epoch": 0.010953934171739548, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 1256 + }, + { + "epoch": 0.01096265545690813, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0826, + "step": 1257 + }, + { + "epoch": 0.010971376742076712, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.1107, + "step": 1258 + }, + { + "epoch": 0.010980098027245295, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.1004, + "step": 1259 + }, + { + "epoch": 0.010988819312413878, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.1079, + "step": 1260 + }, + { + "epoch": 0.01099754059758246, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.1028, + "step": 1261 + }, + { + "epoch": 0.011006261882751042, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.1055, + "step": 1262 + }, + { + "epoch": 0.011014983167919625, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.1064, + "step": 1263 + }, + { + "epoch": 0.011023704453088206, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.095, + "step": 1264 + }, + { + "epoch": 0.01103242573825679, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.1113, + "step": 1265 + }, + { + "epoch": 0.011041147023425372, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.1115, + "step": 1266 + }, + { + "epoch": 0.011049868308593955, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 1267 + }, + { + "epoch": 0.011058589593762536, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 1268 + }, + { + "epoch": 0.01106731087893112, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.1258, + "step": 1269 + }, + { + "epoch": 0.011076032164099702, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0917, + "step": 1270 + }, + { + "epoch": 0.011084753449268285, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.1192, + "step": 1271 + }, + { + "epoch": 0.011093474734436866, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.1125, + "step": 1272 + }, + { + "epoch": 0.01110219601960545, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.1033, + "step": 1273 + }, + { + "epoch": 0.011110917304774032, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.1091, + "step": 1274 + }, + { + "epoch": 0.011119638589942613, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0912, + "step": 1275 + }, + { + "epoch": 0.011128359875111196, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 1276 + }, + { + "epoch": 0.011137081160279779, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0924, + "step": 1277 + }, + { + "epoch": 0.011145802445448362, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.1112, + "step": 1278 + }, + { + "epoch": 0.011154523730616943, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0904, + "step": 1279 + }, + { + "epoch": 0.011163245015785526, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.1005, + "step": 1280 + }, + { + "epoch": 0.011171966300954109, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.1069, + "step": 1281 + }, + { + "epoch": 0.01118068758612269, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.1074, + "step": 1282 + }, + { + "epoch": 0.011189408871291273, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.1038, + "step": 1283 + }, + { + "epoch": 0.011198130156459856, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.1152, + "step": 1284 + }, + { + "epoch": 0.011206851441628439, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0972, + "step": 1285 + }, + { + "epoch": 0.01121557272679702, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.1142, + "step": 1286 + }, + { + "epoch": 0.011224294011965603, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0999, + "step": 1287 + }, + { + "epoch": 0.011233015297134186, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 1288 + }, + { + "epoch": 0.011241736582302769, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.1111, + "step": 1289 + }, + { + "epoch": 0.01125045786747135, + "grad_norm": 0.37890625, + "learning_rate": 0.0005, + "loss": 1.1161, + "step": 1290 + }, + { + "epoch": 0.011259179152639933, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0989, + "step": 1291 + }, + { + "epoch": 0.011267900437808516, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 1.0973, + "step": 1292 + }, + { + "epoch": 0.011276621722977097, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.1095, + "step": 1293 + }, + { + "epoch": 0.01128534300814568, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.1026, + "step": 1294 + }, + { + "epoch": 0.011294064293314263, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0953, + "step": 1295 + }, + { + "epoch": 0.011302785578482846, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1009, + "step": 1296 + }, + { + "epoch": 0.011311506863651427, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.083, + "step": 1297 + }, + { + "epoch": 0.01132022814882001, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0993, + "step": 1298 + }, + { + "epoch": 0.011328949433988593, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 1299 + }, + { + "epoch": 0.011337670719157176, + "grad_norm": 0.369140625, + "learning_rate": 0.0005, + "loss": 1.1064, + "step": 1300 + }, + { + "epoch": 0.011346392004325757, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1025, + "step": 1301 + }, + { + "epoch": 0.01135511328949434, + "grad_norm": 0.376953125, + "learning_rate": 0.0005, + "loss": 1.1247, + "step": 1302 + }, + { + "epoch": 0.011363834574662923, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 1303 + }, + { + "epoch": 0.011372555859831504, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 1304 + }, + { + "epoch": 0.011381277145000087, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 1305 + }, + { + "epoch": 0.01138999843016867, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 1306 + }, + { + "epoch": 0.011398719715337253, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 1307 + }, + { + "epoch": 0.011407441000505834, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.1138, + "step": 1308 + }, + { + "epoch": 0.011416162285674417, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.1016, + "step": 1309 + }, + { + "epoch": 0.011424883570843, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.1139, + "step": 1310 + }, + { + "epoch": 0.011433604856011581, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 1311 + }, + { + "epoch": 0.011442326141180164, + "grad_norm": 0.421875, + "learning_rate": 0.0005, + "loss": 1.0932, + "step": 1312 + }, + { + "epoch": 0.011451047426348747, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0949, + "step": 1313 + }, + { + "epoch": 0.01145976871151733, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.111, + "step": 1314 + }, + { + "epoch": 0.011468489996685911, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 1315 + }, + { + "epoch": 0.011477211281854494, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0875, + "step": 1316 + }, + { + "epoch": 0.011485932567023077, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0939, + "step": 1317 + }, + { + "epoch": 0.01149465385219166, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.1103, + "step": 1318 + }, + { + "epoch": 0.011503375137360241, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.097, + "step": 1319 + }, + { + "epoch": 0.011512096422528824, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.1018, + "step": 1320 + }, + { + "epoch": 0.011520817707697407, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0995, + "step": 1321 + }, + { + "epoch": 0.011529538992865988, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.1018, + "step": 1322 + }, + { + "epoch": 0.011538260278034571, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0915, + "step": 1323 + }, + { + "epoch": 0.011546981563203154, + "grad_norm": 0.384765625, + "learning_rate": 0.0005, + "loss": 1.1049, + "step": 1324 + }, + { + "epoch": 0.011555702848371737, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0983, + "step": 1325 + }, + { + "epoch": 0.011564424133540318, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.101, + "step": 1326 + }, + { + "epoch": 0.0115731454187089, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0911, + "step": 1327 + }, + { + "epoch": 0.011581866703877484, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 1328 + }, + { + "epoch": 0.011590587989046067, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0967, + "step": 1329 + }, + { + "epoch": 0.011599309274214648, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 1330 + }, + { + "epoch": 0.01160803055938323, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.1237, + "step": 1331 + }, + { + "epoch": 0.011616751844551814, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.1018, + "step": 1332 + }, + { + "epoch": 0.011625473129720395, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.111, + "step": 1333 + }, + { + "epoch": 0.011634194414888978, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.102, + "step": 1334 + }, + { + "epoch": 0.01164291570005756, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0859, + "step": 1335 + }, + { + "epoch": 0.011651636985226144, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0938, + "step": 1336 + }, + { + "epoch": 0.011660358270394725, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.1087, + "step": 1337 + }, + { + "epoch": 0.011669079555563308, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 1338 + }, + { + "epoch": 0.01167780084073189, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.1105, + "step": 1339 + }, + { + "epoch": 0.011686522125900472, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.1137, + "step": 1340 + }, + { + "epoch": 0.011695243411069055, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0888, + "step": 1341 + }, + { + "epoch": 0.011703964696237638, + "grad_norm": 0.4609375, + "learning_rate": 0.0005, + "loss": 1.1015, + "step": 1342 + }, + { + "epoch": 0.01171268598140622, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 1343 + }, + { + "epoch": 0.011721407266574802, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0989, + "step": 1344 + }, + { + "epoch": 0.011730128551743385, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0901, + "step": 1345 + }, + { + "epoch": 0.011738849836911968, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.091, + "step": 1346 + }, + { + "epoch": 0.01174757112208055, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0989, + "step": 1347 + }, + { + "epoch": 0.011756292407249132, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.1107, + "step": 1348 + }, + { + "epoch": 0.011765013692417715, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 1349 + }, + { + "epoch": 0.011773734977586298, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 1350 + }, + { + "epoch": 0.011782456262754879, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0966, + "step": 1351 + }, + { + "epoch": 0.011791177547923462, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.1094, + "step": 1352 + }, + { + "epoch": 0.011799898833092045, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.1057, + "step": 1353 + }, + { + "epoch": 0.011808620118260628, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 1354 + }, + { + "epoch": 0.011817341403429209, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.1049, + "step": 1355 + }, + { + "epoch": 0.011826062688597792, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 1356 + }, + { + "epoch": 0.011834783973766375, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0995, + "step": 1357 + }, + { + "epoch": 0.011843505258934958, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0991, + "step": 1358 + }, + { + "epoch": 0.011852226544103539, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 1359 + }, + { + "epoch": 0.011860947829272122, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0995, + "step": 1360 + }, + { + "epoch": 0.011869669114440705, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.1181, + "step": 1361 + }, + { + "epoch": 0.011878390399609286, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.1058, + "step": 1362 + }, + { + "epoch": 0.011887111684777869, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.1062, + "step": 1363 + }, + { + "epoch": 0.011895832969946452, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.1015, + "step": 1364 + }, + { + "epoch": 0.011904554255115034, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.1, + "step": 1365 + }, + { + "epoch": 0.011913275540283616, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.107, + "step": 1366 + }, + { + "epoch": 0.011921996825452199, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0983, + "step": 1367 + }, + { + "epoch": 0.011930718110620782, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 1.083, + "step": 1368 + }, + { + "epoch": 0.011939439395789363, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 1369 + }, + { + "epoch": 0.011948160680957946, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0999, + "step": 1370 + }, + { + "epoch": 0.011956881966126529, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0853, + "step": 1371 + }, + { + "epoch": 0.011965603251295111, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 1372 + }, + { + "epoch": 0.011974324536463693, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.1225, + "step": 1373 + }, + { + "epoch": 0.011983045821632276, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0912, + "step": 1374 + }, + { + "epoch": 0.011991767106800858, + "grad_norm": 0.400390625, + "learning_rate": 0.0005, + "loss": 1.1148, + "step": 1375 + }, + { + "epoch": 0.012000488391969441, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.1022, + "step": 1376 + }, + { + "epoch": 0.012009209677138023, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.1002, + "step": 1377 + }, + { + "epoch": 0.012017930962306606, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.1231, + "step": 1378 + }, + { + "epoch": 0.012026652247475188, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.1241, + "step": 1379 + }, + { + "epoch": 0.01203537353264377, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.1019, + "step": 1380 + }, + { + "epoch": 0.012044094817812353, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0911, + "step": 1381 + }, + { + "epoch": 0.012052816102980935, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.1021, + "step": 1382 + }, + { + "epoch": 0.012061537388149518, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.1029, + "step": 1383 + }, + { + "epoch": 0.0120702586733181, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 1384 + }, + { + "epoch": 0.012078979958486682, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.1009, + "step": 1385 + }, + { + "epoch": 0.012087701243655265, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.1092, + "step": 1386 + }, + { + "epoch": 0.012096422528823848, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 1387 + }, + { + "epoch": 0.01210514381399243, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0937, + "step": 1388 + }, + { + "epoch": 0.012113865099161012, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 1389 + }, + { + "epoch": 0.012122586384329595, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0987, + "step": 1390 + }, + { + "epoch": 0.012131307669498177, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0983, + "step": 1391 + }, + { + "epoch": 0.01214002895466676, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.1051, + "step": 1392 + }, + { + "epoch": 0.012148750239835342, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0985, + "step": 1393 + }, + { + "epoch": 0.012157471525003925, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 1394 + }, + { + "epoch": 0.012166192810172506, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 1395 + }, + { + "epoch": 0.01217491409534109, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 1396 + }, + { + "epoch": 0.012183635380509672, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.09, + "step": 1397 + }, + { + "epoch": 0.012192356665678254, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 1398 + }, + { + "epoch": 0.012201077950846836, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.1108, + "step": 1399 + }, + { + "epoch": 0.01220979923601542, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.1038, + "step": 1400 + }, + { + "epoch": 0.012218520521184002, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.1029, + "step": 1401 + }, + { + "epoch": 0.012227241806352583, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0873, + "step": 1402 + }, + { + "epoch": 0.012235963091521166, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.1058, + "step": 1403 + }, + { + "epoch": 0.01224468437668975, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0913, + "step": 1404 + }, + { + "epoch": 0.012253405661858332, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 1405 + }, + { + "epoch": 0.012262126947026913, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.1028, + "step": 1406 + }, + { + "epoch": 0.012270848232195496, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 1407 + }, + { + "epoch": 0.01227956951736408, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.1177, + "step": 1408 + }, + { + "epoch": 0.01228829080253266, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0933, + "step": 1409 + }, + { + "epoch": 0.012297012087701243, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.1103, + "step": 1410 + }, + { + "epoch": 0.012305733372869826, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.1015, + "step": 1411 + }, + { + "epoch": 0.01231445465803841, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 1412 + }, + { + "epoch": 0.01232317594320699, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.1016, + "step": 1413 + }, + { + "epoch": 0.012331897228375573, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 1414 + }, + { + "epoch": 0.012340618513544156, + "grad_norm": 0.40625, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 1415 + }, + { + "epoch": 0.01234933979871274, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.11, + "step": 1416 + }, + { + "epoch": 0.01235806108388132, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.096, + "step": 1417 + }, + { + "epoch": 0.012366782369049903, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.12, + "step": 1418 + }, + { + "epoch": 0.012375503654218486, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.098, + "step": 1419 + }, + { + "epoch": 0.012384224939387067, + "grad_norm": 0.32421875, + "learning_rate": 0.0005, + "loss": 1.0908, + "step": 1420 + }, + { + "epoch": 0.01239294622455565, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.084, + "step": 1421 + }, + { + "epoch": 0.012401667509724233, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0938, + "step": 1422 + }, + { + "epoch": 0.012410388794892816, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0987, + "step": 1423 + }, + { + "epoch": 0.012419110080061397, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0929, + "step": 1424 + }, + { + "epoch": 0.01242783136522998, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 1425 + }, + { + "epoch": 0.012436552650398563, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.1003, + "step": 1426 + }, + { + "epoch": 0.012445273935567144, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 1427 + }, + { + "epoch": 0.012453995220735727, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.1054, + "step": 1428 + }, + { + "epoch": 0.01246271650590431, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.1095, + "step": 1429 + }, + { + "epoch": 0.012471437791072893, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 1430 + }, + { + "epoch": 0.012480159076241474, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.097, + "step": 1431 + }, + { + "epoch": 0.012488880361410057, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0904, + "step": 1432 + }, + { + "epoch": 0.01249760164657864, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.1056, + "step": 1433 + }, + { + "epoch": 0.012506322931747223, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0742, + "step": 1434 + }, + { + "epoch": 0.012515044216915804, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0855, + "step": 1435 + }, + { + "epoch": 0.012523765502084387, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 1436 + }, + { + "epoch": 0.01253248678725297, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0967, + "step": 1437 + }, + { + "epoch": 0.012541208072421551, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.082, + "step": 1438 + }, + { + "epoch": 0.012549929357590134, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0958, + "step": 1439 + }, + { + "epoch": 0.012558650642758717, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.1007, + "step": 1440 + }, + { + "epoch": 0.0125673719279273, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.1043, + "step": 1441 + }, + { + "epoch": 0.012576093213095881, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0923, + "step": 1442 + }, + { + "epoch": 0.012584814498264464, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.1047, + "step": 1443 + }, + { + "epoch": 0.012593535783433047, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 1444 + }, + { + "epoch": 0.01260225706860163, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 1445 + }, + { + "epoch": 0.012610978353770211, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 1446 + }, + { + "epoch": 0.012619699638938794, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0904, + "step": 1447 + }, + { + "epoch": 0.012628420924107377, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 1448 + }, + { + "epoch": 0.012637142209275958, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.088, + "step": 1449 + }, + { + "epoch": 0.012645863494444541, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.1032, + "step": 1450 + }, + { + "epoch": 0.012654584779613124, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 1451 + }, + { + "epoch": 0.012663306064781707, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0975, + "step": 1452 + }, + { + "epoch": 0.012672027349950288, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 1453 + }, + { + "epoch": 0.012680748635118871, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0935, + "step": 1454 + }, + { + "epoch": 0.012689469920287454, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.099, + "step": 1455 + }, + { + "epoch": 0.012698191205456035, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 1456 + }, + { + "epoch": 0.012706912490624618, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.1044, + "step": 1457 + }, + { + "epoch": 0.012715633775793201, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0992, + "step": 1458 + }, + { + "epoch": 0.012724355060961784, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.11, + "step": 1459 + }, + { + "epoch": 0.012733076346130365, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 1460 + }, + { + "epoch": 0.012741797631298948, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0885, + "step": 1461 + }, + { + "epoch": 0.012750518916467531, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.1193, + "step": 1462 + }, + { + "epoch": 0.012759240201636114, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 1463 + }, + { + "epoch": 0.012767961486804695, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.1033, + "step": 1464 + }, + { + "epoch": 0.012776682771973278, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.1062, + "step": 1465 + }, + { + "epoch": 0.012785404057141861, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0947, + "step": 1466 + }, + { + "epoch": 0.012794125342310442, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.1145, + "step": 1467 + }, + { + "epoch": 0.012802846627479025, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.093, + "step": 1468 + }, + { + "epoch": 0.012811567912647608, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 1469 + }, + { + "epoch": 0.01282028919781619, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 1470 + }, + { + "epoch": 0.012829010482984772, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.1032, + "step": 1471 + }, + { + "epoch": 0.012837731768153355, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0904, + "step": 1472 + }, + { + "epoch": 0.012846453053321938, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 1473 + }, + { + "epoch": 0.01285517433849052, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0995, + "step": 1474 + }, + { + "epoch": 0.012863895623659102, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 1475 + }, + { + "epoch": 0.012872616908827685, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.1021, + "step": 1476 + }, + { + "epoch": 0.012881338193996268, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0839, + "step": 1477 + }, + { + "epoch": 0.012890059479164849, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 1478 + }, + { + "epoch": 0.012898780764333432, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 1479 + }, + { + "epoch": 0.012907502049502015, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0978, + "step": 1480 + }, + { + "epoch": 0.012916223334670598, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.0948, + "step": 1481 + }, + { + "epoch": 0.012924944619839179, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.1076, + "step": 1482 + }, + { + "epoch": 0.012933665905007762, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0926, + "step": 1483 + }, + { + "epoch": 0.012942387190176345, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0799, + "step": 1484 + }, + { + "epoch": 0.012951108475344926, + "grad_norm": 0.462890625, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 1485 + }, + { + "epoch": 0.012959829760513509, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 1486 + }, + { + "epoch": 0.012968551045682092, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.1055, + "step": 1487 + }, + { + "epoch": 0.012977272330850675, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.1049, + "step": 1488 + }, + { + "epoch": 0.012985993616019256, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.1155, + "step": 1489 + }, + { + "epoch": 0.012994714901187839, + "grad_norm": 0.396484375, + "learning_rate": 0.0005, + "loss": 1.0836, + "step": 1490 + }, + { + "epoch": 0.013003436186356422, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.1004, + "step": 1491 + }, + { + "epoch": 0.013012157471525005, + "grad_norm": 0.546875, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 1492 + }, + { + "epoch": 0.013020878756693586, + "grad_norm": 0.37109375, + "learning_rate": 0.0005, + "loss": 1.0917, + "step": 1493 + }, + { + "epoch": 0.013029600041862169, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 1494 + }, + { + "epoch": 0.013038321327030752, + "grad_norm": 0.56640625, + "learning_rate": 0.0005, + "loss": 1.0911, + "step": 1495 + }, + { + "epoch": 0.013047042612199333, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0912, + "step": 1496 + }, + { + "epoch": 0.013055763897367916, + "grad_norm": 0.5625, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 1497 + }, + { + "epoch": 0.013064485182536499, + "grad_norm": 0.51171875, + "learning_rate": 0.0005, + "loss": 1.1173, + "step": 1498 + }, + { + "epoch": 0.013073206467705082, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 1499 + }, + { + "epoch": 0.013081927752873663, + "grad_norm": 0.5625, + "learning_rate": 0.0005, + "loss": 1.1038, + "step": 1500 + }, + { + "epoch": 0.013090649038042246, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 1501 + }, + { + "epoch": 0.013099370323210829, + "grad_norm": 0.5625, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 1502 + }, + { + "epoch": 0.013108091608379412, + "grad_norm": 0.416015625, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 1503 + }, + { + "epoch": 0.013116812893547993, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 1504 + }, + { + "epoch": 0.013125534178716576, + "grad_norm": 0.5234375, + "learning_rate": 0.0005, + "loss": 1.0965, + "step": 1505 + }, + { + "epoch": 0.013134255463885159, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0861, + "step": 1506 + }, + { + "epoch": 0.01314297674905374, + "grad_norm": 0.421875, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 1507 + }, + { + "epoch": 0.013151698034222323, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 1508 + }, + { + "epoch": 0.013160419319390906, + "grad_norm": 0.482421875, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 1509 + }, + { + "epoch": 0.013169140604559489, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.1097, + "step": 1510 + }, + { + "epoch": 0.01317786188972807, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0904, + "step": 1511 + }, + { + "epoch": 0.013186583174896653, + "grad_norm": 0.474609375, + "learning_rate": 0.0005, + "loss": 1.1032, + "step": 1512 + }, + { + "epoch": 0.013195304460065236, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 1513 + }, + { + "epoch": 0.013204025745233817, + "grad_norm": 0.44140625, + "learning_rate": 0.0005, + "loss": 1.0891, + "step": 1514 + }, + { + "epoch": 0.0132127470304024, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0958, + "step": 1515 + }, + { + "epoch": 0.013221468315570983, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.1047, + "step": 1516 + }, + { + "epoch": 0.013230189600739566, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0806, + "step": 1517 + }, + { + "epoch": 0.013238910885908147, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 1518 + }, + { + "epoch": 0.01324763217107673, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0987, + "step": 1519 + }, + { + "epoch": 0.013256353456245313, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 1520 + }, + { + "epoch": 0.013265074741413896, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0921, + "step": 1521 + }, + { + "epoch": 0.013273796026582477, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 1522 + }, + { + "epoch": 0.01328251731175106, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.1204, + "step": 1523 + }, + { + "epoch": 0.013291238596919643, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 1524 + }, + { + "epoch": 0.013299959882088224, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 1525 + }, + { + "epoch": 0.013308681167256807, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.111, + "step": 1526 + }, + { + "epoch": 0.01331740245242539, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0966, + "step": 1527 + }, + { + "epoch": 0.013326123737593972, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0932, + "step": 1528 + }, + { + "epoch": 0.013334845022762554, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.106, + "step": 1529 + }, + { + "epoch": 0.013343566307931137, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 1530 + }, + { + "epoch": 0.01335228759309972, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.1243, + "step": 1531 + }, + { + "epoch": 0.013361008878268302, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.1033, + "step": 1532 + }, + { + "epoch": 0.013369730163436884, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 1533 + }, + { + "epoch": 0.013378451448605467, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 1534 + }, + { + "epoch": 0.01338717273377405, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0909, + "step": 1535 + }, + { + "epoch": 0.01339589401894263, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 1536 + }, + { + "epoch": 0.013404615304111214, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 1537 + }, + { + "epoch": 0.013413336589279796, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.1028, + "step": 1538 + }, + { + "epoch": 0.01342205787444838, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 1539 + }, + { + "epoch": 0.01343077915961696, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0839, + "step": 1540 + }, + { + "epoch": 0.013439500444785544, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 1541 + }, + { + "epoch": 0.013448221729954126, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 1542 + }, + { + "epoch": 0.013456943015122708, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.1001, + "step": 1543 + }, + { + "epoch": 0.01346566430029129, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.1076, + "step": 1544 + }, + { + "epoch": 0.013474385585459873, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0798, + "step": 1545 + }, + { + "epoch": 0.013483106870628456, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0964, + "step": 1546 + }, + { + "epoch": 0.013491828155797038, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 1547 + }, + { + "epoch": 0.01350054944096562, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 1548 + }, + { + "epoch": 0.013509270726134203, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 1549 + }, + { + "epoch": 0.013517992011302786, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 1550 + }, + { + "epoch": 0.013526713296471368, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0883, + "step": 1551 + }, + { + "epoch": 0.01353543458163995, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0918, + "step": 1552 + }, + { + "epoch": 0.013544155866808533, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 1553 + }, + { + "epoch": 0.013552877151977115, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.1067, + "step": 1554 + }, + { + "epoch": 0.013561598437145697, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 1555 + }, + { + "epoch": 0.01357031972231428, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.1065, + "step": 1556 + }, + { + "epoch": 0.013579041007482863, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 1557 + }, + { + "epoch": 0.013587762292651445, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.1016, + "step": 1558 + }, + { + "epoch": 0.013596483577820027, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.098, + "step": 1559 + }, + { + "epoch": 0.01360520486298861, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.1034, + "step": 1560 + }, + { + "epoch": 0.013613926148157193, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0806, + "step": 1561 + }, + { + "epoch": 0.013622647433325774, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0847, + "step": 1562 + }, + { + "epoch": 0.013631368718494357, + "grad_norm": 0.33984375, + "learning_rate": 0.0005, + "loss": 1.0901, + "step": 1563 + }, + { + "epoch": 0.01364009000366294, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1123, + "step": 1564 + }, + { + "epoch": 0.013648811288831521, + "grad_norm": 0.427734375, + "learning_rate": 0.0005, + "loss": 1.0985, + "step": 1565 + }, + { + "epoch": 0.013657532574000104, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0958, + "step": 1566 + }, + { + "epoch": 0.013666253859168687, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0935, + "step": 1567 + }, + { + "epoch": 0.01367497514433727, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.088, + "step": 1568 + }, + { + "epoch": 0.013683696429505851, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 1569 + }, + { + "epoch": 0.013692417714674434, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 1570 + }, + { + "epoch": 0.013701138999843017, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0928, + "step": 1571 + }, + { + "epoch": 0.013709860285011598, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 1572 + }, + { + "epoch": 0.013718581570180181, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.1052, + "step": 1573 + }, + { + "epoch": 0.013727302855348764, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 1574 + }, + { + "epoch": 0.013736024140517347, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.096, + "step": 1575 + }, + { + "epoch": 0.013744745425685928, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0799, + "step": 1576 + }, + { + "epoch": 0.013753466710854511, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.1125, + "step": 1577 + }, + { + "epoch": 0.013762187996023094, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.1134, + "step": 1578 + }, + { + "epoch": 0.013770909281191677, + "grad_norm": 0.60546875, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 1579 + }, + { + "epoch": 0.013779630566360258, + "grad_norm": 0.5, + "learning_rate": 0.0005, + "loss": 1.0919, + "step": 1580 + }, + { + "epoch": 0.013788351851528841, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0921, + "step": 1581 + }, + { + "epoch": 0.013797073136697424, + "grad_norm": 0.451171875, + "learning_rate": 0.0005, + "loss": 1.1193, + "step": 1582 + }, + { + "epoch": 0.013805794421866005, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 1583 + }, + { + "epoch": 0.013814515707034588, + "grad_norm": 0.42578125, + "learning_rate": 0.0005, + "loss": 1.0913, + "step": 1584 + }, + { + "epoch": 0.013823236992203171, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0926, + "step": 1585 + }, + { + "epoch": 0.013831958277371754, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 1586 + }, + { + "epoch": 0.013840679562540335, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.1053, + "step": 1587 + }, + { + "epoch": 0.013849400847708918, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.1014, + "step": 1588 + }, + { + "epoch": 0.013858122132877501, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.084, + "step": 1589 + }, + { + "epoch": 0.013866843418046084, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0819, + "step": 1590 + }, + { + "epoch": 0.013875564703214665, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 1591 + }, + { + "epoch": 0.013884285988383248, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0891, + "step": 1592 + }, + { + "epoch": 0.013893007273551831, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 1593 + }, + { + "epoch": 0.013901728558720412, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0825, + "step": 1594 + }, + { + "epoch": 0.013910449843888995, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 1595 + }, + { + "epoch": 0.013919171129057578, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 1596 + }, + { + "epoch": 0.013927892414226161, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0925, + "step": 1597 + }, + { + "epoch": 0.013936613699394742, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 1598 + }, + { + "epoch": 0.013945334984563325, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0891, + "step": 1599 + }, + { + "epoch": 0.013954056269731908, + "grad_norm": 0.423828125, + "learning_rate": 0.0005, + "loss": 1.097, + "step": 1600 + }, + { + "epoch": 0.01396277755490049, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 1601 + }, + { + "epoch": 0.013971498840069072, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.0928, + "step": 1602 + }, + { + "epoch": 0.013980220125237655, + "grad_norm": 0.38671875, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 1603 + }, + { + "epoch": 0.013988941410406238, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 1604 + }, + { + "epoch": 0.01399766269557482, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 1605 + }, + { + "epoch": 0.014006383980743402, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 1606 + }, + { + "epoch": 0.014015105265911985, + "grad_norm": 0.55078125, + "learning_rate": 0.0005, + "loss": 1.0951, + "step": 1607 + }, + { + "epoch": 0.014023826551080568, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0969, + "step": 1608 + }, + { + "epoch": 0.01403254783624915, + "grad_norm": 0.390625, + "learning_rate": 0.0005, + "loss": 1.1014, + "step": 1609 + }, + { + "epoch": 0.014041269121417732, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.1005, + "step": 1610 + }, + { + "epoch": 0.014049990406586315, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 1611 + }, + { + "epoch": 0.014058711691754896, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 1612 + }, + { + "epoch": 0.014067432976923479, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.1055, + "step": 1613 + }, + { + "epoch": 0.014076154262092062, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 1614 + }, + { + "epoch": 0.014084875547260645, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 1615 + }, + { + "epoch": 0.014093596832429226, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0766, + "step": 1616 + }, + { + "epoch": 0.014102318117597809, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0875, + "step": 1617 + }, + { + "epoch": 0.014111039402766392, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 1618 + }, + { + "epoch": 0.014119760687934975, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.1218, + "step": 1619 + }, + { + "epoch": 0.014128481973103556, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 1620 + }, + { + "epoch": 0.014137203258272139, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 1621 + }, + { + "epoch": 0.014145924543440722, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.1135, + "step": 1622 + }, + { + "epoch": 0.014154645828609303, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 1623 + }, + { + "epoch": 0.014163367113777886, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 1624 + }, + { + "epoch": 0.014172088398946469, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 1625 + }, + { + "epoch": 0.014180809684115052, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 1626 + }, + { + "epoch": 0.014189530969283633, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 1627 + }, + { + "epoch": 0.014198252254452216, + "grad_norm": 0.375, + "learning_rate": 0.0005, + "loss": 1.1015, + "step": 1628 + }, + { + "epoch": 0.014206973539620799, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 1629 + }, + { + "epoch": 0.01421569482478938, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0981, + "step": 1630 + }, + { + "epoch": 0.014224416109957963, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0936, + "step": 1631 + }, + { + "epoch": 0.014233137395126546, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 1632 + }, + { + "epoch": 0.014241858680295129, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 1633 + }, + { + "epoch": 0.01425057996546371, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 1634 + }, + { + "epoch": 0.014259301250632293, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.113, + "step": 1635 + }, + { + "epoch": 0.014268022535800876, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 1636 + }, + { + "epoch": 0.014276743820969459, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0978, + "step": 1637 + }, + { + "epoch": 0.01428546510613804, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.1119, + "step": 1638 + }, + { + "epoch": 0.014294186391306623, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 1639 + }, + { + "epoch": 0.014302907676475206, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1640 + }, + { + "epoch": 0.014311628961643787, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 1641 + }, + { + "epoch": 0.01432035024681237, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0889, + "step": 1642 + }, + { + "epoch": 0.014329071531980953, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 1643 + }, + { + "epoch": 0.014337792817149536, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0859, + "step": 1644 + }, + { + "epoch": 0.014346514102318117, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0829, + "step": 1645 + }, + { + "epoch": 0.0143552353874867, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.1022, + "step": 1646 + }, + { + "epoch": 0.014363956672655283, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 1647 + }, + { + "epoch": 0.014372677957823866, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 1648 + }, + { + "epoch": 0.014381399242992447, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0949, + "step": 1649 + }, + { + "epoch": 0.01439012052816103, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0972, + "step": 1650 + }, + { + "epoch": 0.014398841813329613, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0868, + "step": 1651 + }, + { + "epoch": 0.014407563098498194, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0912, + "step": 1652 + }, + { + "epoch": 0.014416284383666777, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 1653 + }, + { + "epoch": 0.01442500566883536, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 1654 + }, + { + "epoch": 0.014433726954003943, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0995, + "step": 1655 + }, + { + "epoch": 0.014442448239172524, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 1656 + }, + { + "epoch": 0.014451169524341107, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0819, + "step": 1657 + }, + { + "epoch": 0.01445989080950969, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.1176, + "step": 1658 + }, + { + "epoch": 0.014468612094678273, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.116, + "step": 1659 + }, + { + "epoch": 0.014477333379846854, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 1660 + }, + { + "epoch": 0.014486054665015437, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 1661 + }, + { + "epoch": 0.01449477595018402, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0955, + "step": 1662 + }, + { + "epoch": 0.0145034972353526, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0959, + "step": 1663 + }, + { + "epoch": 0.014512218520521184, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 1664 + }, + { + "epoch": 0.014520939805689767, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.1084, + "step": 1665 + }, + { + "epoch": 0.01452966109085835, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 1666 + }, + { + "epoch": 0.01453838237602693, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0908, + "step": 1667 + }, + { + "epoch": 0.014547103661195514, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 1668 + }, + { + "epoch": 0.014555824946364097, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0869, + "step": 1669 + }, + { + "epoch": 0.014564546231532678, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0903, + "step": 1670 + }, + { + "epoch": 0.01457326751670126, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 1671 + }, + { + "epoch": 0.014581988801869844, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0855, + "step": 1672 + }, + { + "epoch": 0.014590710087038427, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0928, + "step": 1673 + }, + { + "epoch": 0.014599431372207008, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 1674 + }, + { + "epoch": 0.01460815265737559, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.1192, + "step": 1675 + }, + { + "epoch": 0.014616873942544174, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0903, + "step": 1676 + }, + { + "epoch": 0.014625595227712757, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 1677 + }, + { + "epoch": 0.014634316512881338, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 1678 + }, + { + "epoch": 0.01464303779804992, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 1679 + }, + { + "epoch": 0.014651759083218504, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 1680 + }, + { + "epoch": 0.014660480368387085, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 1681 + }, + { + "epoch": 0.014669201653555668, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0978, + "step": 1682 + }, + { + "epoch": 0.01467792293872425, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.1079, + "step": 1683 + }, + { + "epoch": 0.014686644223892834, + "grad_norm": 0.40234375, + "learning_rate": 0.0005, + "loss": 1.0993, + "step": 1684 + }, + { + "epoch": 0.014695365509061415, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 1685 + }, + { + "epoch": 0.014704086794229998, + "grad_norm": 0.326171875, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 1686 + }, + { + "epoch": 0.01471280807939858, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 1687 + }, + { + "epoch": 0.014721529364567163, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 1688 + }, + { + "epoch": 0.014730250649735745, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.1005, + "step": 1689 + }, + { + "epoch": 0.014738971934904328, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.0869, + "step": 1690 + }, + { + "epoch": 0.01474769322007291, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.088, + "step": 1691 + }, + { + "epoch": 0.014756414505241492, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.1166, + "step": 1692 + }, + { + "epoch": 0.014765135790410075, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 1693 + }, + { + "epoch": 0.014773857075578658, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 1694 + }, + { + "epoch": 0.01478257836074724, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 1695 + }, + { + "epoch": 0.014791299645915822, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0798, + "step": 1696 + }, + { + "epoch": 0.014800020931084405, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0742, + "step": 1697 + }, + { + "epoch": 0.014808742216252987, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.1003, + "step": 1698 + }, + { + "epoch": 0.014817463501421569, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0948, + "step": 1699 + }, + { + "epoch": 0.014826184786590152, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.099, + "step": 1700 + }, + { + "epoch": 0.014834906071758735, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.1098, + "step": 1701 + }, + { + "epoch": 0.014843627356927317, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.102, + "step": 1702 + }, + { + "epoch": 0.014852348642095899, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 1703 + }, + { + "epoch": 0.014861069927264482, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0978, + "step": 1704 + }, + { + "epoch": 0.014869791212433064, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 1705 + }, + { + "epoch": 0.014878512497601647, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0861, + "step": 1706 + }, + { + "epoch": 0.014887233782770229, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 1707 + }, + { + "epoch": 0.014895955067938811, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0929, + "step": 1708 + }, + { + "epoch": 0.014904676353107394, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0749, + "step": 1709 + }, + { + "epoch": 0.014913397638275976, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 1710 + }, + { + "epoch": 0.014922118923444559, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 1711 + }, + { + "epoch": 0.014930840208613141, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.1044, + "step": 1712 + }, + { + "epoch": 0.014939561493781724, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 1713 + }, + { + "epoch": 0.014948282778950306, + "grad_norm": 0.55078125, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 1714 + }, + { + "epoch": 0.014957004064118888, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.1018, + "step": 1715 + }, + { + "epoch": 0.014965725349287471, + "grad_norm": 0.357421875, + "learning_rate": 0.0005, + "loss": 1.0846, + "step": 1716 + }, + { + "epoch": 0.014974446634456054, + "grad_norm": 0.498046875, + "learning_rate": 0.0005, + "loss": 1.083, + "step": 1717 + }, + { + "epoch": 0.014983167919624635, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.1046, + "step": 1718 + }, + { + "epoch": 0.014991889204793218, + "grad_norm": 0.474609375, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 1719 + }, + { + "epoch": 0.015000610489961801, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 1720 + }, + { + "epoch": 0.015009331775130383, + "grad_norm": 0.388671875, + "learning_rate": 0.0005, + "loss": 1.0915, + "step": 1721 + }, + { + "epoch": 0.015018053060298965, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 1722 + }, + { + "epoch": 0.015026774345467548, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 1723 + }, + { + "epoch": 0.015035495630636131, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.1059, + "step": 1724 + }, + { + "epoch": 0.015044216915804712, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0787, + "step": 1725 + }, + { + "epoch": 0.015052938200973295, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.0829, + "step": 1726 + }, + { + "epoch": 0.015061659486141878, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 1727 + }, + { + "epoch": 0.01507038077131046, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 1728 + }, + { + "epoch": 0.015079102056479042, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.09, + "step": 1729 + }, + { + "epoch": 0.015087823341647625, + "grad_norm": 0.392578125, + "learning_rate": 0.0005, + "loss": 1.0869, + "step": 1730 + }, + { + "epoch": 0.015096544626816208, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 1731 + }, + { + "epoch": 0.01510526591198479, + "grad_norm": 0.54296875, + "learning_rate": 0.0005, + "loss": 1.1011, + "step": 1732 + }, + { + "epoch": 0.015113987197153372, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 1733 + }, + { + "epoch": 0.015122708482321955, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.0872, + "step": 1734 + }, + { + "epoch": 0.015131429767490538, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.0777, + "step": 1735 + }, + { + "epoch": 0.01514015105265912, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.1034, + "step": 1736 + }, + { + "epoch": 0.015148872337827702, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0935, + "step": 1737 + }, + { + "epoch": 0.015157593622996285, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 1738 + }, + { + "epoch": 0.015166314908164866, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0904, + "step": 1739 + }, + { + "epoch": 0.01517503619333345, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 1740 + }, + { + "epoch": 0.015183757478502032, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 1741 + }, + { + "epoch": 0.015192478763670615, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 1742 + }, + { + "epoch": 0.015201200048839196, + "grad_norm": 0.404296875, + "learning_rate": 0.0005, + "loss": 1.0801, + "step": 1743 + }, + { + "epoch": 0.01520992133400778, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.086, + "step": 1744 + }, + { + "epoch": 0.015218642619176362, + "grad_norm": 0.494140625, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 1745 + }, + { + "epoch": 0.015227363904344945, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 1746 + }, + { + "epoch": 0.015236085189513526, + "grad_norm": 0.5390625, + "learning_rate": 0.0005, + "loss": 1.0862, + "step": 1747 + }, + { + "epoch": 0.01524480647468211, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0923, + "step": 1748 + }, + { + "epoch": 0.015253527759850692, + "grad_norm": 0.41015625, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 1749 + }, + { + "epoch": 0.015262249045019273, + "grad_norm": 0.7421875, + "learning_rate": 0.0005, + "loss": 1.0989, + "step": 1750 + }, + { + "epoch": 0.015270970330187856, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 1.0929, + "step": 1751 + }, + { + "epoch": 0.01527969161535644, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0952, + "step": 1752 + }, + { + "epoch": 0.015288412900525022, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.1017, + "step": 1753 + }, + { + "epoch": 0.015297134185693603, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 1754 + }, + { + "epoch": 0.015305855470862186, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 1755 + }, + { + "epoch": 0.015314576756030769, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 1756 + }, + { + "epoch": 0.01532329804119935, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 1757 + }, + { + "epoch": 0.015332019326367933, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 1758 + }, + { + "epoch": 0.015340740611536516, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 1759 + }, + { + "epoch": 0.015349461896705099, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 1760 + }, + { + "epoch": 0.01535818318187368, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.086, + "step": 1761 + }, + { + "epoch": 0.015366904467042263, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.1071, + "step": 1762 + }, + { + "epoch": 0.015375625752210846, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 1763 + }, + { + "epoch": 0.015384347037379429, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 1764 + }, + { + "epoch": 0.01539306832254801, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 1765 + }, + { + "epoch": 0.015401789607716593, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0812, + "step": 1766 + }, + { + "epoch": 0.015410510892885176, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 1767 + }, + { + "epoch": 0.015419232178053757, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 1768 + }, + { + "epoch": 0.01542795346322234, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0945, + "step": 1769 + }, + { + "epoch": 0.015436674748390923, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.106, + "step": 1770 + }, + { + "epoch": 0.015445396033559506, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 1771 + }, + { + "epoch": 0.015454117318728087, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 1772 + }, + { + "epoch": 0.01546283860389667, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.1014, + "step": 1773 + }, + { + "epoch": 0.015471559889065253, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 1774 + }, + { + "epoch": 0.015480281174233836, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 1775 + }, + { + "epoch": 0.015489002459402417, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 1776 + }, + { + "epoch": 0.015497723744571, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 1777 + }, + { + "epoch": 0.015506445029739583, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 1778 + }, + { + "epoch": 0.015515166314908164, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 1779 + }, + { + "epoch": 0.015523887600076747, + "grad_norm": 0.4375, + "learning_rate": 0.0005, + "loss": 1.0876, + "step": 1780 + }, + { + "epoch": 0.01553260888524533, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 1781 + }, + { + "epoch": 0.015541330170413913, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.1046, + "step": 1782 + }, + { + "epoch": 0.015550051455582494, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 1783 + }, + { + "epoch": 0.015558772740751077, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0922, + "step": 1784 + }, + { + "epoch": 0.01556749402591966, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 1785 + }, + { + "epoch": 0.015576215311088241, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.108, + "step": 1786 + }, + { + "epoch": 0.015584936596256824, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0863, + "step": 1787 + }, + { + "epoch": 0.015593657881425407, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 1788 + }, + { + "epoch": 0.01560237916659399, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.1052, + "step": 1789 + }, + { + "epoch": 0.015611100451762571, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 1790 + }, + { + "epoch": 0.015619821736931154, + "grad_norm": 0.5703125, + "learning_rate": 0.0005, + "loss": 1.0866, + "step": 1791 + }, + { + "epoch": 0.015628543022099737, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 1792 + }, + { + "epoch": 0.01563726430726832, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 1793 + }, + { + "epoch": 0.015645985592436903, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 1794 + }, + { + "epoch": 0.015654706877605482, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0886, + "step": 1795 + }, + { + "epoch": 0.015663428162774065, + "grad_norm": 0.365234375, + "learning_rate": 0.0005, + "loss": 1.1032, + "step": 1796 + }, + { + "epoch": 0.015672149447942648, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0938, + "step": 1797 + }, + { + "epoch": 0.01568087073311123, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.1284, + "step": 1798 + }, + { + "epoch": 0.015689592018279814, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 1799 + }, + { + "epoch": 0.015698313303448397, + "grad_norm": 0.357421875, + "learning_rate": 0.0005, + "loss": 1.096, + "step": 1800 + }, + { + "epoch": 0.01570703458861698, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0799, + "step": 1801 + }, + { + "epoch": 0.015715755873785563, + "grad_norm": 0.376953125, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 1802 + }, + { + "epoch": 0.015724477158954142, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 1803 + }, + { + "epoch": 0.015733198444122725, + "grad_norm": 0.80078125, + "learning_rate": 0.0005, + "loss": 1.0923, + "step": 1804 + }, + { + "epoch": 0.015741919729291308, + "grad_norm": 0.439453125, + "learning_rate": 0.0005, + "loss": 1.095, + "step": 1805 + }, + { + "epoch": 0.01575064101445989, + "grad_norm": 0.5, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 1806 + }, + { + "epoch": 0.015759362299628474, + "grad_norm": 0.796875, + "learning_rate": 0.0005, + "loss": 1.1163, + "step": 1807 + }, + { + "epoch": 0.015768083584797057, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 1808 + }, + { + "epoch": 0.01577680486996564, + "grad_norm": 0.76953125, + "learning_rate": 0.0005, + "loss": 1.0933, + "step": 1809 + }, + { + "epoch": 0.01578552615513422, + "grad_norm": 0.384765625, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 1810 + }, + { + "epoch": 0.015794247440302802, + "grad_norm": 0.4921875, + "learning_rate": 0.0005, + "loss": 1.0891, + "step": 1811 + }, + { + "epoch": 0.015802968725471385, + "grad_norm": 0.50390625, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 1812 + }, + { + "epoch": 0.015811690010639968, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 1813 + }, + { + "epoch": 0.01582041129580855, + "grad_norm": 0.61328125, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 1814 + }, + { + "epoch": 0.015829132580977134, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.1, + "step": 1815 + }, + { + "epoch": 0.015837853866145717, + "grad_norm": 0.59375, + "learning_rate": 0.0005, + "loss": 1.1027, + "step": 1816 + }, + { + "epoch": 0.015846575151314296, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0998, + "step": 1817 + }, + { + "epoch": 0.01585529643648288, + "grad_norm": 0.6875, + "learning_rate": 0.0005, + "loss": 1.0903, + "step": 1818 + }, + { + "epoch": 0.015864017721651462, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0946, + "step": 1819 + }, + { + "epoch": 0.015872739006820045, + "grad_norm": 0.50390625, + "learning_rate": 0.0005, + "loss": 1.0923, + "step": 1820 + }, + { + "epoch": 0.015881460291988628, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0791, + "step": 1821 + }, + { + "epoch": 0.01589018157715721, + "grad_norm": 0.59765625, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 1822 + }, + { + "epoch": 0.015898902862325794, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 1823 + }, + { + "epoch": 0.015907624147494377, + "grad_norm": 0.72265625, + "learning_rate": 0.0005, + "loss": 1.1101, + "step": 1824 + }, + { + "epoch": 0.015916345432662956, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 1825 + }, + { + "epoch": 0.01592506671783154, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.0969, + "step": 1826 + }, + { + "epoch": 0.015933788003000122, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 1827 + }, + { + "epoch": 0.015942509288168705, + "grad_norm": 0.447265625, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 1828 + }, + { + "epoch": 0.015951230573337288, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.1013, + "step": 1829 + }, + { + "epoch": 0.01595995185850587, + "grad_norm": 0.5078125, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 1830 + }, + { + "epoch": 0.015968673143674453, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 1831 + }, + { + "epoch": 0.015977394428843033, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 1832 + }, + { + "epoch": 0.015986115714011616, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0899, + "step": 1833 + }, + { + "epoch": 0.0159948369991802, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0904, + "step": 1834 + }, + { + "epoch": 0.01600355828434878, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 1835 + }, + { + "epoch": 0.016012279569517365, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0991, + "step": 1836 + }, + { + "epoch": 0.016021000854685948, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.091, + "step": 1837 + }, + { + "epoch": 0.01602972213985453, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 1838 + }, + { + "epoch": 0.01603844342502311, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 1839 + }, + { + "epoch": 0.016047164710191693, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0809, + "step": 1840 + }, + { + "epoch": 0.016055885995360276, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.1015, + "step": 1841 + }, + { + "epoch": 0.01606460728052886, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 1842 + }, + { + "epoch": 0.01607332856569744, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 1843 + }, + { + "epoch": 0.016082049850866025, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 1844 + }, + { + "epoch": 0.016090771136034607, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 1845 + }, + { + "epoch": 0.016099492421203187, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0981, + "step": 1846 + }, + { + "epoch": 0.01610821370637177, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0915, + "step": 1847 + }, + { + "epoch": 0.016116934991540353, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.1054, + "step": 1848 + }, + { + "epoch": 0.016125656276708936, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 1849 + }, + { + "epoch": 0.01613437756187752, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 1850 + }, + { + "epoch": 0.0161430988470461, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 1851 + }, + { + "epoch": 0.016151820132214684, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 1852 + }, + { + "epoch": 0.016160541417383267, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0925, + "step": 1853 + }, + { + "epoch": 0.016169262702551847, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0861, + "step": 1854 + }, + { + "epoch": 0.01617798398772043, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.099, + "step": 1855 + }, + { + "epoch": 0.016186705272889013, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0862, + "step": 1856 + }, + { + "epoch": 0.016195426558057596, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 1857 + }, + { + "epoch": 0.01620414784322618, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.1044, + "step": 1858 + }, + { + "epoch": 0.01621286912839476, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 1859 + }, + { + "epoch": 0.016221590413563344, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0787, + "step": 1860 + }, + { + "epoch": 0.016230311698731924, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.1021, + "step": 1861 + }, + { + "epoch": 0.016239032983900507, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 1862 + }, + { + "epoch": 0.01624775426906909, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 1863 + }, + { + "epoch": 0.016256475554237673, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.1024, + "step": 1864 + }, + { + "epoch": 0.016265196839406255, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 1865 + }, + { + "epoch": 0.01627391812457484, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0918, + "step": 1866 + }, + { + "epoch": 0.01628263940974342, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 1867 + }, + { + "epoch": 0.016291360694912, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 1868 + }, + { + "epoch": 0.016300081980080584, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.107, + "step": 1869 + }, + { + "epoch": 0.016308803265249167, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 1870 + }, + { + "epoch": 0.01631752455041775, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 1871 + }, + { + "epoch": 0.016326245835586332, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.09, + "step": 1872 + }, + { + "epoch": 0.016334967120754915, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0953, + "step": 1873 + }, + { + "epoch": 0.016343688405923498, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 1874 + }, + { + "epoch": 0.016352409691092078, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 1875 + }, + { + "epoch": 0.01636113097626066, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 1876 + }, + { + "epoch": 0.016369852261429244, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 1877 + }, + { + "epoch": 0.016378573546597826, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0957, + "step": 1878 + }, + { + "epoch": 0.01638729483176641, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0992, + "step": 1879 + }, + { + "epoch": 0.016396016116934992, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 1880 + }, + { + "epoch": 0.016404737402103575, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0937, + "step": 1881 + }, + { + "epoch": 0.016413458687272158, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0866, + "step": 1882 + }, + { + "epoch": 0.016422179972440738, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 1883 + }, + { + "epoch": 0.01643090125760932, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.1069, + "step": 1884 + }, + { + "epoch": 0.016439622542777903, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0951, + "step": 1885 + }, + { + "epoch": 0.016448343827946486, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 1886 + }, + { + "epoch": 0.01645706511311507, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.099, + "step": 1887 + }, + { + "epoch": 0.016465786398283652, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 1888 + }, + { + "epoch": 0.016474507683452235, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 1889 + }, + { + "epoch": 0.016483228968620815, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 1890 + }, + { + "epoch": 0.016491950253789397, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 1891 + }, + { + "epoch": 0.01650067153895798, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0907, + "step": 1892 + }, + { + "epoch": 0.016509392824126563, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 1893 + }, + { + "epoch": 0.016518114109295146, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 1894 + }, + { + "epoch": 0.01652683539446373, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0943, + "step": 1895 + }, + { + "epoch": 0.016535556679632312, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0953, + "step": 1896 + }, + { + "epoch": 0.01654427796480089, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0829, + "step": 1897 + }, + { + "epoch": 0.016552999249969474, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 1898 + }, + { + "epoch": 0.016561720535138057, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 1899 + }, + { + "epoch": 0.01657044182030664, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 1900 + }, + { + "epoch": 0.016579163105475223, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 1901 + }, + { + "epoch": 0.016587884390643806, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 1902 + }, + { + "epoch": 0.01659660567581239, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.1073, + "step": 1903 + }, + { + "epoch": 0.01660532696098097, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 1904 + }, + { + "epoch": 0.01661404824614955, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 1905 + }, + { + "epoch": 0.016622769531318134, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 1906 + }, + { + "epoch": 0.016631490816486717, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 1907 + }, + { + "epoch": 0.0166402121016553, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 1908 + }, + { + "epoch": 0.016648933386823883, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 1909 + }, + { + "epoch": 0.016657654671992466, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 1910 + }, + { + "epoch": 0.01666637595716105, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.1021, + "step": 1911 + }, + { + "epoch": 0.01667509724232963, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 1912 + }, + { + "epoch": 0.01668381852749821, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0919, + "step": 1913 + }, + { + "epoch": 0.016692539812666794, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 1914 + }, + { + "epoch": 0.016701261097835377, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0907, + "step": 1915 + }, + { + "epoch": 0.01670998238300396, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 1916 + }, + { + "epoch": 0.016718703668172543, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 1917 + }, + { + "epoch": 0.016727424953341126, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 1918 + }, + { + "epoch": 0.016736146238509705, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0844, + "step": 1919 + }, + { + "epoch": 0.01674486752367829, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 1920 + }, + { + "epoch": 0.01675358880884687, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 1921 + }, + { + "epoch": 0.016762310094015454, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0904, + "step": 1922 + }, + { + "epoch": 0.016771031379184037, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0799, + "step": 1923 + }, + { + "epoch": 0.01677975266435262, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 1924 + }, + { + "epoch": 0.016788473949521203, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.1044, + "step": 1925 + }, + { + "epoch": 0.016797195234689782, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 1926 + }, + { + "epoch": 0.016805916519858365, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 1927 + }, + { + "epoch": 0.016814637805026948, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0913, + "step": 1928 + }, + { + "epoch": 0.01682335909019553, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 1929 + }, + { + "epoch": 0.016832080375364114, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 1930 + }, + { + "epoch": 0.016840801660532697, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 1931 + }, + { + "epoch": 0.01684952294570128, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0927, + "step": 1932 + }, + { + "epoch": 0.01685824423086986, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0966, + "step": 1933 + }, + { + "epoch": 0.016866965516038442, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 1934 + }, + { + "epoch": 0.016875686801207025, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 1935 + }, + { + "epoch": 0.016884408086375608, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 1936 + }, + { + "epoch": 0.01689312937154419, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 1937 + }, + { + "epoch": 0.016901850656712774, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0855, + "step": 1938 + }, + { + "epoch": 0.016910571941881357, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0845, + "step": 1939 + }, + { + "epoch": 0.01691929322704994, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0932, + "step": 1940 + }, + { + "epoch": 0.01692801451221852, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0943, + "step": 1941 + }, + { + "epoch": 0.016936735797387102, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 1942 + }, + { + "epoch": 0.016945457082555685, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 1943 + }, + { + "epoch": 0.016954178367724268, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0994, + "step": 1944 + }, + { + "epoch": 0.01696289965289285, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 1945 + }, + { + "epoch": 0.016971620938061434, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0912, + "step": 1946 + }, + { + "epoch": 0.016980342223230017, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 1947 + }, + { + "epoch": 0.016989063508398596, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0861, + "step": 1948 + }, + { + "epoch": 0.01699778479356718, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.1137, + "step": 1949 + }, + { + "epoch": 0.017006506078735762, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0966, + "step": 1950 + }, + { + "epoch": 0.017015227363904345, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0924, + "step": 1951 + }, + { + "epoch": 0.017023948649072928, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 1952 + }, + { + "epoch": 0.01703266993424151, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0899, + "step": 1953 + }, + { + "epoch": 0.017041391219410094, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 1954 + }, + { + "epoch": 0.017050112504578673, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0952, + "step": 1955 + }, + { + "epoch": 0.017058833789747256, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0967, + "step": 1956 + }, + { + "epoch": 0.01706755507491584, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0839, + "step": 1957 + }, + { + "epoch": 0.017076276360084422, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 1958 + }, + { + "epoch": 0.017084997645253005, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0932, + "step": 1959 + }, + { + "epoch": 0.017093718930421588, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0876, + "step": 1960 + }, + { + "epoch": 0.01710244021559017, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 1961 + }, + { + "epoch": 0.01711116150075875, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 1962 + }, + { + "epoch": 0.017119882785927333, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 1963 + }, + { + "epoch": 0.017128604071095916, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.1025, + "step": 1964 + }, + { + "epoch": 0.0171373253562645, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 1965 + }, + { + "epoch": 0.017146046641433082, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 1966 + }, + { + "epoch": 0.017154767926601665, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.096, + "step": 1967 + }, + { + "epoch": 0.017163489211770248, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 1968 + }, + { + "epoch": 0.01717221049693883, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 1969 + }, + { + "epoch": 0.01718093178210741, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0883, + "step": 1970 + }, + { + "epoch": 0.017189653067275993, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0885, + "step": 1971 + }, + { + "epoch": 0.017198374352444576, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0876, + "step": 1972 + }, + { + "epoch": 0.01720709563761316, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 1973 + }, + { + "epoch": 0.017215816922781742, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0919, + "step": 1974 + }, + { + "epoch": 0.017224538207950325, + "grad_norm": 0.345703125, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 1975 + }, + { + "epoch": 0.017233259493118908, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0846, + "step": 1976 + }, + { + "epoch": 0.017241980778287487, + "grad_norm": 0.3984375, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 1977 + }, + { + "epoch": 0.01725070206345607, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0936, + "step": 1978 + }, + { + "epoch": 0.017259423348624653, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 1979 + }, + { + "epoch": 0.017268144633793236, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 1980 + }, + { + "epoch": 0.01727686591896182, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 1981 + }, + { + "epoch": 0.0172855872041304, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1982 + }, + { + "epoch": 0.017294308489298985, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.1088, + "step": 1983 + }, + { + "epoch": 0.017303029774467564, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 1984 + }, + { + "epoch": 0.017311751059636147, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.1092, + "step": 1985 + }, + { + "epoch": 0.01732047234480473, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 1986 + }, + { + "epoch": 0.017329193629973313, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 1987 + }, + { + "epoch": 0.017337914915141896, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0946, + "step": 1988 + }, + { + "epoch": 0.01734663620031048, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 1989 + }, + { + "epoch": 0.01735535748547906, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0947, + "step": 1990 + }, + { + "epoch": 0.01736407877064764, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 1991 + }, + { + "epoch": 0.017372800055816224, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 1992 + }, + { + "epoch": 0.017381521340984807, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.1085, + "step": 1993 + }, + { + "epoch": 0.01739024262615339, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.102, + "step": 1994 + }, + { + "epoch": 0.017398963911321973, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0779, + "step": 1995 + }, + { + "epoch": 0.017407685196490556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 1996 + }, + { + "epoch": 0.01741640648165914, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.088, + "step": 1997 + }, + { + "epoch": 0.01742512776682772, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.1067, + "step": 1998 + }, + { + "epoch": 0.0174338490519963, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0866, + "step": 1999 + }, + { + "epoch": 0.017442570337164884, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 2000 + }, + { + "epoch": 0.017451291622333467, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.1063, + "step": 2001 + }, + { + "epoch": 0.01746001290750205, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0914, + "step": 2002 + }, + { + "epoch": 0.017468734192670633, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 2003 + }, + { + "epoch": 0.017477455477839215, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0863, + "step": 2004 + }, + { + "epoch": 0.0174861767630078, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 2005 + }, + { + "epoch": 0.017494898048176378, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 2006 + }, + { + "epoch": 0.01750361933334496, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0777, + "step": 2007 + }, + { + "epoch": 0.017512340618513544, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 2008 + }, + { + "epoch": 0.017521061903682127, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0917, + "step": 2009 + }, + { + "epoch": 0.01752978318885071, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.1007, + "step": 2010 + }, + { + "epoch": 0.017538504474019292, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 2011 + }, + { + "epoch": 0.017547225759187875, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 2012 + }, + { + "epoch": 0.017555947044356455, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0798, + "step": 2013 + }, + { + "epoch": 0.017564668329525038, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0955, + "step": 2014 + }, + { + "epoch": 0.01757338961469362, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 2015 + }, + { + "epoch": 0.017582110899862204, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 2016 + }, + { + "epoch": 0.017590832185030787, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0965, + "step": 2017 + }, + { + "epoch": 0.01759955347019937, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0863, + "step": 2018 + }, + { + "epoch": 0.017608274755367952, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.096, + "step": 2019 + }, + { + "epoch": 0.017616996040536532, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 2020 + }, + { + "epoch": 0.017625717325705115, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 2021 + }, + { + "epoch": 0.017634438610873698, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0775, + "step": 2022 + }, + { + "epoch": 0.01764315989604228, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 2023 + }, + { + "epoch": 0.017651881181210863, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0742, + "step": 2024 + }, + { + "epoch": 0.017660602466379446, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 2025 + }, + { + "epoch": 0.01766932375154803, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 2026 + }, + { + "epoch": 0.017678045036716612, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 2027 + }, + { + "epoch": 0.01768676632188519, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0926, + "step": 2028 + }, + { + "epoch": 0.017695487607053775, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 2029 + }, + { + "epoch": 0.017704208892222358, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 2030 + }, + { + "epoch": 0.01771293017739094, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 2031 + }, + { + "epoch": 0.017721651462559523, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 2032 + }, + { + "epoch": 0.017730372747728106, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.1069, + "step": 2033 + }, + { + "epoch": 0.01773909403289669, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.092, + "step": 2034 + }, + { + "epoch": 0.01774781531806527, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.082, + "step": 2035 + }, + { + "epoch": 0.01775653660323385, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0918, + "step": 2036 + }, + { + "epoch": 0.017765257888402435, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0937, + "step": 2037 + }, + { + "epoch": 0.017773979173571017, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 2038 + }, + { + "epoch": 0.0177827004587396, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 2039 + }, + { + "epoch": 0.017791421743908183, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.1058, + "step": 2040 + }, + { + "epoch": 0.017800143029076766, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 2041 + }, + { + "epoch": 0.017808864314245346, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0888, + "step": 2042 + }, + { + "epoch": 0.01781758559941393, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.095, + "step": 2043 + }, + { + "epoch": 0.01782630688458251, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0804, + "step": 2044 + }, + { + "epoch": 0.017835028169751094, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 2045 + }, + { + "epoch": 0.017843749454919677, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0911, + "step": 2046 + }, + { + "epoch": 0.01785247074008826, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 2047 + }, + { + "epoch": 0.017861192025256843, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 2048 + }, + { + "epoch": 0.017869913310425423, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 2049 + }, + { + "epoch": 0.017878634595594006, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 2050 + }, + { + "epoch": 0.01788735588076259, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 2051 + }, + { + "epoch": 0.01789607716593117, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0807, + "step": 2052 + }, + { + "epoch": 0.017904798451099754, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.082, + "step": 2053 + }, + { + "epoch": 0.017913519736268337, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.086, + "step": 2054 + }, + { + "epoch": 0.01792224102143692, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.1119, + "step": 2055 + }, + { + "epoch": 0.017930962306605503, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 2056 + }, + { + "epoch": 0.017939683591774083, + "grad_norm": 0.515625, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 2057 + }, + { + "epoch": 0.017948404876942665, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0988, + "step": 2058 + }, + { + "epoch": 0.01795712616211125, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 2059 + }, + { + "epoch": 0.01796584744727983, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0867, + "step": 2060 + }, + { + "epoch": 0.017974568732448414, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 2061 + }, + { + "epoch": 0.017983290017616997, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 2062 + }, + { + "epoch": 0.01799201130278558, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.084, + "step": 2063 + }, + { + "epoch": 0.01800073258795416, + "grad_norm": 0.4765625, + "learning_rate": 0.0005, + "loss": 1.1007, + "step": 2064 + }, + { + "epoch": 0.018009453873122742, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0873, + "step": 2065 + }, + { + "epoch": 0.018018175158291325, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.086, + "step": 2066 + }, + { + "epoch": 0.01802689644345991, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0804, + "step": 2067 + }, + { + "epoch": 0.01803561772862849, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 2068 + }, + { + "epoch": 0.018044339013797074, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 2069 + }, + { + "epoch": 0.018053060298965657, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.103, + "step": 2070 + }, + { + "epoch": 0.018061781584134236, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0883, + "step": 2071 + }, + { + "epoch": 0.01807050286930282, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0862, + "step": 2072 + }, + { + "epoch": 0.018079224154471402, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0836, + "step": 2073 + }, + { + "epoch": 0.018087945439639985, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0935, + "step": 2074 + }, + { + "epoch": 0.018096666724808568, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0854, + "step": 2075 + }, + { + "epoch": 0.01810538800997715, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 2076 + }, + { + "epoch": 0.018114109295145734, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.0933, + "step": 2077 + }, + { + "epoch": 0.018122830580314313, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0883, + "step": 2078 + }, + { + "epoch": 0.018131551865482896, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 2079 + }, + { + "epoch": 0.01814027315065148, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 2080 + }, + { + "epoch": 0.018148994435820062, + "grad_norm": 0.353515625, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 2081 + }, + { + "epoch": 0.018157715720988645, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0868, + "step": 2082 + }, + { + "epoch": 0.018166437006157228, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 2083 + }, + { + "epoch": 0.01817515829132581, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0876, + "step": 2084 + }, + { + "epoch": 0.018183879576494394, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.0855, + "step": 2085 + }, + { + "epoch": 0.018192600861662973, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 2086 + }, + { + "epoch": 0.018201322146831556, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0907, + "step": 2087 + }, + { + "epoch": 0.01821004343200014, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 2088 + }, + { + "epoch": 0.018218764717168722, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 2089 + }, + { + "epoch": 0.018227486002337305, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 2090 + }, + { + "epoch": 0.018236207287505888, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 2091 + }, + { + "epoch": 0.01824492857267447, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 2092 + }, + { + "epoch": 0.01825364985784305, + "grad_norm": 0.361328125, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 2093 + }, + { + "epoch": 0.018262371143011633, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 2094 + }, + { + "epoch": 0.018271092428180216, + "grad_norm": 0.486328125, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 2095 + }, + { + "epoch": 0.0182798137133488, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 2096 + }, + { + "epoch": 0.018288534998517382, + "grad_norm": 0.439453125, + "learning_rate": 0.0005, + "loss": 1.0866, + "step": 2097 + }, + { + "epoch": 0.018297256283685965, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.1051, + "step": 2098 + }, + { + "epoch": 0.018305977568854548, + "grad_norm": 0.416015625, + "learning_rate": 0.0005, + "loss": 1.0825, + "step": 2099 + }, + { + "epoch": 0.018314698854023127, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 2100 + }, + { + "epoch": 0.01832342013919171, + "grad_norm": 0.408203125, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 2101 + }, + { + "epoch": 0.018332141424360293, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 2102 + }, + { + "epoch": 0.018340862709528876, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0934, + "step": 2103 + }, + { + "epoch": 0.01834958399469746, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 2104 + }, + { + "epoch": 0.018358305279866042, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.0989, + "step": 2105 + }, + { + "epoch": 0.018367026565034625, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 2106 + }, + { + "epoch": 0.018375747850203204, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 2107 + }, + { + "epoch": 0.018384469135371787, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 2108 + }, + { + "epoch": 0.01839319042054037, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 2109 + }, + { + "epoch": 0.018401911705708953, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0956, + "step": 2110 + }, + { + "epoch": 0.018410632990877536, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.084, + "step": 2111 + }, + { + "epoch": 0.01841935427604612, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 2112 + }, + { + "epoch": 0.018428075561214702, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 2113 + }, + { + "epoch": 0.018436796846383285, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 2114 + }, + { + "epoch": 0.018445518131551864, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 2115 + }, + { + "epoch": 0.018454239416720447, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 2116 + }, + { + "epoch": 0.01846296070188903, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 2117 + }, + { + "epoch": 0.018471681987057613, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0981, + "step": 2118 + }, + { + "epoch": 0.018480403272226196, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 2119 + }, + { + "epoch": 0.01848912455739478, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 2120 + }, + { + "epoch": 0.01849784584256336, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0845, + "step": 2121 + }, + { + "epoch": 0.01850656712773194, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 2122 + }, + { + "epoch": 0.018515288412900524, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 2123 + }, + { + "epoch": 0.018524009698069107, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 2124 + }, + { + "epoch": 0.01853273098323769, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 2125 + }, + { + "epoch": 0.018541452268406273, + "grad_norm": 0.421875, + "learning_rate": 0.0005, + "loss": 1.0953, + "step": 2126 + }, + { + "epoch": 0.018550173553574856, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 2127 + }, + { + "epoch": 0.01855889483874344, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0749, + "step": 2128 + }, + { + "epoch": 0.018567616123912018, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0934, + "step": 2129 + }, + { + "epoch": 0.0185763374090806, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 2130 + }, + { + "epoch": 0.018585058694249184, + "grad_norm": 0.369140625, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 2131 + }, + { + "epoch": 0.018593779979417767, + "grad_norm": 0.703125, + "learning_rate": 0.0005, + "loss": 1.0804, + "step": 2132 + }, + { + "epoch": 0.01860250126458635, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 2133 + }, + { + "epoch": 0.018611222549754933, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.0812, + "step": 2134 + }, + { + "epoch": 0.018619943834923516, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 2135 + }, + { + "epoch": 0.018628665120092095, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.0919, + "step": 2136 + }, + { + "epoch": 0.018637386405260678, + "grad_norm": 0.470703125, + "learning_rate": 0.0005, + "loss": 1.083, + "step": 2137 + }, + { + "epoch": 0.01864610769042926, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 2138 + }, + { + "epoch": 0.018654828975597844, + "grad_norm": 0.48828125, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 2139 + }, + { + "epoch": 0.018663550260766427, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.1092, + "step": 2140 + }, + { + "epoch": 0.01867227154593501, + "grad_norm": 0.421875, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 2141 + }, + { + "epoch": 0.018680992831103593, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 2142 + }, + { + "epoch": 0.018689714116272176, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.099, + "step": 2143 + }, + { + "epoch": 0.018698435401440755, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 2144 + }, + { + "epoch": 0.018707156686609338, + "grad_norm": 0.5625, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 2145 + }, + { + "epoch": 0.01871587797177792, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 2146 + }, + { + "epoch": 0.018724599256946504, + "grad_norm": 0.447265625, + "learning_rate": 0.0005, + "loss": 1.09, + "step": 2147 + }, + { + "epoch": 0.018733320542115087, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0963, + "step": 2148 + }, + { + "epoch": 0.01874204182728367, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.0908, + "step": 2149 + }, + { + "epoch": 0.018750763112452253, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0855, + "step": 2150 + }, + { + "epoch": 0.018759484397620832, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 2151 + }, + { + "epoch": 0.018768205682789415, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.092, + "step": 2152 + }, + { + "epoch": 0.018776926967957998, + "grad_norm": 0.35546875, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 2153 + }, + { + "epoch": 0.01878564825312658, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.095, + "step": 2154 + }, + { + "epoch": 0.018794369538295164, + "grad_norm": 0.384765625, + "learning_rate": 0.0005, + "loss": 1.1005, + "step": 2155 + }, + { + "epoch": 0.018803090823463747, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 2156 + }, + { + "epoch": 0.01881181210863233, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 2157 + }, + { + "epoch": 0.01882053339380091, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 2158 + }, + { + "epoch": 0.018829254678969492, + "grad_norm": 0.42578125, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 2159 + }, + { + "epoch": 0.018837975964138075, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 2160 + }, + { + "epoch": 0.018846697249306658, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 2161 + }, + { + "epoch": 0.01885541853447524, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 2162 + }, + { + "epoch": 0.018864139819643824, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 2163 + }, + { + "epoch": 0.018872861104812406, + "grad_norm": 0.427734375, + "learning_rate": 0.0005, + "loss": 1.0881, + "step": 2164 + }, + { + "epoch": 0.018881582389980986, + "grad_norm": 0.451171875, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 2165 + }, + { + "epoch": 0.01889030367514957, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.1069, + "step": 2166 + }, + { + "epoch": 0.018899024960318152, + "grad_norm": 0.419921875, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 2167 + }, + { + "epoch": 0.018907746245486735, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0846, + "step": 2168 + }, + { + "epoch": 0.018916467530655318, + "grad_norm": 0.78125, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 2169 + }, + { + "epoch": 0.0189251888158239, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.0935, + "step": 2170 + }, + { + "epoch": 0.018933910100992483, + "grad_norm": 0.451171875, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 2171 + }, + { + "epoch": 0.018942631386161066, + "grad_norm": 0.474609375, + "learning_rate": 0.0005, + "loss": 1.09, + "step": 2172 + }, + { + "epoch": 0.018951352671329646, + "grad_norm": 0.37109375, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 2173 + }, + { + "epoch": 0.01896007395649823, + "grad_norm": 0.67578125, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 2174 + }, + { + "epoch": 0.01896879524166681, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 2175 + }, + { + "epoch": 0.018977516526835395, + "grad_norm": 0.71875, + "learning_rate": 0.0005, + "loss": 1.0985, + "step": 2176 + }, + { + "epoch": 0.018986237812003978, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 2177 + }, + { + "epoch": 0.01899495909717256, + "grad_norm": 0.53125, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 2178 + }, + { + "epoch": 0.019003680382341143, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.082, + "step": 2179 + }, + { + "epoch": 0.019012401667509723, + "grad_norm": 0.353515625, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 2180 + }, + { + "epoch": 0.019021122952678306, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 2181 + }, + { + "epoch": 0.01902984423784689, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 2182 + }, + { + "epoch": 0.01903856552301547, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 2183 + }, + { + "epoch": 0.019047286808184054, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0843, + "step": 2184 + }, + { + "epoch": 0.019056008093352637, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 2185 + }, + { + "epoch": 0.01906472937852122, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 2186 + }, + { + "epoch": 0.0190734506636898, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0927, + "step": 2187 + }, + { + "epoch": 0.019082171948858383, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 2188 + }, + { + "epoch": 0.019090893234026966, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 2189 + }, + { + "epoch": 0.01909961451919555, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 2190 + }, + { + "epoch": 0.01910833580436413, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 2191 + }, + { + "epoch": 0.019117057089532714, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0939, + "step": 2192 + }, + { + "epoch": 0.019125778374701297, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 2193 + }, + { + "epoch": 0.019134499659869877, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0869, + "step": 2194 + }, + { + "epoch": 0.01914322094503846, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 2195 + }, + { + "epoch": 0.019151942230207043, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0899, + "step": 2196 + }, + { + "epoch": 0.019160663515375626, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0988, + "step": 2197 + }, + { + "epoch": 0.01916938480054421, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 2198 + }, + { + "epoch": 0.01917810608571279, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 2199 + }, + { + "epoch": 0.019186827370881374, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0988, + "step": 2200 + }, + { + "epoch": 0.019195548656049957, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 2201 + }, + { + "epoch": 0.019204269941218537, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 2202 + }, + { + "epoch": 0.01921299122638712, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0836, + "step": 2203 + }, + { + "epoch": 0.019221712511555702, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0862, + "step": 2204 + }, + { + "epoch": 0.019230433796724285, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 2205 + }, + { + "epoch": 0.01923915508189287, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 2206 + }, + { + "epoch": 0.01924787636706145, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 2207 + }, + { + "epoch": 0.019256597652230034, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 2208 + }, + { + "epoch": 0.019265318937398614, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0945, + "step": 2209 + }, + { + "epoch": 0.019274040222567197, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0799, + "step": 2210 + }, + { + "epoch": 0.01928276150773578, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0926, + "step": 2211 + }, + { + "epoch": 0.019291482792904362, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 2212 + }, + { + "epoch": 0.019300204078072945, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 2213 + }, + { + "epoch": 0.019308925363241528, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 2214 + }, + { + "epoch": 0.01931764664841011, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 2215 + }, + { + "epoch": 0.01932636793357869, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 2216 + }, + { + "epoch": 0.019335089218747274, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0876, + "step": 2217 + }, + { + "epoch": 0.019343810503915856, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 2218 + }, + { + "epoch": 0.01935253178908444, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 2219 + }, + { + "epoch": 0.019361253074253022, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0963, + "step": 2220 + }, + { + "epoch": 0.019369974359421605, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 2221 + }, + { + "epoch": 0.019378695644590188, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 2222 + }, + { + "epoch": 0.019387416929758768, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 2223 + }, + { + "epoch": 0.01939613821492735, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 2224 + }, + { + "epoch": 0.019404859500095933, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 2225 + }, + { + "epoch": 0.019413580785264516, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 2226 + }, + { + "epoch": 0.0194223020704331, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 2227 + }, + { + "epoch": 0.019431023355601682, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0975, + "step": 2228 + }, + { + "epoch": 0.019439744640770265, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 2229 + }, + { + "epoch": 0.019448465925938848, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0973, + "step": 2230 + }, + { + "epoch": 0.019457187211107427, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.1, + "step": 2231 + }, + { + "epoch": 0.01946590849627601, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 2232 + }, + { + "epoch": 0.019474629781444593, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 2233 + }, + { + "epoch": 0.019483351066613176, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0951, + "step": 2234 + }, + { + "epoch": 0.01949207235178176, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 2235 + }, + { + "epoch": 0.019500793636950342, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 2236 + }, + { + "epoch": 0.019509514922118925, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 2237 + }, + { + "epoch": 0.019518236207287504, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 2238 + }, + { + "epoch": 0.019526957492456087, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0809, + "step": 2239 + }, + { + "epoch": 0.01953567877762467, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 2240 + }, + { + "epoch": 0.019544400062793253, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0909, + "step": 2241 + }, + { + "epoch": 0.019553121347961836, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0859, + "step": 2242 + }, + { + "epoch": 0.01956184263313042, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 2243 + }, + { + "epoch": 0.019570563918299002, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0901, + "step": 2244 + }, + { + "epoch": 0.01957928520346758, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 2245 + }, + { + "epoch": 0.019588006488636164, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0862, + "step": 2246 + }, + { + "epoch": 0.019596727773804747, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 2247 + }, + { + "epoch": 0.01960544905897333, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0886, + "step": 2248 + }, + { + "epoch": 0.019614170344141913, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0949, + "step": 2249 + }, + { + "epoch": 0.019622891629310496, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 2250 + }, + { + "epoch": 0.01963161291447908, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 2251 + }, + { + "epoch": 0.01964033419964766, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0881, + "step": 2252 + }, + { + "epoch": 0.01964905548481624, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0939, + "step": 2253 + }, + { + "epoch": 0.019657776769984824, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0921, + "step": 2254 + }, + { + "epoch": 0.019666498055153407, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 2255 + }, + { + "epoch": 0.01967521934032199, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 2256 + }, + { + "epoch": 0.019683940625490573, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.1039, + "step": 2257 + }, + { + "epoch": 0.019692661910659156, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0935, + "step": 2258 + }, + { + "epoch": 0.01970138319582774, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 2259 + }, + { + "epoch": 0.01971010448099632, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 2260 + }, + { + "epoch": 0.0197188257661649, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 2261 + }, + { + "epoch": 0.019727547051333484, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 2262 + }, + { + "epoch": 0.019736268336502067, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 2263 + }, + { + "epoch": 0.01974498962167065, + "grad_norm": 0.515625, + "learning_rate": 0.0005, + "loss": 1.0952, + "step": 2264 + }, + { + "epoch": 0.019753710906839233, + "grad_norm": 0.384765625, + "learning_rate": 0.0005, + "loss": 1.0829, + "step": 2265 + }, + { + "epoch": 0.019762432192007816, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 2266 + }, + { + "epoch": 0.019771153477176395, + "grad_norm": 0.61328125, + "learning_rate": 0.0005, + "loss": 1.0777, + "step": 2267 + }, + { + "epoch": 0.019779874762344978, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 2268 + }, + { + "epoch": 0.01978859604751356, + "grad_norm": 0.37109375, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 2269 + }, + { + "epoch": 0.019797317332682144, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0966, + "step": 2270 + }, + { + "epoch": 0.019806038617850727, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 2271 + }, + { + "epoch": 0.01981475990301931, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 2272 + }, + { + "epoch": 0.019823481188187893, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 2273 + }, + { + "epoch": 0.019832202473356472, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 2274 + }, + { + "epoch": 0.019840923758525055, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0901, + "step": 2275 + }, + { + "epoch": 0.019849645043693638, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 2276 + }, + { + "epoch": 0.01985836632886222, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 2277 + }, + { + "epoch": 0.019867087614030804, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 2278 + }, + { + "epoch": 0.019875808899199387, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.099, + "step": 2279 + }, + { + "epoch": 0.01988453018436797, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 2280 + }, + { + "epoch": 0.01989325146953655, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 2281 + }, + { + "epoch": 0.019901972754705132, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0959, + "step": 2282 + }, + { + "epoch": 0.019910694039873715, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 2283 + }, + { + "epoch": 0.019919415325042298, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 2284 + }, + { + "epoch": 0.01992813661021088, + "grad_norm": 0.74609375, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 2285 + }, + { + "epoch": 0.019936857895379464, + "grad_norm": 0.443359375, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 2286 + }, + { + "epoch": 0.019945579180548047, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 2287 + }, + { + "epoch": 0.01995430046571663, + "grad_norm": 0.47265625, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 2288 + }, + { + "epoch": 0.01996302175088521, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 2289 + }, + { + "epoch": 0.019971743036053792, + "grad_norm": 0.51953125, + "learning_rate": 0.0005, + "loss": 1.0847, + "step": 2290 + }, + { + "epoch": 0.019980464321222375, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 2291 + }, + { + "epoch": 0.019989185606390958, + "grad_norm": 0.796875, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 2292 + }, + { + "epoch": 0.01999790689155954, + "grad_norm": 0.337890625, + "learning_rate": 0.0005, + "loss": 1.1189, + "step": 2293 + }, + { + "epoch": 0.020006628176728124, + "grad_norm": 0.451171875, + "learning_rate": 0.0005, + "loss": 1.0965, + "step": 2294 + }, + { + "epoch": 0.020015349461896707, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0946, + "step": 2295 + }, + { + "epoch": 0.020024070747065286, + "grad_norm": 0.447265625, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 2296 + }, + { + "epoch": 0.02003279203223387, + "grad_norm": 0.4375, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 2297 + }, + { + "epoch": 0.020041513317402452, + "grad_norm": 0.369140625, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 2298 + }, + { + "epoch": 0.020050234602571035, + "grad_norm": 0.67578125, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 2299 + }, + { + "epoch": 0.020058955887739618, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2300 + }, + { + "epoch": 0.0200676771729082, + "grad_norm": 0.578125, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 2301 + }, + { + "epoch": 0.020076398458076784, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 2302 + }, + { + "epoch": 0.020085119743245363, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 2303 + }, + { + "epoch": 0.020093841028413946, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0919, + "step": 2304 + }, + { + "epoch": 0.02010256231358253, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.0872, + "step": 2305 + }, + { + "epoch": 0.020111283598751112, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 2306 + }, + { + "epoch": 0.020120004883919695, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 2307 + }, + { + "epoch": 0.020128726169088278, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 2308 + }, + { + "epoch": 0.02013744745425686, + "grad_norm": 0.35546875, + "learning_rate": 0.0005, + "loss": 1.0909, + "step": 2309 + }, + { + "epoch": 0.02014616873942544, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0927, + "step": 2310 + }, + { + "epoch": 0.020154890024594023, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 2311 + }, + { + "epoch": 0.020163611309762606, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.0889, + "step": 2312 + }, + { + "epoch": 0.02017233259493119, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 2313 + }, + { + "epoch": 0.02018105388009977, + "grad_norm": 0.33984375, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 2314 + }, + { + "epoch": 0.020189775165268355, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 2315 + }, + { + "epoch": 0.020198496450436938, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 2316 + }, + { + "epoch": 0.02020721773560552, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0981, + "step": 2317 + }, + { + "epoch": 0.0202159390207741, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 2318 + }, + { + "epoch": 0.020224660305942683, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0886, + "step": 2319 + }, + { + "epoch": 0.020233381591111266, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0854, + "step": 2320 + }, + { + "epoch": 0.02024210287627985, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 2321 + }, + { + "epoch": 0.02025082416144843, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 2322 + }, + { + "epoch": 0.020259545446617015, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 2323 + }, + { + "epoch": 0.020268266731785597, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.1047, + "step": 2324 + }, + { + "epoch": 0.020276988016954177, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 2325 + }, + { + "epoch": 0.02028570930212276, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 2326 + }, + { + "epoch": 0.020294430587291343, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 2327 + }, + { + "epoch": 0.020303151872459926, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2328 + }, + { + "epoch": 0.02031187315762851, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0869, + "step": 2329 + }, + { + "epoch": 0.02032059444279709, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 2330 + }, + { + "epoch": 0.020329315727965674, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 2331 + }, + { + "epoch": 0.020338037013134254, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 2332 + }, + { + "epoch": 0.020346758298302837, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 2333 + }, + { + "epoch": 0.02035547958347142, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 2334 + }, + { + "epoch": 0.020364200868640003, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 2335 + }, + { + "epoch": 0.020372922153808586, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 2336 + }, + { + "epoch": 0.02038164343897717, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.1044, + "step": 2337 + }, + { + "epoch": 0.02039036472414575, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 2338 + }, + { + "epoch": 0.02039908600931433, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 2339 + }, + { + "epoch": 0.020407807294482914, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 2340 + }, + { + "epoch": 0.020416528579651497, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 2341 + }, + { + "epoch": 0.02042524986482008, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 2342 + }, + { + "epoch": 0.020433971149988663, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 2343 + }, + { + "epoch": 0.020442692435157245, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 2344 + }, + { + "epoch": 0.02045141372032583, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 2345 + }, + { + "epoch": 0.02046013500549441, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 2346 + }, + { + "epoch": 0.02046885629066299, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 2347 + }, + { + "epoch": 0.020477577575831574, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.082, + "step": 2348 + }, + { + "epoch": 0.020486298861000157, + "grad_norm": 0.357421875, + "learning_rate": 0.0005, + "loss": 1.0886, + "step": 2349 + }, + { + "epoch": 0.02049502014616874, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0948, + "step": 2350 + }, + { + "epoch": 0.020503741431337322, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0928, + "step": 2351 + }, + { + "epoch": 0.020512462716505905, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 2352 + }, + { + "epoch": 0.02052118400167449, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0806, + "step": 2353 + }, + { + "epoch": 0.020529905286843068, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 2354 + }, + { + "epoch": 0.02053862657201165, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 2355 + }, + { + "epoch": 0.020547347857180234, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 2356 + }, + { + "epoch": 0.020556069142348816, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 2357 + }, + { + "epoch": 0.0205647904275174, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0847, + "step": 2358 + }, + { + "epoch": 0.020573511712685982, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 2359 + }, + { + "epoch": 0.020582232997854565, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 2360 + }, + { + "epoch": 0.020590954283023145, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 2361 + }, + { + "epoch": 0.020599675568191728, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 2362 + }, + { + "epoch": 0.02060839685336031, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 2363 + }, + { + "epoch": 0.020617118138528893, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2364 + }, + { + "epoch": 0.020625839423697476, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 2365 + }, + { + "epoch": 0.02063456070886606, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0856, + "step": 2366 + }, + { + "epoch": 0.020643281994034642, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 2367 + }, + { + "epoch": 0.02065200327920322, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 2368 + }, + { + "epoch": 0.020660724564371805, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 2369 + }, + { + "epoch": 0.020669445849540388, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 2370 + }, + { + "epoch": 0.02067816713470897, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 2371 + }, + { + "epoch": 0.020686888419877553, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.1031, + "step": 2372 + }, + { + "epoch": 0.020695609705046136, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 2373 + }, + { + "epoch": 0.02070433099021472, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0923, + "step": 2374 + }, + { + "epoch": 0.020713052275383302, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 2375 + }, + { + "epoch": 0.02072177356055188, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 2376 + }, + { + "epoch": 0.020730494845720464, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 2377 + }, + { + "epoch": 0.020739216130889047, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 2378 + }, + { + "epoch": 0.02074793741605763, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 2379 + }, + { + "epoch": 0.020756658701226213, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0932, + "step": 2380 + }, + { + "epoch": 0.020765379986394796, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 2381 + }, + { + "epoch": 0.02077410127156338, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0766, + "step": 2382 + }, + { + "epoch": 0.02078282255673196, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 2383 + }, + { + "epoch": 0.02079154384190054, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 2384 + }, + { + "epoch": 0.020800265127069124, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 2385 + }, + { + "epoch": 0.020808986412237707, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 2386 + }, + { + "epoch": 0.02081770769740629, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 2387 + }, + { + "epoch": 0.020826428982574873, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 2388 + }, + { + "epoch": 0.020835150267743456, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 2389 + }, + { + "epoch": 0.020843871552912036, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.091, + "step": 2390 + }, + { + "epoch": 0.02085259283808062, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.1005, + "step": 2391 + }, + { + "epoch": 0.0208613141232492, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 2392 + }, + { + "epoch": 0.020870035408417784, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 2393 + }, + { + "epoch": 0.020878756693586367, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 2394 + }, + { + "epoch": 0.02088747797875495, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 2395 + }, + { + "epoch": 0.020896199263923533, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 2396 + }, + { + "epoch": 0.020904920549092113, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.1069, + "step": 2397 + }, + { + "epoch": 0.020913641834260695, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 2398 + }, + { + "epoch": 0.02092236311942928, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 2399 + }, + { + "epoch": 0.02093108440459786, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0944, + "step": 2400 + }, + { + "epoch": 0.020939805689766444, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 2401 + }, + { + "epoch": 0.020948526974935027, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 2402 + }, + { + "epoch": 0.02095724826010361, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0855, + "step": 2403 + }, + { + "epoch": 0.020965969545272193, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 2404 + }, + { + "epoch": 0.020974690830440772, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 2405 + }, + { + "epoch": 0.020983412115609355, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 2406 + }, + { + "epoch": 0.020992133400777938, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 2407 + }, + { + "epoch": 0.02100085468594652, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 2408 + }, + { + "epoch": 0.021009575971115104, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 2409 + }, + { + "epoch": 0.021018297256283687, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.1021, + "step": 2410 + }, + { + "epoch": 0.02102701854145227, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 2411 + }, + { + "epoch": 0.02103573982662085, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 2412 + }, + { + "epoch": 0.021044461111789432, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0825, + "step": 2413 + }, + { + "epoch": 0.021053182396958015, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0948, + "step": 2414 + }, + { + "epoch": 0.021061903682126598, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 2415 + }, + { + "epoch": 0.02107062496729518, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0891, + "step": 2416 + }, + { + "epoch": 0.021079346252463764, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0944, + "step": 2417 + }, + { + "epoch": 0.021088067537632347, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 2418 + }, + { + "epoch": 0.021096788822800926, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0779, + "step": 2419 + }, + { + "epoch": 0.02110551010796951, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 2420 + }, + { + "epoch": 0.021114231393138092, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 2421 + }, + { + "epoch": 0.021122952678306675, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 2422 + }, + { + "epoch": 0.021131673963475258, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0928, + "step": 2423 + }, + { + "epoch": 0.02114039524864384, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.1119, + "step": 2424 + }, + { + "epoch": 0.021149116533812424, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 2425 + }, + { + "epoch": 0.021157837818981003, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 2426 + }, + { + "epoch": 0.021166559104149586, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0859, + "step": 2427 + }, + { + "epoch": 0.02117528038931817, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 2428 + }, + { + "epoch": 0.021184001674486752, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 2429 + }, + { + "epoch": 0.021192722959655335, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 2430 + }, + { + "epoch": 0.021201444244823918, + "grad_norm": 0.345703125, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 2431 + }, + { + "epoch": 0.0212101655299925, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 2432 + }, + { + "epoch": 0.021218886815161084, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 2433 + }, + { + "epoch": 0.021227608100329663, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 2434 + }, + { + "epoch": 0.021236329385498246, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0801, + "step": 2435 + }, + { + "epoch": 0.02124505067066683, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0893, + "step": 2436 + }, + { + "epoch": 0.021253771955835412, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0859, + "step": 2437 + }, + { + "epoch": 0.021262493241003995, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 2438 + }, + { + "epoch": 0.021271214526172578, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 2439 + }, + { + "epoch": 0.02127993581134116, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0995, + "step": 2440 + }, + { + "epoch": 0.02128865709650974, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 2441 + }, + { + "epoch": 0.021297378381678323, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 2442 + }, + { + "epoch": 0.021306099666846906, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 2443 + }, + { + "epoch": 0.02131482095201549, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 2444 + }, + { + "epoch": 0.021323542237184072, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 2445 + }, + { + "epoch": 0.021332263522352655, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2446 + }, + { + "epoch": 0.021340984807521238, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 2447 + }, + { + "epoch": 0.021349706092689817, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0814, + "step": 2448 + }, + { + "epoch": 0.0213584273778584, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 2449 + }, + { + "epoch": 0.021367148663026983, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 2450 + }, + { + "epoch": 0.021375869948195566, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 2451 + }, + { + "epoch": 0.02138459123336415, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0921, + "step": 2452 + }, + { + "epoch": 0.021393312518532732, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 2453 + }, + { + "epoch": 0.021402033803701315, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 2454 + }, + { + "epoch": 0.021410755088869894, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0799, + "step": 2455 + }, + { + "epoch": 0.021419476374038477, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 2456 + }, + { + "epoch": 0.02142819765920706, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 2457 + }, + { + "epoch": 0.021436918944375643, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 2458 + }, + { + "epoch": 0.021445640229544226, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 2459 + }, + { + "epoch": 0.02145436151471281, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 2460 + }, + { + "epoch": 0.02146308279988139, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 2461 + }, + { + "epoch": 0.021471804085049975, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 2462 + }, + { + "epoch": 0.021480525370218554, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 2463 + }, + { + "epoch": 0.021489246655387137, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 2464 + }, + { + "epoch": 0.02149796794055572, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 2465 + }, + { + "epoch": 0.021506689225724303, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 2466 + }, + { + "epoch": 0.021515410510892886, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 2467 + }, + { + "epoch": 0.02152413179606147, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 2468 + }, + { + "epoch": 0.02153285308123005, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 2469 + }, + { + "epoch": 0.02154157436639863, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 2470 + }, + { + "epoch": 0.021550295651567214, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.1018, + "step": 2471 + }, + { + "epoch": 0.021559016936735797, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 2472 + }, + { + "epoch": 0.02156773822190438, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 2473 + }, + { + "epoch": 0.021576459507072963, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 2474 + }, + { + "epoch": 0.021585180792241546, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 2475 + }, + { + "epoch": 0.02159390207741013, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 2476 + }, + { + "epoch": 0.021602623362578708, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 2477 + }, + { + "epoch": 0.02161134464774729, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 2478 + }, + { + "epoch": 0.021620065932915874, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 2479 + }, + { + "epoch": 0.021628787218084457, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 2480 + }, + { + "epoch": 0.02163750850325304, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.1018, + "step": 2481 + }, + { + "epoch": 0.021646229788421623, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 2482 + }, + { + "epoch": 0.021654951073590206, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0777, + "step": 2483 + }, + { + "epoch": 0.021663672358758785, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 2484 + }, + { + "epoch": 0.021672393643927368, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 2485 + }, + { + "epoch": 0.02168111492909595, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0876, + "step": 2486 + }, + { + "epoch": 0.021689836214264534, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0891, + "step": 2487 + }, + { + "epoch": 0.021698557499433117, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0875, + "step": 2488 + }, + { + "epoch": 0.0217072787846017, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0801, + "step": 2489 + }, + { + "epoch": 0.021716000069770282, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 2490 + }, + { + "epoch": 0.021724721354938865, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 2491 + }, + { + "epoch": 0.021733442640107445, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 2492 + }, + { + "epoch": 0.021742163925276028, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 2493 + }, + { + "epoch": 0.02175088521044461, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 2494 + }, + { + "epoch": 0.021759606495613194, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 2495 + }, + { + "epoch": 0.021768327780781777, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 2496 + }, + { + "epoch": 0.02177704906595036, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 2497 + }, + { + "epoch": 0.021785770351118942, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2498 + }, + { + "epoch": 0.021794491636287522, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0862, + "step": 2499 + }, + { + "epoch": 0.021803212921456105, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 2500 + }, + { + "epoch": 0.021811934206624688, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.1137, + "step": 2501 + }, + { + "epoch": 0.02182065549179327, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 2502 + }, + { + "epoch": 0.021829376776961854, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 2503 + }, + { + "epoch": 0.021838098062130436, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 2504 + }, + { + "epoch": 0.02184681934729902, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 2505 + }, + { + "epoch": 0.0218555406324676, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.086, + "step": 2506 + }, + { + "epoch": 0.021864261917636182, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 2507 + }, + { + "epoch": 0.021872983202804765, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 2508 + }, + { + "epoch": 0.021881704487973348, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 2509 + }, + { + "epoch": 0.02189042577314193, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 2510 + }, + { + "epoch": 0.021899147058310513, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 2511 + }, + { + "epoch": 0.021907868343479096, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 2512 + }, + { + "epoch": 0.021916589628647676, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0798, + "step": 2513 + }, + { + "epoch": 0.02192531091381626, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 2514 + }, + { + "epoch": 0.02193403219898484, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 2515 + }, + { + "epoch": 0.021942753484153425, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 2516 + }, + { + "epoch": 0.021951474769322007, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 2517 + }, + { + "epoch": 0.02196019605449059, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 2518 + }, + { + "epoch": 0.021968917339659173, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 2519 + }, + { + "epoch": 0.021977638624827756, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 2520 + }, + { + "epoch": 0.021986359909996336, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 2521 + }, + { + "epoch": 0.02199508119516492, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0893, + "step": 2522 + }, + { + "epoch": 0.0220038024803335, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 2523 + }, + { + "epoch": 0.022012523765502084, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 2524 + }, + { + "epoch": 0.022021245050670667, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 2525 + }, + { + "epoch": 0.02202996633583925, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0886, + "step": 2526 + }, + { + "epoch": 0.022038687621007833, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 2527 + }, + { + "epoch": 0.022047408906176413, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 2528 + }, + { + "epoch": 0.022056130191344996, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 2529 + }, + { + "epoch": 0.02206485147651358, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 2530 + }, + { + "epoch": 0.02207357276168216, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 2531 + }, + { + "epoch": 0.022082294046850744, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 2532 + }, + { + "epoch": 0.022091015332019327, + "grad_norm": 0.326171875, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 2533 + }, + { + "epoch": 0.02209973661718791, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 2534 + }, + { + "epoch": 0.02210845790235649, + "grad_norm": 0.3828125, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 2535 + }, + { + "epoch": 0.022117179187525073, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 2536 + }, + { + "epoch": 0.022125900472693655, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.093, + "step": 2537 + }, + { + "epoch": 0.02213462175786224, + "grad_norm": 0.359375, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 2538 + }, + { + "epoch": 0.02214334304303082, + "grad_norm": 0.49609375, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 2539 + }, + { + "epoch": 0.022152064328199404, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 2540 + }, + { + "epoch": 0.022160785613367987, + "grad_norm": 0.55859375, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 2541 + }, + { + "epoch": 0.02216950689853657, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 2542 + }, + { + "epoch": 0.02217822818370515, + "grad_norm": 0.7734375, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 2543 + }, + { + "epoch": 0.022186949468873732, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 2544 + }, + { + "epoch": 0.022195670754042315, + "grad_norm": 0.609375, + "learning_rate": 0.0005, + "loss": 1.0936, + "step": 2545 + }, + { + "epoch": 0.0222043920392109, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.0863, + "step": 2546 + }, + { + "epoch": 0.02221311332437948, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 2547 + }, + { + "epoch": 0.022221834609548064, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 2548 + }, + { + "epoch": 0.022230555894716647, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 2549 + }, + { + "epoch": 0.022239277179885227, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 2550 + }, + { + "epoch": 0.02224799846505381, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 2551 + }, + { + "epoch": 0.022256719750222392, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0861, + "step": 2552 + }, + { + "epoch": 0.022265441035390975, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 2553 + }, + { + "epoch": 0.022274162320559558, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 2554 + }, + { + "epoch": 0.02228288360572814, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 2555 + }, + { + "epoch": 0.022291604890896724, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 2556 + }, + { + "epoch": 0.022300326176065303, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0805, + "step": 2557 + }, + { + "epoch": 0.022309047461233886, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0749, + "step": 2558 + }, + { + "epoch": 0.02231776874640247, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 2559 + }, + { + "epoch": 0.022326490031571052, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 2560 + }, + { + "epoch": 0.022335211316739635, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0909, + "step": 2561 + }, + { + "epoch": 0.022343932601908218, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2562 + }, + { + "epoch": 0.0223526538870768, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 2563 + }, + { + "epoch": 0.02236137517224538, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 2564 + }, + { + "epoch": 0.022370096457413963, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 2565 + }, + { + "epoch": 0.022378817742582546, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 2566 + }, + { + "epoch": 0.02238753902775113, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 2567 + }, + { + "epoch": 0.022396260312919712, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0839, + "step": 2568 + }, + { + "epoch": 0.022404981598088295, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 2569 + }, + { + "epoch": 0.022413702883256878, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 2570 + }, + { + "epoch": 0.02242242416842546, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 2571 + }, + { + "epoch": 0.02243114545359404, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 2572 + }, + { + "epoch": 0.022439866738762623, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 2573 + }, + { + "epoch": 0.022448588023931206, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 2574 + }, + { + "epoch": 0.02245730930909979, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 2575 + }, + { + "epoch": 0.022466030594268372, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 2576 + }, + { + "epoch": 0.022474751879436955, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 2577 + }, + { + "epoch": 0.022483473164605538, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 2578 + }, + { + "epoch": 0.022492194449774117, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0846, + "step": 2579 + }, + { + "epoch": 0.0225009157349427, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 2580 + }, + { + "epoch": 0.022509637020111283, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 2581 + }, + { + "epoch": 0.022518358305279866, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 2582 + }, + { + "epoch": 0.02252707959044845, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 2583 + }, + { + "epoch": 0.022535800875617032, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 2584 + }, + { + "epoch": 0.022544522160785615, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0855, + "step": 2585 + }, + { + "epoch": 0.022553243445954194, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 2586 + }, + { + "epoch": 0.022561964731122777, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 2587 + }, + { + "epoch": 0.02257068601629136, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2588 + }, + { + "epoch": 0.022579407301459943, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 2589 + }, + { + "epoch": 0.022588128586628526, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0791, + "step": 2590 + }, + { + "epoch": 0.02259684987179711, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 2591 + }, + { + "epoch": 0.022605571156965692, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 2592 + }, + { + "epoch": 0.02261429244213427, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0854, + "step": 2593 + }, + { + "epoch": 0.022623013727302854, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0975, + "step": 2594 + }, + { + "epoch": 0.022631735012471437, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 2595 + }, + { + "epoch": 0.02264045629764002, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0709, + "step": 2596 + }, + { + "epoch": 0.022649177582808603, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2597 + }, + { + "epoch": 0.022657898867977186, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 2598 + }, + { + "epoch": 0.02266662015314577, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 2599 + }, + { + "epoch": 0.02267534143831435, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 2600 + }, + { + "epoch": 0.02268406272348293, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 2601 + }, + { + "epoch": 0.022692784008651514, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0881, + "step": 2602 + }, + { + "epoch": 0.022701505293820097, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0945, + "step": 2603 + }, + { + "epoch": 0.02271022657898868, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 2604 + }, + { + "epoch": 0.022718947864157263, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 2605 + }, + { + "epoch": 0.022727669149325846, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0867, + "step": 2606 + }, + { + "epoch": 0.02273639043449443, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 2607 + }, + { + "epoch": 0.022745111719663008, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 2608 + }, + { + "epoch": 0.02275383300483159, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 2609 + }, + { + "epoch": 0.022762554290000174, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 2610 + }, + { + "epoch": 0.022771275575168757, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 2611 + }, + { + "epoch": 0.02277999686033734, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0981, + "step": 2612 + }, + { + "epoch": 0.022788718145505923, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0809, + "step": 2613 + }, + { + "epoch": 0.022797439430674506, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0907, + "step": 2614 + }, + { + "epoch": 0.022806160715843085, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 2615 + }, + { + "epoch": 0.022814882001011668, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.1055, + "step": 2616 + }, + { + "epoch": 0.02282360328618025, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 2617 + }, + { + "epoch": 0.022832324571348834, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 2618 + }, + { + "epoch": 0.022841045856517417, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0973, + "step": 2619 + }, + { + "epoch": 0.022849767141686, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 2620 + }, + { + "epoch": 0.022858488426854583, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 2621 + }, + { + "epoch": 0.022867209712023162, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2622 + }, + { + "epoch": 0.022875930997191745, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 2623 + }, + { + "epoch": 0.022884652282360328, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 2624 + }, + { + "epoch": 0.02289337356752891, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 2625 + }, + { + "epoch": 0.022902094852697494, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 2626 + }, + { + "epoch": 0.022910816137866077, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.092, + "step": 2627 + }, + { + "epoch": 0.02291953742303466, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0854, + "step": 2628 + }, + { + "epoch": 0.022928258708203243, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.1022, + "step": 2629 + }, + { + "epoch": 0.022936979993371822, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 2630 + }, + { + "epoch": 0.022945701278540405, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 2631 + }, + { + "epoch": 0.022954422563708988, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 2632 + }, + { + "epoch": 0.02296314384887757, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 2633 + }, + { + "epoch": 0.022971865134046154, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0825, + "step": 2634 + }, + { + "epoch": 0.022980586419214737, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 2635 + }, + { + "epoch": 0.02298930770438332, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 2636 + }, + { + "epoch": 0.0229980289895519, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 2637 + }, + { + "epoch": 0.023006750274720482, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0901, + "step": 2638 + }, + { + "epoch": 0.023015471559889065, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 2639 + }, + { + "epoch": 0.023024192845057648, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.084, + "step": 2640 + }, + { + "epoch": 0.02303291413022623, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 2641 + }, + { + "epoch": 0.023041635415394814, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 2642 + }, + { + "epoch": 0.023050356700563397, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 2643 + }, + { + "epoch": 0.023059077985731976, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 2644 + }, + { + "epoch": 0.02306779927090056, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 2645 + }, + { + "epoch": 0.023076520556069142, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 2646 + }, + { + "epoch": 0.023085241841237725, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 2647 + }, + { + "epoch": 0.023093963126406308, + "grad_norm": 0.376953125, + "learning_rate": 0.0005, + "loss": 1.0944, + "step": 2648 + }, + { + "epoch": 0.02310268441157489, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 2649 + }, + { + "epoch": 0.023111405696743473, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0843, + "step": 2650 + }, + { + "epoch": 0.023120126981912053, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 2651 + }, + { + "epoch": 0.023128848267080636, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 2652 + }, + { + "epoch": 0.02313756955224922, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0846, + "step": 2653 + }, + { + "epoch": 0.0231462908374178, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 2654 + }, + { + "epoch": 0.023155012122586385, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.1067, + "step": 2655 + }, + { + "epoch": 0.023163733407754968, + "grad_norm": 0.439453125, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 2656 + }, + { + "epoch": 0.02317245469292355, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 2657 + }, + { + "epoch": 0.023181175978092133, + "grad_norm": 0.671875, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 2658 + }, + { + "epoch": 0.023189897263260713, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 2659 + }, + { + "epoch": 0.023198618548429296, + "grad_norm": 0.62109375, + "learning_rate": 0.0005, + "loss": 1.0959, + "step": 2660 + }, + { + "epoch": 0.02320733983359788, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 2661 + }, + { + "epoch": 0.02321606111876646, + "grad_norm": 0.447265625, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 2662 + }, + { + "epoch": 0.023224782403935045, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 2663 + }, + { + "epoch": 0.023233503689103627, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 2664 + }, + { + "epoch": 0.02324222497427221, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 2665 + }, + { + "epoch": 0.02325094625944079, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 2666 + }, + { + "epoch": 0.023259667544609373, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 2667 + }, + { + "epoch": 0.023268388829777956, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 2668 + }, + { + "epoch": 0.02327711011494654, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 2669 + }, + { + "epoch": 0.02328583140011512, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2670 + }, + { + "epoch": 0.023294552685283704, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 2671 + }, + { + "epoch": 0.023303273970452287, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 2672 + }, + { + "epoch": 0.023311995255620867, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0779, + "step": 2673 + }, + { + "epoch": 0.02332071654078945, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0914, + "step": 2674 + }, + { + "epoch": 0.023329437825958033, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0943, + "step": 2675 + }, + { + "epoch": 0.023338159111126616, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 2676 + }, + { + "epoch": 0.0233468803962952, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 2677 + }, + { + "epoch": 0.02335560168146378, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 2678 + }, + { + "epoch": 0.023364322966632364, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 2679 + }, + { + "epoch": 0.023373044251800944, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 2680 + }, + { + "epoch": 0.023381765536969527, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 2681 + }, + { + "epoch": 0.02339048682213811, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 2682 + }, + { + "epoch": 0.023399208107306693, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 2683 + }, + { + "epoch": 0.023407929392475275, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0822, + "step": 2684 + }, + { + "epoch": 0.02341665067764386, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 2685 + }, + { + "epoch": 0.02342537196281244, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 2686 + }, + { + "epoch": 0.023434093247981024, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 2687 + }, + { + "epoch": 0.023442814533149604, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 2688 + }, + { + "epoch": 0.023451535818318187, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 2689 + }, + { + "epoch": 0.02346025710348677, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 2690 + }, + { + "epoch": 0.023468978388655352, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 2691 + }, + { + "epoch": 0.023477699673823935, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 2692 + }, + { + "epoch": 0.023486420958992518, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 2693 + }, + { + "epoch": 0.0234951422441611, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0766, + "step": 2694 + }, + { + "epoch": 0.02350386352932968, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 2695 + }, + { + "epoch": 0.023512584814498264, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 2696 + }, + { + "epoch": 0.023521306099666846, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 2697 + }, + { + "epoch": 0.02353002738483543, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0873, + "step": 2698 + }, + { + "epoch": 0.023538748670004012, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 2699 + }, + { + "epoch": 0.023547469955172595, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 2700 + }, + { + "epoch": 0.023556191240341178, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 2701 + }, + { + "epoch": 0.023564912525509758, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0709, + "step": 2702 + }, + { + "epoch": 0.02357363381067834, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.084, + "step": 2703 + }, + { + "epoch": 0.023582355095846923, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 2704 + }, + { + "epoch": 0.023591076381015506, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 2705 + }, + { + "epoch": 0.02359979766618409, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 2706 + }, + { + "epoch": 0.023608518951352672, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2707 + }, + { + "epoch": 0.023617240236521255, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 2708 + }, + { + "epoch": 0.023625961521689835, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 2709 + }, + { + "epoch": 0.023634682806858417, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 2710 + }, + { + "epoch": 0.023643404092027, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 2711 + }, + { + "epoch": 0.023652125377195583, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 2712 + }, + { + "epoch": 0.023660846662364166, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0888, + "step": 2713 + }, + { + "epoch": 0.02366956794753275, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0963, + "step": 2714 + }, + { + "epoch": 0.023678289232701332, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 2715 + }, + { + "epoch": 0.023687010517869915, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 2716 + }, + { + "epoch": 0.023695731803038494, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 2717 + }, + { + "epoch": 0.023704453088207077, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 2718 + }, + { + "epoch": 0.02371317437337566, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 2719 + }, + { + "epoch": 0.023721895658544243, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 2720 + }, + { + "epoch": 0.023730616943712826, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0909, + "step": 2721 + }, + { + "epoch": 0.02373933822888141, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 2722 + }, + { + "epoch": 0.023748059514049992, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 2723 + }, + { + "epoch": 0.02375678079921857, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 2724 + }, + { + "epoch": 0.023765502084387154, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 2725 + }, + { + "epoch": 0.023774223369555737, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0826, + "step": 2726 + }, + { + "epoch": 0.02378294465472432, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 2727 + }, + { + "epoch": 0.023791665939892903, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2728 + }, + { + "epoch": 0.023800387225061486, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 2729 + }, + { + "epoch": 0.02380910851023007, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0844, + "step": 2730 + }, + { + "epoch": 0.02381782979539865, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.1003, + "step": 2731 + }, + { + "epoch": 0.02382655108056723, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 2732 + }, + { + "epoch": 0.023835272365735814, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 2733 + }, + { + "epoch": 0.023843993650904397, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 2734 + }, + { + "epoch": 0.02385271493607298, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0806, + "step": 2735 + }, + { + "epoch": 0.023861436221241563, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0853, + "step": 2736 + }, + { + "epoch": 0.023870157506410146, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 2737 + }, + { + "epoch": 0.023878878791578725, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 2738 + }, + { + "epoch": 0.02388760007674731, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 2739 + }, + { + "epoch": 0.02389632136191589, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0822, + "step": 2740 + }, + { + "epoch": 0.023905042647084474, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 2741 + }, + { + "epoch": 0.023913763932253057, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 2742 + }, + { + "epoch": 0.02392248521742164, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0805, + "step": 2743 + }, + { + "epoch": 0.023931206502590223, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0709, + "step": 2744 + }, + { + "epoch": 0.023939927787758806, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0964, + "step": 2745 + }, + { + "epoch": 0.023948649072927385, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 2746 + }, + { + "epoch": 0.023957370358095968, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.1064, + "step": 2747 + }, + { + "epoch": 0.02396609164326455, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 2748 + }, + { + "epoch": 0.023974812928433134, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 2749 + }, + { + "epoch": 0.023983534213601717, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0912, + "step": 2750 + }, + { + "epoch": 0.0239922554987703, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 2751 + }, + { + "epoch": 0.024000976783938883, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 2752 + }, + { + "epoch": 0.024009698069107462, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 2753 + }, + { + "epoch": 0.024018419354276045, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 2754 + }, + { + "epoch": 0.024027140639444628, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 2755 + }, + { + "epoch": 0.02403586192461321, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 2756 + }, + { + "epoch": 0.024044583209781794, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 2757 + }, + { + "epoch": 0.024053304494950377, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 2758 + }, + { + "epoch": 0.02406202578011896, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 2759 + }, + { + "epoch": 0.02407074706528754, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 2760 + }, + { + "epoch": 0.024079468350456122, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 2761 + }, + { + "epoch": 0.024088189635624705, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 2762 + }, + { + "epoch": 0.024096910920793288, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 2763 + }, + { + "epoch": 0.02410563220596187, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 2764 + }, + { + "epoch": 0.024114353491130454, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 2765 + }, + { + "epoch": 0.024123074776299037, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0819, + "step": 2766 + }, + { + "epoch": 0.024131796061467616, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0873, + "step": 2767 + }, + { + "epoch": 0.0241405173466362, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 2768 + }, + { + "epoch": 0.024149238631804782, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 2769 + }, + { + "epoch": 0.024157959916973365, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 2770 + }, + { + "epoch": 0.024166681202141948, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0933, + "step": 2771 + }, + { + "epoch": 0.02417540248731053, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 2772 + }, + { + "epoch": 0.024184123772479114, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 2773 + }, + { + "epoch": 0.024192845057647697, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 2774 + }, + { + "epoch": 0.024201566342816276, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0814, + "step": 2775 + }, + { + "epoch": 0.02421028762798486, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 2776 + }, + { + "epoch": 0.024219008913153442, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2777 + }, + { + "epoch": 0.024227730198322025, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 2778 + }, + { + "epoch": 0.024236451483490608, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 2779 + }, + { + "epoch": 0.02424517276865919, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 2780 + }, + { + "epoch": 0.024253894053827774, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 2781 + }, + { + "epoch": 0.024262615338996353, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0787, + "step": 2782 + }, + { + "epoch": 0.024271336624164936, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 2783 + }, + { + "epoch": 0.02428005790933352, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 2784 + }, + { + "epoch": 0.024288779194502102, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 2785 + }, + { + "epoch": 0.024297500479670685, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 2786 + }, + { + "epoch": 0.024306221764839268, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 2787 + }, + { + "epoch": 0.02431494305000785, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 2788 + }, + { + "epoch": 0.02432366433517643, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 2789 + }, + { + "epoch": 0.024332385620345013, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 2790 + }, + { + "epoch": 0.024341106905513596, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0836, + "step": 2791 + }, + { + "epoch": 0.02434982819068218, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 2792 + }, + { + "epoch": 0.024358549475850762, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 2793 + }, + { + "epoch": 0.024367270761019345, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 2794 + }, + { + "epoch": 0.024375992046187928, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 2795 + }, + { + "epoch": 0.024384713331356507, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 2796 + }, + { + "epoch": 0.02439343461652509, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 2797 + }, + { + "epoch": 0.024402155901693673, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 2798 + }, + { + "epoch": 0.024410877186862256, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 2799 + }, + { + "epoch": 0.02441959847203084, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 2800 + }, + { + "epoch": 0.02442831975719942, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 2801 + }, + { + "epoch": 0.024437041042368005, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 2802 + }, + { + "epoch": 0.024445762327536587, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 2803 + }, + { + "epoch": 0.024454483612705167, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 2804 + }, + { + "epoch": 0.02446320489787375, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 2805 + }, + { + "epoch": 0.024471926183042333, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 2806 + }, + { + "epoch": 0.024480647468210916, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 2807 + }, + { + "epoch": 0.0244893687533795, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 2808 + }, + { + "epoch": 0.02449809003854808, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 2809 + }, + { + "epoch": 0.024506811323716664, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0709, + "step": 2810 + }, + { + "epoch": 0.024515532608885244, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2811 + }, + { + "epoch": 0.024524253894053827, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 2812 + }, + { + "epoch": 0.02453297517922241, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 2813 + }, + { + "epoch": 0.024541696464390993, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 2814 + }, + { + "epoch": 0.024550417749559576, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2815 + }, + { + "epoch": 0.02455913903472816, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 2816 + }, + { + "epoch": 0.02456786031989674, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 2817 + }, + { + "epoch": 0.02457658160506532, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0829, + "step": 2818 + }, + { + "epoch": 0.024585302890233904, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 2819 + }, + { + "epoch": 0.024594024175402487, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 2820 + }, + { + "epoch": 0.02460274546057107, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 2821 + }, + { + "epoch": 0.024611466745739653, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 2822 + }, + { + "epoch": 0.024620188030908235, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 2823 + }, + { + "epoch": 0.02462890931607682, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 2824 + }, + { + "epoch": 0.024637630601245398, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 2825 + }, + { + "epoch": 0.02464635188641398, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 2826 + }, + { + "epoch": 0.024655073171582564, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0942, + "step": 2827 + }, + { + "epoch": 0.024663794456751147, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 2828 + }, + { + "epoch": 0.02467251574191973, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0825, + "step": 2829 + }, + { + "epoch": 0.024681237027088312, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 2830 + }, + { + "epoch": 0.024689958312256895, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 2831 + }, + { + "epoch": 0.02469867959742548, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2832 + }, + { + "epoch": 0.024707400882594058, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 2833 + }, + { + "epoch": 0.02471612216776264, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 2834 + }, + { + "epoch": 0.024724843452931224, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0804, + "step": 2835 + }, + { + "epoch": 0.024733564738099807, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 2836 + }, + { + "epoch": 0.02474228602326839, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 2837 + }, + { + "epoch": 0.024751007308436972, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0863, + "step": 2838 + }, + { + "epoch": 0.024759728593605555, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 2839 + }, + { + "epoch": 0.024768449878774135, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 2840 + }, + { + "epoch": 0.024777171163942718, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2841 + }, + { + "epoch": 0.0247858924491113, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 2842 + }, + { + "epoch": 0.024794613734279883, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 2843 + }, + { + "epoch": 0.024803335019448466, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 2844 + }, + { + "epoch": 0.02481205630461705, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 2845 + }, + { + "epoch": 0.024820777589785632, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 2846 + }, + { + "epoch": 0.02482949887495421, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2847 + }, + { + "epoch": 0.024838220160122795, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0779, + "step": 2848 + }, + { + "epoch": 0.024846941445291378, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 2849 + }, + { + "epoch": 0.02485566273045996, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 2850 + }, + { + "epoch": 0.024864384015628543, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 2851 + }, + { + "epoch": 0.024873105300797126, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 2852 + }, + { + "epoch": 0.02488182658596571, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 2853 + }, + { + "epoch": 0.02489054787113429, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 2854 + }, + { + "epoch": 0.02489926915630287, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0891, + "step": 2855 + }, + { + "epoch": 0.024907990441471455, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 2856 + }, + { + "epoch": 0.024916711726640037, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 2857 + }, + { + "epoch": 0.02492543301180862, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 2858 + }, + { + "epoch": 0.024934154296977203, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 2859 + }, + { + "epoch": 0.024942875582145786, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0809, + "step": 2860 + }, + { + "epoch": 0.02495159686731437, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.1, + "step": 2861 + }, + { + "epoch": 0.02496031815248295, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0893, + "step": 2862 + }, + { + "epoch": 0.02496903943765153, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 2863 + }, + { + "epoch": 0.024977760722820114, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 2864 + }, + { + "epoch": 0.024986482007988697, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 2865 + }, + { + "epoch": 0.02499520329315728, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 2866 + }, + { + "epoch": 0.025003924578325863, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 2867 + }, + { + "epoch": 0.025012645863494446, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 2868 + }, + { + "epoch": 0.025021367148663026, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 2869 + }, + { + "epoch": 0.02503008843383161, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0856, + "step": 2870 + }, + { + "epoch": 0.02503880971900019, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 2871 + }, + { + "epoch": 0.025047531004168774, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 2872 + }, + { + "epoch": 0.025056252289337357, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 2873 + }, + { + "epoch": 0.02506497357450594, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 2874 + }, + { + "epoch": 0.025073694859674523, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0799, + "step": 2875 + }, + { + "epoch": 0.025082416144843103, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 2876 + }, + { + "epoch": 0.025091137430011685, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.104, + "step": 2877 + }, + { + "epoch": 0.02509985871518027, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 2878 + }, + { + "epoch": 0.02510858000034885, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 2879 + }, + { + "epoch": 0.025117301285517434, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 2880 + }, + { + "epoch": 0.025126022570686017, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 2881 + }, + { + "epoch": 0.0251347438558546, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 2882 + }, + { + "epoch": 0.02514346514102318, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 2883 + }, + { + "epoch": 0.025152186426191762, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.096, + "step": 2884 + }, + { + "epoch": 0.025160907711360345, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0868, + "step": 2885 + }, + { + "epoch": 0.02516962899652893, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2886 + }, + { + "epoch": 0.02517835028169751, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 2887 + }, + { + "epoch": 0.025187071566866094, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 2888 + }, + { + "epoch": 0.025195792852034677, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 2889 + }, + { + "epoch": 0.02520451413720326, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 2890 + }, + { + "epoch": 0.02521323542237184, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2891 + }, + { + "epoch": 0.025221956707540422, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 2892 + }, + { + "epoch": 0.025230677992709005, + "grad_norm": 0.359375, + "learning_rate": 0.0005, + "loss": 1.0775, + "step": 2893 + }, + { + "epoch": 0.025239399277877588, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 2894 + }, + { + "epoch": 0.02524812056304617, + "grad_norm": 0.4296875, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 2895 + }, + { + "epoch": 0.025256841848214754, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 2896 + }, + { + "epoch": 0.025265563133383337, + "grad_norm": 0.384765625, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 2897 + }, + { + "epoch": 0.025274284418551916, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 2898 + }, + { + "epoch": 0.0252830057037205, + "grad_norm": 0.421875, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 2899 + }, + { + "epoch": 0.025291726988889082, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 2900 + }, + { + "epoch": 0.025300448274057665, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 2901 + }, + { + "epoch": 0.025309169559226248, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 2902 + }, + { + "epoch": 0.02531789084439483, + "grad_norm": 0.515625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2903 + }, + { + "epoch": 0.025326612129563414, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 2904 + }, + { + "epoch": 0.025335333414731993, + "grad_norm": 0.5390625, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 2905 + }, + { + "epoch": 0.025344054699900576, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 2906 + }, + { + "epoch": 0.02535277598506916, + "grad_norm": 0.44140625, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 2907 + }, + { + "epoch": 0.025361497270237742, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 2908 + }, + { + "epoch": 0.025370218555406325, + "grad_norm": 0.5390625, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 2909 + }, + { + "epoch": 0.025378939840574908, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 2910 + }, + { + "epoch": 0.02538766112574349, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.0843, + "step": 2911 + }, + { + "epoch": 0.02539638241091207, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 2912 + }, + { + "epoch": 0.025405103696080653, + "grad_norm": 0.640625, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 2913 + }, + { + "epoch": 0.025413824981249236, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0839, + "step": 2914 + }, + { + "epoch": 0.02542254626641782, + "grad_norm": 0.515625, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 2915 + }, + { + "epoch": 0.025431267551586402, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 2916 + }, + { + "epoch": 0.025439988836754985, + "grad_norm": 0.5390625, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 2917 + }, + { + "epoch": 0.025448710121923568, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 2918 + }, + { + "epoch": 0.02545743140709215, + "grad_norm": 0.404296875, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 2919 + }, + { + "epoch": 0.02546615269226073, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 2920 + }, + { + "epoch": 0.025474873977429313, + "grad_norm": 0.404296875, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 2921 + }, + { + "epoch": 0.025483595262597896, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 2922 + }, + { + "epoch": 0.02549231654776648, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 2923 + }, + { + "epoch": 0.025501037832935062, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 2924 + }, + { + "epoch": 0.025509759118103645, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2925 + }, + { + "epoch": 0.025518480403272228, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 2926 + }, + { + "epoch": 0.025527201688440807, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0938, + "step": 2927 + }, + { + "epoch": 0.02553592297360939, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 2928 + }, + { + "epoch": 0.025544644258777973, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 2929 + }, + { + "epoch": 0.025553365543946556, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 2930 + }, + { + "epoch": 0.02556208682911514, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 2931 + }, + { + "epoch": 0.025570808114283722, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 2932 + }, + { + "epoch": 0.025579529399452305, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 2933 + }, + { + "epoch": 0.025588250684620884, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 2934 + }, + { + "epoch": 0.025596971969789467, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 2935 + }, + { + "epoch": 0.02560569325495805, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 2936 + }, + { + "epoch": 0.025614414540126633, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 2937 + }, + { + "epoch": 0.025623135825295216, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0826, + "step": 2938 + }, + { + "epoch": 0.0256318571104638, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 2939 + }, + { + "epoch": 0.02564057839563238, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 2940 + }, + { + "epoch": 0.02564929968080096, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 2941 + }, + { + "epoch": 0.025658020965969544, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 2942 + }, + { + "epoch": 0.025666742251138127, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 2943 + }, + { + "epoch": 0.02567546353630671, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 2944 + }, + { + "epoch": 0.025684184821475293, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 2945 + }, + { + "epoch": 0.025692906106643876, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 2946 + }, + { + "epoch": 0.02570162739181246, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 2947 + }, + { + "epoch": 0.02571034867698104, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 2948 + }, + { + "epoch": 0.02571906996214962, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 2949 + }, + { + "epoch": 0.025727791247318204, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 2950 + }, + { + "epoch": 0.025736512532486787, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 2951 + }, + { + "epoch": 0.02574523381765537, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0861, + "step": 2952 + }, + { + "epoch": 0.025753955102823953, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 2953 + }, + { + "epoch": 0.025762676387992536, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 2954 + }, + { + "epoch": 0.02577139767316112, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 2955 + }, + { + "epoch": 0.025780118958329698, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 2956 + }, + { + "epoch": 0.02578884024349828, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 2957 + }, + { + "epoch": 0.025797561528666864, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 2958 + }, + { + "epoch": 0.025806282813835447, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 2959 + }, + { + "epoch": 0.02581500409900403, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 2960 + }, + { + "epoch": 0.025823725384172613, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 2961 + }, + { + "epoch": 0.025832446669341196, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 2962 + }, + { + "epoch": 0.025841167954509775, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 2963 + }, + { + "epoch": 0.025849889239678358, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 2964 + }, + { + "epoch": 0.02585861052484694, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 2965 + }, + { + "epoch": 0.025867331810015524, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 2966 + }, + { + "epoch": 0.025876053095184107, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2967 + }, + { + "epoch": 0.02588477438035269, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 2968 + }, + { + "epoch": 0.025893495665521273, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 2969 + }, + { + "epoch": 0.025902216950689852, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 2970 + }, + { + "epoch": 0.025910938235858435, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 2971 + }, + { + "epoch": 0.025919659521027018, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 2972 + }, + { + "epoch": 0.0259283808061956, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 2973 + }, + { + "epoch": 0.025937102091364184, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 2974 + }, + { + "epoch": 0.025945823376532767, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 2975 + }, + { + "epoch": 0.02595454466170135, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 2976 + }, + { + "epoch": 0.025963265946869932, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2977 + }, + { + "epoch": 0.025971987232038512, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 2978 + }, + { + "epoch": 0.025980708517207095, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 2979 + }, + { + "epoch": 0.025989429802375678, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 2980 + }, + { + "epoch": 0.02599815108754426, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.082, + "step": 2981 + }, + { + "epoch": 0.026006872372712844, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 2982 + }, + { + "epoch": 0.026015593657881426, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 2983 + }, + { + "epoch": 0.02602431494305001, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 2984 + }, + { + "epoch": 0.02603303622821859, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 2985 + }, + { + "epoch": 0.026041757513387172, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 2986 + }, + { + "epoch": 0.026050478798555755, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 2987 + }, + { + "epoch": 0.026059200083724338, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2988 + }, + { + "epoch": 0.02606792136889292, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 2989 + }, + { + "epoch": 0.026076642654061503, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 2990 + }, + { + "epoch": 0.026085363939230086, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 2991 + }, + { + "epoch": 0.026094085224398666, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 2992 + }, + { + "epoch": 0.02610280650956725, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 2993 + }, + { + "epoch": 0.02611152779473583, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2994 + }, + { + "epoch": 0.026120249079904415, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2995 + }, + { + "epoch": 0.026128970365072998, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0939, + "step": 2996 + }, + { + "epoch": 0.02613769165024158, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 2997 + }, + { + "epoch": 0.026146412935410163, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2998 + }, + { + "epoch": 0.026155134220578743, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.1108, + "step": 2999 + }, + { + "epoch": 0.026163855505747326, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 3000 + }, + { + "epoch": 0.02617257679091591, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 3001 + }, + { + "epoch": 0.02618129807608449, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 3002 + }, + { + "epoch": 0.026190019361253074, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 3003 + }, + { + "epoch": 0.026198740646421657, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 3004 + }, + { + "epoch": 0.02620746193159024, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 3005 + }, + { + "epoch": 0.026216183216758823, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 3006 + }, + { + "epoch": 0.026224904501927403, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 3007 + }, + { + "epoch": 0.026233625787095986, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 3008 + }, + { + "epoch": 0.02624234707226457, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 3009 + }, + { + "epoch": 0.02625106835743315, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0809, + "step": 3010 + }, + { + "epoch": 0.026259789642601734, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 3011 + }, + { + "epoch": 0.026268510927770317, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 3012 + }, + { + "epoch": 0.0262772322129389, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 3013 + }, + { + "epoch": 0.02628595349810748, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0939, + "step": 3014 + }, + { + "epoch": 0.026294674783276063, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 3015 + }, + { + "epoch": 0.026303396068444646, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3016 + }, + { + "epoch": 0.02631211735361323, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 3017 + }, + { + "epoch": 0.02632083863878181, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 3018 + }, + { + "epoch": 0.026329559923950394, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0905, + "step": 3019 + }, + { + "epoch": 0.026338281209118977, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 3020 + }, + { + "epoch": 0.026347002494287557, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 3021 + }, + { + "epoch": 0.02635572377945614, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 3022 + }, + { + "epoch": 0.026364445064624722, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 3023 + }, + { + "epoch": 0.026373166349793305, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3024 + }, + { + "epoch": 0.02638188763496189, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 3025 + }, + { + "epoch": 0.02639060892013047, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0861, + "step": 3026 + }, + { + "epoch": 0.026399330205299054, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 3027 + }, + { + "epoch": 0.026408051490467634, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 3028 + }, + { + "epoch": 0.026416772775636217, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3029 + }, + { + "epoch": 0.0264254940608048, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 3030 + }, + { + "epoch": 0.026434215345973382, + "grad_norm": 0.37109375, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 3031 + }, + { + "epoch": 0.026442936631141965, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 3032 + }, + { + "epoch": 0.026451657916310548, + "grad_norm": 0.396484375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3033 + }, + { + "epoch": 0.02646037920147913, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 3034 + }, + { + "epoch": 0.026469100486647714, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 3035 + }, + { + "epoch": 0.026477821771816294, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3036 + }, + { + "epoch": 0.026486543056984876, + "grad_norm": 0.435546875, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3037 + }, + { + "epoch": 0.02649526434215346, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 3038 + }, + { + "epoch": 0.026503985627322042, + "grad_norm": 0.388671875, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 3039 + }, + { + "epoch": 0.026512706912490625, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 3040 + }, + { + "epoch": 0.026521428197659208, + "grad_norm": 0.37890625, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 3041 + }, + { + "epoch": 0.02653014948282779, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0866, + "step": 3042 + }, + { + "epoch": 0.02653887076799637, + "grad_norm": 0.408203125, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 3043 + }, + { + "epoch": 0.026547592053164953, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 3044 + }, + { + "epoch": 0.026556313338333536, + "grad_norm": 0.45703125, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 3045 + }, + { + "epoch": 0.02656503462350212, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 3046 + }, + { + "epoch": 0.026573755908670702, + "grad_norm": 0.69140625, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 3047 + }, + { + "epoch": 0.026582477193839285, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0937, + "step": 3048 + }, + { + "epoch": 0.026591198479007868, + "grad_norm": 0.494140625, + "learning_rate": 0.0005, + "loss": 1.0799, + "step": 3049 + }, + { + "epoch": 0.026599919764176447, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 3050 + }, + { + "epoch": 0.02660864104934503, + "grad_norm": 0.47265625, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 3051 + }, + { + "epoch": 0.026617362334513613, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 3052 + }, + { + "epoch": 0.026626083619682196, + "grad_norm": 0.609375, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 3053 + }, + { + "epoch": 0.02663480490485078, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 3054 + }, + { + "epoch": 0.026643526190019362, + "grad_norm": 0.71875, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 3055 + }, + { + "epoch": 0.026652247475187945, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 3056 + }, + { + "epoch": 0.026660968760356524, + "grad_norm": 0.6171875, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 3057 + }, + { + "epoch": 0.026669690045525107, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 3058 + }, + { + "epoch": 0.02667841133069369, + "grad_norm": 0.859375, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 3059 + }, + { + "epoch": 0.026687132615862273, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3060 + }, + { + "epoch": 0.026695853901030856, + "grad_norm": 0.82421875, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 3061 + }, + { + "epoch": 0.02670457518619944, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 3062 + }, + { + "epoch": 0.026713296471368022, + "grad_norm": 0.53125, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 3063 + }, + { + "epoch": 0.026722017756536605, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 3064 + }, + { + "epoch": 0.026730739041705184, + "grad_norm": 0.375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3065 + }, + { + "epoch": 0.026739460326873767, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 3066 + }, + { + "epoch": 0.02674818161204235, + "grad_norm": 0.416015625, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 3067 + }, + { + "epoch": 0.026756902897210933, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 3068 + }, + { + "epoch": 0.026765624182379516, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 3069 + }, + { + "epoch": 0.0267743454675481, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 3070 + }, + { + "epoch": 0.026783066752716682, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 3071 + }, + { + "epoch": 0.02679178803788526, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 3072 + }, + { + "epoch": 0.026800509323053844, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.082, + "step": 3073 + }, + { + "epoch": 0.026809230608222427, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0883, + "step": 3074 + }, + { + "epoch": 0.02681795189339101, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 3075 + }, + { + "epoch": 0.026826673178559593, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 3076 + }, + { + "epoch": 0.026835394463728176, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0826, + "step": 3077 + }, + { + "epoch": 0.02684411574889676, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 3078 + }, + { + "epoch": 0.02685283703406534, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 3079 + }, + { + "epoch": 0.02686155831923392, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0801, + "step": 3080 + }, + { + "epoch": 0.026870279604402504, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0963, + "step": 3081 + }, + { + "epoch": 0.026879000889571087, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 3082 + }, + { + "epoch": 0.02688772217473967, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 3083 + }, + { + "epoch": 0.026896443459908253, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0819, + "step": 3084 + }, + { + "epoch": 0.026905164745076836, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 3085 + }, + { + "epoch": 0.026913886030245415, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 3086 + }, + { + "epoch": 0.026922607315413998, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.088, + "step": 3087 + }, + { + "epoch": 0.02693132860058258, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 3088 + }, + { + "epoch": 0.026940049885751164, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 3089 + }, + { + "epoch": 0.026948771170919747, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 3090 + }, + { + "epoch": 0.02695749245608833, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0812, + "step": 3091 + }, + { + "epoch": 0.026966213741256913, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 3092 + }, + { + "epoch": 0.026974935026425496, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 3093 + }, + { + "epoch": 0.026983656311594075, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 3094 + }, + { + "epoch": 0.026992377596762658, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 3095 + }, + { + "epoch": 0.02700109888193124, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 3096 + }, + { + "epoch": 0.027009820167099824, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 3097 + }, + { + "epoch": 0.027018541452268407, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 3098 + }, + { + "epoch": 0.02702726273743699, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0904, + "step": 3099 + }, + { + "epoch": 0.027035984022605573, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3100 + }, + { + "epoch": 0.027044705307774152, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 3101 + }, + { + "epoch": 0.027053426592942735, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 3102 + }, + { + "epoch": 0.027062147878111318, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 3103 + }, + { + "epoch": 0.0270708691632799, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 3104 + }, + { + "epoch": 0.027079590448448484, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 3105 + }, + { + "epoch": 0.027088311733617067, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 3106 + }, + { + "epoch": 0.02709703301878565, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 3107 + }, + { + "epoch": 0.02710575430395423, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 3108 + }, + { + "epoch": 0.027114475589122812, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 3109 + }, + { + "epoch": 0.027123196874291395, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 3110 + }, + { + "epoch": 0.027131918159459978, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 3111 + }, + { + "epoch": 0.02714063944462856, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 3112 + }, + { + "epoch": 0.027149360729797144, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 3113 + }, + { + "epoch": 0.027158082014965727, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 3114 + }, + { + "epoch": 0.027166803300134306, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3115 + }, + { + "epoch": 0.02717552458530289, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 3116 + }, + { + "epoch": 0.027184245870471472, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 3117 + }, + { + "epoch": 0.027192967155640055, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 3118 + }, + { + "epoch": 0.027201688440808638, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3119 + }, + { + "epoch": 0.02721040972597722, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0742, + "step": 3120 + }, + { + "epoch": 0.027219131011145804, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0949, + "step": 3121 + }, + { + "epoch": 0.027227852296314387, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3122 + }, + { + "epoch": 0.027236573581482966, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 3123 + }, + { + "epoch": 0.02724529486665155, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 3124 + }, + { + "epoch": 0.027254016151820132, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 3125 + }, + { + "epoch": 0.027262737436988715, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0853, + "step": 3126 + }, + { + "epoch": 0.027271458722157298, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 3127 + }, + { + "epoch": 0.02728018000732588, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 3128 + }, + { + "epoch": 0.027288901292494464, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 3129 + }, + { + "epoch": 0.027297622577663043, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 3130 + }, + { + "epoch": 0.027306343862831626, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 3131 + }, + { + "epoch": 0.02731506514800021, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 3132 + }, + { + "epoch": 0.02732378643316879, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 3133 + }, + { + "epoch": 0.027332507718337375, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0914, + "step": 3134 + }, + { + "epoch": 0.027341229003505958, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 3135 + }, + { + "epoch": 0.02734995028867454, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 3136 + }, + { + "epoch": 0.02735867157384312, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 3137 + }, + { + "epoch": 0.027367392859011703, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 3138 + }, + { + "epoch": 0.027376114144180286, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 3139 + }, + { + "epoch": 0.02738483542934887, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 3140 + }, + { + "epoch": 0.02739355671451745, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3141 + }, + { + "epoch": 0.027402277999686035, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 3142 + }, + { + "epoch": 0.027410999284854617, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 3143 + }, + { + "epoch": 0.027419720570023197, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0822, + "step": 3144 + }, + { + "epoch": 0.02742844185519178, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 3145 + }, + { + "epoch": 0.027437163140360363, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0853, + "step": 3146 + }, + { + "epoch": 0.027445884425528946, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 3147 + }, + { + "epoch": 0.02745460571069753, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 3148 + }, + { + "epoch": 0.02746332699586611, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3149 + }, + { + "epoch": 0.027472048281034694, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3150 + }, + { + "epoch": 0.027480769566203277, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 3151 + }, + { + "epoch": 0.027489490851371857, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 3152 + }, + { + "epoch": 0.02749821213654044, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 3153 + }, + { + "epoch": 0.027506933421709023, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 3154 + }, + { + "epoch": 0.027515654706877606, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 3155 + }, + { + "epoch": 0.02752437599204619, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3156 + }, + { + "epoch": 0.02753309727721477, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 3157 + }, + { + "epoch": 0.027541818562383354, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0868, + "step": 3158 + }, + { + "epoch": 0.027550539847551934, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 3159 + }, + { + "epoch": 0.027559261132720517, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3160 + }, + { + "epoch": 0.0275679824178891, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 3161 + }, + { + "epoch": 0.027576703703057683, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3162 + }, + { + "epoch": 0.027585424988226265, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 3163 + }, + { + "epoch": 0.02759414627339485, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 3164 + }, + { + "epoch": 0.02760286755856343, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 3165 + }, + { + "epoch": 0.02761158884373201, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 3166 + }, + { + "epoch": 0.027620310128900594, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3167 + }, + { + "epoch": 0.027629031414069177, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3168 + }, + { + "epoch": 0.02763775269923776, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 3169 + }, + { + "epoch": 0.027646473984406342, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 3170 + }, + { + "epoch": 0.027655195269574925, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 3171 + }, + { + "epoch": 0.02766391655474351, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 3172 + }, + { + "epoch": 0.027672637839912088, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.091, + "step": 3173 + }, + { + "epoch": 0.02768135912508067, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 3174 + }, + { + "epoch": 0.027690080410249254, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 3175 + }, + { + "epoch": 0.027698801695417836, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 3176 + }, + { + "epoch": 0.02770752298058642, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 3177 + }, + { + "epoch": 0.027716244265755002, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 3178 + }, + { + "epoch": 0.027724965550923585, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 3179 + }, + { + "epoch": 0.027733686836092168, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3180 + }, + { + "epoch": 0.027742408121260748, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3181 + }, + { + "epoch": 0.02775112940642933, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 3182 + }, + { + "epoch": 0.027759850691597913, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 3183 + }, + { + "epoch": 0.027768571976766496, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 3184 + }, + { + "epoch": 0.02777729326193508, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 3185 + }, + { + "epoch": 0.027786014547103662, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3186 + }, + { + "epoch": 0.027794735832272245, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3187 + }, + { + "epoch": 0.027803457117440825, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0806, + "step": 3188 + }, + { + "epoch": 0.027812178402609408, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0937, + "step": 3189 + }, + { + "epoch": 0.02782089968777799, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 3190 + }, + { + "epoch": 0.027829620972946573, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 3191 + }, + { + "epoch": 0.027838342258115156, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 3192 + }, + { + "epoch": 0.02784706354328374, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 3193 + }, + { + "epoch": 0.027855784828452322, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 3194 + }, + { + "epoch": 0.0278645061136209, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 3195 + }, + { + "epoch": 0.027873227398789484, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 3196 + }, + { + "epoch": 0.027881948683958067, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 3197 + }, + { + "epoch": 0.02789066996912665, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0844, + "step": 3198 + }, + { + "epoch": 0.027899391254295233, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 3199 + }, + { + "epoch": 0.027908112539463816, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3200 + }, + { + "epoch": 0.0279168338246324, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 3201 + }, + { + "epoch": 0.02792555510980098, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 3202 + }, + { + "epoch": 0.02793427639496956, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3203 + }, + { + "epoch": 0.027942997680138144, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 3204 + }, + { + "epoch": 0.027951718965306727, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 3205 + }, + { + "epoch": 0.02796044025047531, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 3206 + }, + { + "epoch": 0.027969161535643893, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0873, + "step": 3207 + }, + { + "epoch": 0.027977882820812476, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 3208 + }, + { + "epoch": 0.02798660410598106, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0836, + "step": 3209 + }, + { + "epoch": 0.02799532539114964, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 3210 + }, + { + "epoch": 0.02800404667631822, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 3211 + }, + { + "epoch": 0.028012767961486804, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 3212 + }, + { + "epoch": 0.028021489246655387, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 3213 + }, + { + "epoch": 0.02803021053182397, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 3214 + }, + { + "epoch": 0.028038931816992553, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 3215 + }, + { + "epoch": 0.028047653102161136, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 3216 + }, + { + "epoch": 0.028056374387329715, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 3217 + }, + { + "epoch": 0.0280650956724983, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3218 + }, + { + "epoch": 0.02807381695766688, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 3219 + }, + { + "epoch": 0.028082538242835464, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 3220 + }, + { + "epoch": 0.028091259528004047, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 3221 + }, + { + "epoch": 0.02809998081317263, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 3222 + }, + { + "epoch": 0.028108702098341213, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 3223 + }, + { + "epoch": 0.028117423383509792, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 3224 + }, + { + "epoch": 0.028126144668678375, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 3225 + }, + { + "epoch": 0.028134865953846958, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 3226 + }, + { + "epoch": 0.02814358723901554, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 3227 + }, + { + "epoch": 0.028152308524184124, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 3228 + }, + { + "epoch": 0.028161029809352707, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3229 + }, + { + "epoch": 0.02816975109452129, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 3230 + }, + { + "epoch": 0.02817847237968987, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 3231 + }, + { + "epoch": 0.028187193664858452, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.091, + "step": 3232 + }, + { + "epoch": 0.028195914950027035, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 3233 + }, + { + "epoch": 0.028204636235195618, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 3234 + }, + { + "epoch": 0.0282133575203642, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 3235 + }, + { + "epoch": 0.028222078805532784, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 3236 + }, + { + "epoch": 0.028230800090701367, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 3237 + }, + { + "epoch": 0.02823952137586995, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 3238 + }, + { + "epoch": 0.02824824266103853, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 3239 + }, + { + "epoch": 0.028256963946207112, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 3240 + }, + { + "epoch": 0.028265685231375695, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 3241 + }, + { + "epoch": 0.028274406516544278, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 3242 + }, + { + "epoch": 0.02828312780171286, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 3243 + }, + { + "epoch": 0.028291849086881444, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3244 + }, + { + "epoch": 0.028300570372050027, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0806, + "step": 3245 + }, + { + "epoch": 0.028309291657218606, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0866, + "step": 3246 + }, + { + "epoch": 0.02831801294238719, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 3247 + }, + { + "epoch": 0.028326734227555772, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 3248 + }, + { + "epoch": 0.028335455512724355, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 3249 + }, + { + "epoch": 0.028344176797892938, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 3250 + }, + { + "epoch": 0.02835289808306152, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 3251 + }, + { + "epoch": 0.028361619368230104, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 3252 + }, + { + "epoch": 0.028370340653398683, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 3253 + }, + { + "epoch": 0.028379061938567266, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 3254 + }, + { + "epoch": 0.02838778322373585, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0777, + "step": 3255 + }, + { + "epoch": 0.028396504508904432, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3256 + }, + { + "epoch": 0.028405225794073015, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 3257 + }, + { + "epoch": 0.028413947079241598, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 3258 + }, + { + "epoch": 0.02842266836441018, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 3259 + }, + { + "epoch": 0.02843138964957876, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 3260 + }, + { + "epoch": 0.028440110934747343, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 3261 + }, + { + "epoch": 0.028448832219915926, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 3262 + }, + { + "epoch": 0.02845755350508451, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 3263 + }, + { + "epoch": 0.028466274790253092, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0869, + "step": 3264 + }, + { + "epoch": 0.028474996075421675, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 3265 + }, + { + "epoch": 0.028483717360590258, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 3266 + }, + { + "epoch": 0.02849243864575884, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 3267 + }, + { + "epoch": 0.02850115993092742, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 3268 + }, + { + "epoch": 0.028509881216096003, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 3269 + }, + { + "epoch": 0.028518602501264586, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 3270 + }, + { + "epoch": 0.02852732378643317, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.083, + "step": 3271 + }, + { + "epoch": 0.028536045071601752, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 3272 + }, + { + "epoch": 0.028544766356770335, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 3273 + }, + { + "epoch": 0.028553487641938918, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 3274 + }, + { + "epoch": 0.028562208927107497, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 3275 + }, + { + "epoch": 0.02857093021227608, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 3276 + }, + { + "epoch": 0.028579651497444663, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 3277 + }, + { + "epoch": 0.028588372782613246, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 3278 + }, + { + "epoch": 0.02859709406778183, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 3279 + }, + { + "epoch": 0.02860581535295041, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 3280 + }, + { + "epoch": 0.028614536638118995, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 3281 + }, + { + "epoch": 0.028623257923287574, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 3282 + }, + { + "epoch": 0.028631979208456157, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 3283 + }, + { + "epoch": 0.02864070049362474, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 3284 + }, + { + "epoch": 0.028649421778793323, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 3285 + }, + { + "epoch": 0.028658143063961906, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 3286 + }, + { + "epoch": 0.02866686434913049, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 3287 + }, + { + "epoch": 0.02867558563429907, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 3288 + }, + { + "epoch": 0.028684306919467654, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 3289 + }, + { + "epoch": 0.028693028204636234, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 3290 + }, + { + "epoch": 0.028701749489804817, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 3291 + }, + { + "epoch": 0.0287104707749734, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 3292 + }, + { + "epoch": 0.028719192060141983, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 3293 + }, + { + "epoch": 0.028727913345310566, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0766, + "step": 3294 + }, + { + "epoch": 0.02873663463047915, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 3295 + }, + { + "epoch": 0.02874535591564773, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 3296 + }, + { + "epoch": 0.02875407720081631, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.088, + "step": 3297 + }, + { + "epoch": 0.028762798485984894, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 3298 + }, + { + "epoch": 0.028771519771153477, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 3299 + }, + { + "epoch": 0.02878024105632206, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3300 + }, + { + "epoch": 0.028788962341490643, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 3301 + }, + { + "epoch": 0.028797683626659226, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 3302 + }, + { + "epoch": 0.02880640491182781, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0843, + "step": 3303 + }, + { + "epoch": 0.028815126196996388, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 3304 + }, + { + "epoch": 0.02882384748216497, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 3305 + }, + { + "epoch": 0.028832568767333554, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 3306 + }, + { + "epoch": 0.028841290052502137, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 3307 + }, + { + "epoch": 0.02885001133767072, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 3308 + }, + { + "epoch": 0.028858732622839302, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 3309 + }, + { + "epoch": 0.028867453908007885, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 3310 + }, + { + "epoch": 0.028876175193176465, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 3311 + }, + { + "epoch": 0.028884896478345048, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 3312 + }, + { + "epoch": 0.02889361776351363, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 3313 + }, + { + "epoch": 0.028902339048682214, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0844, + "step": 3314 + }, + { + "epoch": 0.028911060333850797, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0812, + "step": 3315 + }, + { + "epoch": 0.02891978161901938, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 3316 + }, + { + "epoch": 0.028928502904187962, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 3317 + }, + { + "epoch": 0.028937224189356545, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 3318 + }, + { + "epoch": 0.028945945474525125, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 3319 + }, + { + "epoch": 0.028954666759693708, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 3320 + }, + { + "epoch": 0.02896338804486229, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 3321 + }, + { + "epoch": 0.028972109330030874, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 3322 + }, + { + "epoch": 0.028980830615199456, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 3323 + }, + { + "epoch": 0.02898955190036804, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0835, + "step": 3324 + }, + { + "epoch": 0.028998273185536622, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0807, + "step": 3325 + }, + { + "epoch": 0.0290069944707052, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 3326 + }, + { + "epoch": 0.029015715755873785, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 3327 + }, + { + "epoch": 0.029024437041042368, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 3328 + }, + { + "epoch": 0.02903315832621095, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0807, + "step": 3329 + }, + { + "epoch": 0.029041879611379533, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 3330 + }, + { + "epoch": 0.029050600896548116, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 3331 + }, + { + "epoch": 0.0290593221817167, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0806, + "step": 3332 + }, + { + "epoch": 0.02906804346688528, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 3333 + }, + { + "epoch": 0.02907676475205386, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 3334 + }, + { + "epoch": 0.029085486037222445, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3335 + }, + { + "epoch": 0.029094207322391027, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 3336 + }, + { + "epoch": 0.02910292860755961, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3337 + }, + { + "epoch": 0.029111649892728193, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 3338 + }, + { + "epoch": 0.029120371177896776, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 3339 + }, + { + "epoch": 0.029129092463065356, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 3340 + }, + { + "epoch": 0.02913781374823394, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 3341 + }, + { + "epoch": 0.02914653503340252, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 3342 + }, + { + "epoch": 0.029155256318571104, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 3343 + }, + { + "epoch": 0.029163977603739687, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 3344 + }, + { + "epoch": 0.02917269888890827, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 3345 + }, + { + "epoch": 0.029181420174076853, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 3346 + }, + { + "epoch": 0.029190141459245436, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0839, + "step": 3347 + }, + { + "epoch": 0.029198862744414016, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 3348 + }, + { + "epoch": 0.0292075840295826, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 3349 + }, + { + "epoch": 0.02921630531475118, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3350 + }, + { + "epoch": 0.029225026599919764, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 3351 + }, + { + "epoch": 0.029233747885088347, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 3352 + }, + { + "epoch": 0.02924246917025693, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 3353 + }, + { + "epoch": 0.029251190455425513, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 3354 + }, + { + "epoch": 0.029259911740594093, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 3355 + }, + { + "epoch": 0.029268633025762675, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 3356 + }, + { + "epoch": 0.02927735431093126, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 3357 + }, + { + "epoch": 0.02928607559609984, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 3358 + }, + { + "epoch": 0.029294796881268424, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 3359 + }, + { + "epoch": 0.029303518166437007, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 3360 + }, + { + "epoch": 0.02931223945160559, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 3361 + }, + { + "epoch": 0.02932096073677417, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 3362 + }, + { + "epoch": 0.029329682021942752, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 3363 + }, + { + "epoch": 0.029338403307111335, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 3364 + }, + { + "epoch": 0.02934712459227992, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 3365 + }, + { + "epoch": 0.0293558458774485, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 3366 + }, + { + "epoch": 0.029364567162617084, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 3367 + }, + { + "epoch": 0.029373288447785667, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 3368 + }, + { + "epoch": 0.029382009732954247, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 3369 + }, + { + "epoch": 0.02939073101812283, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 3370 + }, + { + "epoch": 0.029399452303291412, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 3371 + }, + { + "epoch": 0.029408173588459995, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 3372 + }, + { + "epoch": 0.029416894873628578, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 3373 + }, + { + "epoch": 0.02942561615879716, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 3374 + }, + { + "epoch": 0.029434337443965744, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 3375 + }, + { + "epoch": 0.029443058729134327, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 3376 + }, + { + "epoch": 0.029451780014302906, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3377 + }, + { + "epoch": 0.02946050129947149, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 3378 + }, + { + "epoch": 0.029469222584640072, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 3379 + }, + { + "epoch": 0.029477943869808655, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3380 + }, + { + "epoch": 0.029486665154977238, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 3381 + }, + { + "epoch": 0.02949538644014582, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 3382 + }, + { + "epoch": 0.029504107725314404, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 3383 + }, + { + "epoch": 0.029512829010482983, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 3384 + }, + { + "epoch": 0.029521550295651566, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 3385 + }, + { + "epoch": 0.02953027158082015, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 3386 + }, + { + "epoch": 0.029538992865988732, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 3387 + }, + { + "epoch": 0.029547714151157315, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 3388 + }, + { + "epoch": 0.029556435436325898, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 3389 + }, + { + "epoch": 0.02956515672149448, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 3390 + }, + { + "epoch": 0.02957387800666306, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 3391 + }, + { + "epoch": 0.029582599291831643, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 3392 + }, + { + "epoch": 0.029591320577000226, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 3393 + }, + { + "epoch": 0.02960004186216881, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 3394 + }, + { + "epoch": 0.029608763147337392, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 3395 + }, + { + "epoch": 0.029617484432505975, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 3396 + }, + { + "epoch": 0.029626205717674558, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 3397 + }, + { + "epoch": 0.029634927002843137, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 3398 + }, + { + "epoch": 0.02964364828801172, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 3399 + }, + { + "epoch": 0.029652369573180303, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3400 + }, + { + "epoch": 0.029661090858348886, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 3401 + }, + { + "epoch": 0.02966981214351747, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 3402 + }, + { + "epoch": 0.029678533428686052, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 3403 + }, + { + "epoch": 0.029687254713854635, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3404 + }, + { + "epoch": 0.029695975999023218, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 3405 + }, + { + "epoch": 0.029704697284191797, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 3406 + }, + { + "epoch": 0.02971341856936038, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 3407 + }, + { + "epoch": 0.029722139854528963, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 3408 + }, + { + "epoch": 0.029730861139697546, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 3409 + }, + { + "epoch": 0.02973958242486613, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 3410 + }, + { + "epoch": 0.029748303710034712, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 3411 + }, + { + "epoch": 0.029757024995203295, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 3412 + }, + { + "epoch": 0.029765746280371874, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 3413 + }, + { + "epoch": 0.029774467565540457, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 3414 + }, + { + "epoch": 0.02978318885070904, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 3415 + }, + { + "epoch": 0.029791910135877623, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 3416 + }, + { + "epoch": 0.029800631421046206, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0897, + "step": 3417 + }, + { + "epoch": 0.02980935270621479, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3418 + }, + { + "epoch": 0.02981807399138337, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0801, + "step": 3419 + }, + { + "epoch": 0.02982679527655195, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 3420 + }, + { + "epoch": 0.029835516561720534, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 3421 + }, + { + "epoch": 0.029844237846889117, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 3422 + }, + { + "epoch": 0.0298529591320577, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 3423 + }, + { + "epoch": 0.029861680417226283, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 3424 + }, + { + "epoch": 0.029870401702394866, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 3425 + }, + { + "epoch": 0.02987912298756345, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 3426 + }, + { + "epoch": 0.029887844272732028, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 3427 + }, + { + "epoch": 0.02989656555790061, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 3428 + }, + { + "epoch": 0.029905286843069194, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3429 + }, + { + "epoch": 0.029914008128237777, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 3430 + }, + { + "epoch": 0.02992272941340636, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 3431 + }, + { + "epoch": 0.029931450698574943, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 3432 + }, + { + "epoch": 0.029940171983743526, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 3433 + }, + { + "epoch": 0.02994889326891211, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 3434 + }, + { + "epoch": 0.029957614554080688, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0709, + "step": 3435 + }, + { + "epoch": 0.02996633583924927, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 3436 + }, + { + "epoch": 0.029975057124417854, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3437 + }, + { + "epoch": 0.029983778409586437, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 3438 + }, + { + "epoch": 0.02999249969475502, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 3439 + }, + { + "epoch": 0.030001220979923603, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 3440 + }, + { + "epoch": 0.030009942265092186, + "grad_norm": 0.3984375, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 3441 + }, + { + "epoch": 0.030018663550260765, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0798, + "step": 3442 + }, + { + "epoch": 0.030027384835429348, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 3443 + }, + { + "epoch": 0.03003610612059793, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 3444 + }, + { + "epoch": 0.030044827405766514, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 3445 + }, + { + "epoch": 0.030053548690935097, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 3446 + }, + { + "epoch": 0.03006226997610368, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 3447 + }, + { + "epoch": 0.030070991261272263, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 3448 + }, + { + "epoch": 0.030079712546440842, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 3449 + }, + { + "epoch": 0.030088433831609425, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 3450 + }, + { + "epoch": 0.030097155116778008, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3451 + }, + { + "epoch": 0.03010587640194659, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 3452 + }, + { + "epoch": 0.030114597687115174, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 3453 + }, + { + "epoch": 0.030123318972283757, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0749, + "step": 3454 + }, + { + "epoch": 0.03013204025745234, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0872, + "step": 3455 + }, + { + "epoch": 0.03014076154262092, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 3456 + }, + { + "epoch": 0.030149482827789502, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3457 + }, + { + "epoch": 0.030158204112958085, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0775, + "step": 3458 + }, + { + "epoch": 0.030166925398126668, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 3459 + }, + { + "epoch": 0.03017564668329525, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 3460 + }, + { + "epoch": 0.030184367968463834, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0847, + "step": 3461 + }, + { + "epoch": 0.030193089253632416, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 3462 + }, + { + "epoch": 0.030201810538801, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 3463 + }, + { + "epoch": 0.03021053182396958, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 3464 + }, + { + "epoch": 0.030219253109138162, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 3465 + }, + { + "epoch": 0.030227974394306745, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 3466 + }, + { + "epoch": 0.030236695679475328, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 3467 + }, + { + "epoch": 0.03024541696464391, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 3468 + }, + { + "epoch": 0.030254138249812493, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 3469 + }, + { + "epoch": 0.030262859534981076, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 3470 + }, + { + "epoch": 0.030271580820149656, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 3471 + }, + { + "epoch": 0.03028030210531824, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 3472 + }, + { + "epoch": 0.03028902339048682, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 3473 + }, + { + "epoch": 0.030297744675655405, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 3474 + }, + { + "epoch": 0.030306465960823988, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 3475 + }, + { + "epoch": 0.03031518724599257, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0867, + "step": 3476 + }, + { + "epoch": 0.030323908531161153, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0888, + "step": 3477 + }, + { + "epoch": 0.030332629816329733, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 3478 + }, + { + "epoch": 0.030341351101498316, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 3479 + }, + { + "epoch": 0.0303500723866669, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 3480 + }, + { + "epoch": 0.03035879367183548, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 3481 + }, + { + "epoch": 0.030367514957004065, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 3482 + }, + { + "epoch": 0.030376236242172647, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 3483 + }, + { + "epoch": 0.03038495752734123, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 3484 + }, + { + "epoch": 0.03039367881250981, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 3485 + }, + { + "epoch": 0.030402400097678393, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 3486 + }, + { + "epoch": 0.030411121382846976, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3487 + }, + { + "epoch": 0.03041984266801556, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 3488 + }, + { + "epoch": 0.03042856395318414, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 3489 + }, + { + "epoch": 0.030437285238352724, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3490 + }, + { + "epoch": 0.030446006523521307, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0964, + "step": 3491 + }, + { + "epoch": 0.03045472780868989, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 3492 + }, + { + "epoch": 0.03046344909385847, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 3493 + }, + { + "epoch": 0.030472170379027053, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 3494 + }, + { + "epoch": 0.030480891664195636, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 3495 + }, + { + "epoch": 0.03048961294936422, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 3496 + }, + { + "epoch": 0.0304983342345328, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 3497 + }, + { + "epoch": 0.030507055519701384, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 3498 + }, + { + "epoch": 0.030515776804869967, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 3499 + }, + { + "epoch": 0.030524498090038547, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3500 + }, + { + "epoch": 0.03053321937520713, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 3501 + }, + { + "epoch": 0.030541940660375713, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3502 + }, + { + "epoch": 0.030550661945544295, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0809, + "step": 3503 + }, + { + "epoch": 0.03055938323071288, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 3504 + }, + { + "epoch": 0.03056810451588146, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 3505 + }, + { + "epoch": 0.030576825801050044, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0845, + "step": 3506 + }, + { + "epoch": 0.030585547086218624, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 3507 + }, + { + "epoch": 0.030594268371387207, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 3508 + }, + { + "epoch": 0.03060298965655579, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 3509 + }, + { + "epoch": 0.030611710941724372, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 3510 + }, + { + "epoch": 0.030620432226892955, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 3511 + }, + { + "epoch": 0.030629153512061538, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 3512 + }, + { + "epoch": 0.03063787479723012, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 3513 + }, + { + "epoch": 0.0306465960823987, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 3514 + }, + { + "epoch": 0.030655317367567284, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 3515 + }, + { + "epoch": 0.030664038652735866, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 3516 + }, + { + "epoch": 0.03067275993790445, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0807, + "step": 3517 + }, + { + "epoch": 0.030681481223073032, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 3518 + }, + { + "epoch": 0.030690202508241615, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 3519 + }, + { + "epoch": 0.030698923793410198, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3520 + }, + { + "epoch": 0.03070764507857878, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 3521 + }, + { + "epoch": 0.03071636636374736, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3522 + }, + { + "epoch": 0.030725087648915943, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 3523 + }, + { + "epoch": 0.030733808934084526, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 3524 + }, + { + "epoch": 0.03074253021925311, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 3525 + }, + { + "epoch": 0.030751251504421692, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 3526 + }, + { + "epoch": 0.030759972789590275, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 3527 + }, + { + "epoch": 0.030768694074758858, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 3528 + }, + { + "epoch": 0.030777415359927437, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 3529 + }, + { + "epoch": 0.03078613664509602, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 3530 + }, + { + "epoch": 0.030794857930264603, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 3531 + }, + { + "epoch": 0.030803579215433186, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 3532 + }, + { + "epoch": 0.03081230050060177, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 3533 + }, + { + "epoch": 0.030821021785770352, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 3534 + }, + { + "epoch": 0.030829743070938935, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 3535 + }, + { + "epoch": 0.030838464356107514, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 3536 + }, + { + "epoch": 0.030847185641276097, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0814, + "step": 3537 + }, + { + "epoch": 0.03085590692644468, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3538 + }, + { + "epoch": 0.030864628211613263, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 3539 + }, + { + "epoch": 0.030873349496781846, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 3540 + }, + { + "epoch": 0.03088207078195043, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 3541 + }, + { + "epoch": 0.030890792067119012, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 3542 + }, + { + "epoch": 0.03089951335228759, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 3543 + }, + { + "epoch": 0.030908234637456174, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 3544 + }, + { + "epoch": 0.030916955922624757, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 3545 + }, + { + "epoch": 0.03092567720779334, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 3546 + }, + { + "epoch": 0.030934398492961923, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 3547 + }, + { + "epoch": 0.030943119778130506, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 3548 + }, + { + "epoch": 0.03095184106329909, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 3549 + }, + { + "epoch": 0.030960562348467672, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 3550 + }, + { + "epoch": 0.03096928363363625, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 3551 + }, + { + "epoch": 0.030978004918804834, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 3552 + }, + { + "epoch": 0.030986726203973417, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 3553 + }, + { + "epoch": 0.030995447489142, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 3554 + }, + { + "epoch": 0.031004168774310583, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3555 + }, + { + "epoch": 0.031012890059479166, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0836, + "step": 3556 + }, + { + "epoch": 0.03102161134464775, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 3557 + }, + { + "epoch": 0.03103033262981633, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 3558 + }, + { + "epoch": 0.03103905391498491, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 3559 + }, + { + "epoch": 0.031047775200153494, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 3560 + }, + { + "epoch": 0.031056496485322077, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 3561 + }, + { + "epoch": 0.03106521777049066, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0825, + "step": 3562 + }, + { + "epoch": 0.031073939055659243, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 3563 + }, + { + "epoch": 0.031082660340827826, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 3564 + }, + { + "epoch": 0.031091381625996405, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 3565 + }, + { + "epoch": 0.031100102911164988, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 3566 + }, + { + "epoch": 0.03110882419633357, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 3567 + }, + { + "epoch": 0.031117545481502154, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 3568 + }, + { + "epoch": 0.031126266766670737, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 3569 + }, + { + "epoch": 0.03113498805183932, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 3570 + }, + { + "epoch": 0.031143709337007903, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 3571 + }, + { + "epoch": 0.031152430622176482, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 3572 + }, + { + "epoch": 0.031161151907345065, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 3573 + }, + { + "epoch": 0.031169873192513648, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 3574 + }, + { + "epoch": 0.03117859447768223, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 3575 + }, + { + "epoch": 0.031187315762850814, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 3576 + }, + { + "epoch": 0.031196037048019397, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 3577 + }, + { + "epoch": 0.03120475833318798, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 3578 + }, + { + "epoch": 0.031213479618356563, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0899, + "step": 3579 + }, + { + "epoch": 0.031222200903525142, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0867, + "step": 3580 + }, + { + "epoch": 0.031230922188693725, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3581 + }, + { + "epoch": 0.031239643473862308, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 3582 + }, + { + "epoch": 0.03124836475903089, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0923, + "step": 3583 + }, + { + "epoch": 0.031257086044199474, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 3584 + }, + { + "epoch": 0.03126580732936805, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 3585 + }, + { + "epoch": 0.03127452861453664, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 3586 + }, + { + "epoch": 0.03128324989970522, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 3587 + }, + { + "epoch": 0.031291971184873806, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 3588 + }, + { + "epoch": 0.031300692470042385, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 3589 + }, + { + "epoch": 0.031309413755210964, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0709, + "step": 3590 + }, + { + "epoch": 0.03131813504037955, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 3591 + }, + { + "epoch": 0.03132685632554813, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 3592 + }, + { + "epoch": 0.03133557761071672, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3593 + }, + { + "epoch": 0.031344298895885296, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 3594 + }, + { + "epoch": 0.03135302018105388, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 3595 + }, + { + "epoch": 0.03136174146622246, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 3596 + }, + { + "epoch": 0.03137046275139105, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3597 + }, + { + "epoch": 0.03137918403655963, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 3598 + }, + { + "epoch": 0.03138790532172821, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 3599 + }, + { + "epoch": 0.031396626606896794, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 3600 + }, + { + "epoch": 0.03140534789206537, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 3601 + }, + { + "epoch": 0.03141406917723396, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0805, + "step": 3602 + }, + { + "epoch": 0.03142279046240254, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 3603 + }, + { + "epoch": 0.031431511747571125, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 3604 + }, + { + "epoch": 0.031440233032739705, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3605 + }, + { + "epoch": 0.031448954317908284, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 3606 + }, + { + "epoch": 0.03145767560307687, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 3607 + }, + { + "epoch": 0.03146639688824545, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 3608 + }, + { + "epoch": 0.031475118173414036, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 3609 + }, + { + "epoch": 0.031483839458582616, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 3610 + }, + { + "epoch": 0.0314925607437512, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 3611 + }, + { + "epoch": 0.03150128202891978, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 3612 + }, + { + "epoch": 0.03151000331408836, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0807, + "step": 3613 + }, + { + "epoch": 0.03151872459925695, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 3614 + }, + { + "epoch": 0.03152744588442553, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 3615 + }, + { + "epoch": 0.03153616716959411, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3616 + }, + { + "epoch": 0.03154488845476269, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 3617 + }, + { + "epoch": 0.03155360973993128, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 3618 + }, + { + "epoch": 0.03156233102509986, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 3619 + }, + { + "epoch": 0.03157105231026844, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 3620 + }, + { + "epoch": 0.031579773595437025, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 3621 + }, + { + "epoch": 0.031588494880605604, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 3622 + }, + { + "epoch": 0.03159721616577419, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3623 + }, + { + "epoch": 0.03160593745094277, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3624 + }, + { + "epoch": 0.031614658736111356, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 3625 + }, + { + "epoch": 0.031623380021279936, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 3626 + }, + { + "epoch": 0.031632101306448515, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 3627 + }, + { + "epoch": 0.0316408225916171, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 3628 + }, + { + "epoch": 0.03164954387678568, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 3629 + }, + { + "epoch": 0.03165826516195427, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3630 + }, + { + "epoch": 0.03166698644712285, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 3631 + }, + { + "epoch": 0.03167570773229143, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3632 + }, + { + "epoch": 0.03168442901746001, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3633 + }, + { + "epoch": 0.03169315030262859, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 3634 + }, + { + "epoch": 0.03170187158779718, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 3635 + }, + { + "epoch": 0.03171059287296576, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 3636 + }, + { + "epoch": 0.031719314158134344, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 3637 + }, + { + "epoch": 0.031728035443302924, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 3638 + }, + { + "epoch": 0.03173675672847151, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 3639 + }, + { + "epoch": 0.03174547801364009, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3640 + }, + { + "epoch": 0.03175419929880867, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 3641 + }, + { + "epoch": 0.031762920583977255, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 3642 + }, + { + "epoch": 0.031771641869145835, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 3643 + }, + { + "epoch": 0.03178036315431442, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 3644 + }, + { + "epoch": 0.031789084439483, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 3645 + }, + { + "epoch": 0.03179780572465159, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 3646 + }, + { + "epoch": 0.03180652700982017, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 3647 + }, + { + "epoch": 0.03181524829498875, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 3648 + }, + { + "epoch": 0.03182396958015733, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 3649 + }, + { + "epoch": 0.03183269086532591, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 3650 + }, + { + "epoch": 0.0318414121504945, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 3651 + }, + { + "epoch": 0.03185013343566308, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 3652 + }, + { + "epoch": 0.031858854720831664, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3653 + }, + { + "epoch": 0.031867576006000244, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 3654 + }, + { + "epoch": 0.03187629729116883, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 3655 + }, + { + "epoch": 0.03188501857633741, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 3656 + }, + { + "epoch": 0.03189373986150599, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 3657 + }, + { + "epoch": 0.031902461146674575, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 3658 + }, + { + "epoch": 0.031911182431843155, + "grad_norm": 0.365234375, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 3659 + }, + { + "epoch": 0.03191990371701174, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3660 + }, + { + "epoch": 0.03192862500218032, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 3661 + }, + { + "epoch": 0.03193734628734891, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 3662 + }, + { + "epoch": 0.031946067572517486, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 3663 + }, + { + "epoch": 0.031954788857686066, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 3664 + }, + { + "epoch": 0.03196351014285465, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 3665 + }, + { + "epoch": 0.03197223142802323, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3666 + }, + { + "epoch": 0.03198095271319182, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 3667 + }, + { + "epoch": 0.0319896739983604, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 3668 + }, + { + "epoch": 0.031998395283528984, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3669 + }, + { + "epoch": 0.03200711656869756, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 3670 + }, + { + "epoch": 0.03201583785386614, + "grad_norm": 0.37890625, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 3671 + }, + { + "epoch": 0.03202455913903473, + "grad_norm": 0.33984375, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 3672 + }, + { + "epoch": 0.03203328042420331, + "grad_norm": 0.62890625, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 3673 + }, + { + "epoch": 0.032042001709371895, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 3674 + }, + { + "epoch": 0.032050722994540475, + "grad_norm": 0.71875, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 3675 + }, + { + "epoch": 0.03205944427970906, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 3676 + }, + { + "epoch": 0.03206816556487764, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 3677 + }, + { + "epoch": 0.03207688685004622, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0819, + "step": 3678 + }, + { + "epoch": 0.032085608135214806, + "grad_norm": 0.412109375, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 3679 + }, + { + "epoch": 0.032094329420383386, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 3680 + }, + { + "epoch": 0.03210305070555197, + "grad_norm": 0.40625, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 3681 + }, + { + "epoch": 0.03211177199072055, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 3682 + }, + { + "epoch": 0.03212049327588914, + "grad_norm": 0.41015625, + "learning_rate": 0.0005, + "loss": 1.0847, + "step": 3683 + }, + { + "epoch": 0.03212921456105772, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 3684 + }, + { + "epoch": 0.0321379358462263, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 3685 + }, + { + "epoch": 0.03214665713139488, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.098, + "step": 3686 + }, + { + "epoch": 0.03215537841656346, + "grad_norm": 0.365234375, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 3687 + }, + { + "epoch": 0.03216409970173205, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 3688 + }, + { + "epoch": 0.03217282098690063, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 3689 + }, + { + "epoch": 0.032181542272069215, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 3690 + }, + { + "epoch": 0.032190263557237794, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 3691 + }, + { + "epoch": 0.032198984842406374, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 3692 + }, + { + "epoch": 0.03220770612757496, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 3693 + }, + { + "epoch": 0.03221642741274354, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 3694 + }, + { + "epoch": 0.032225148697912126, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.083, + "step": 3695 + }, + { + "epoch": 0.032233869983080705, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 3696 + }, + { + "epoch": 0.03224259126824929, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 3697 + }, + { + "epoch": 0.03225131255341787, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 3698 + }, + { + "epoch": 0.03226003383858645, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3699 + }, + { + "epoch": 0.03226875512375504, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3700 + }, + { + "epoch": 0.03227747640892362, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3701 + }, + { + "epoch": 0.0322861976940922, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 3702 + }, + { + "epoch": 0.03229491897926078, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3703 + }, + { + "epoch": 0.03230364026442937, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 3704 + }, + { + "epoch": 0.03231236154959795, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 3705 + }, + { + "epoch": 0.032321082834766535, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3706 + }, + { + "epoch": 0.032329804119935114, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.088, + "step": 3707 + }, + { + "epoch": 0.032338525405103694, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 3708 + }, + { + "epoch": 0.03234724669027228, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 3709 + }, + { + "epoch": 0.03235596797544086, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 3710 + }, + { + "epoch": 0.032364689260609446, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 3711 + }, + { + "epoch": 0.032373410545778025, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 3712 + }, + { + "epoch": 0.03238213183094661, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3713 + }, + { + "epoch": 0.03239085311611519, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 3714 + }, + { + "epoch": 0.03239957440128377, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 3715 + }, + { + "epoch": 0.03240829568645236, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 3716 + }, + { + "epoch": 0.032417016971620936, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 3717 + }, + { + "epoch": 0.03242573825678952, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 3718 + }, + { + "epoch": 0.0324344595419581, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 3719 + }, + { + "epoch": 0.03244318082712669, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 3720 + }, + { + "epoch": 0.03245190211229527, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 3721 + }, + { + "epoch": 0.03246062339746385, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 3722 + }, + { + "epoch": 0.032469344682632434, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3723 + }, + { + "epoch": 0.03247806596780101, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.083, + "step": 3724 + }, + { + "epoch": 0.0324867872529696, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3725 + }, + { + "epoch": 0.03249550853813818, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 3726 + }, + { + "epoch": 0.032504229823306766, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 3727 + }, + { + "epoch": 0.032512951108475345, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3728 + }, + { + "epoch": 0.032521672393643924, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3729 + }, + { + "epoch": 0.03253039367881251, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 3730 + }, + { + "epoch": 0.03253911496398109, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 3731 + }, + { + "epoch": 0.03254783624914968, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 3732 + }, + { + "epoch": 0.032556557534318256, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3733 + }, + { + "epoch": 0.03256527881948684, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 3734 + }, + { + "epoch": 0.03257400010465542, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 3735 + }, + { + "epoch": 0.032582721389824, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 3736 + }, + { + "epoch": 0.03259144267499259, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3737 + }, + { + "epoch": 0.03260016396016117, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 3738 + }, + { + "epoch": 0.032608885245329754, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3739 + }, + { + "epoch": 0.03261760653049833, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 3740 + }, + { + "epoch": 0.03262632781566692, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3741 + }, + { + "epoch": 0.0326350491008355, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 3742 + }, + { + "epoch": 0.03264377038600408, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 3743 + }, + { + "epoch": 0.032652491671172665, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 3744 + }, + { + "epoch": 0.032661212956341244, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 3745 + }, + { + "epoch": 0.03266993424150983, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3746 + }, + { + "epoch": 0.03267865552667841, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3747 + }, + { + "epoch": 0.032687376811846997, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 3748 + }, + { + "epoch": 0.032696098097015576, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 3749 + }, + { + "epoch": 0.032704819382184155, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3750 + }, + { + "epoch": 0.03271354066735274, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 3751 + }, + { + "epoch": 0.03272226195252132, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 3752 + }, + { + "epoch": 0.03273098323768991, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 3753 + }, + { + "epoch": 0.03273970452285849, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3754 + }, + { + "epoch": 0.032748425808027073, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 3755 + }, + { + "epoch": 0.03275714709319565, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 3756 + }, + { + "epoch": 0.03276586837836423, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 3757 + }, + { + "epoch": 0.03277458966353282, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 3758 + }, + { + "epoch": 0.0327833109487014, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 3759 + }, + { + "epoch": 0.032792032233869985, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 3760 + }, + { + "epoch": 0.032800753519038564, + "grad_norm": 0.365234375, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 3761 + }, + { + "epoch": 0.03280947480420715, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3762 + }, + { + "epoch": 0.03281819608937573, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 3763 + }, + { + "epoch": 0.032826917374544316, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 3764 + }, + { + "epoch": 0.032835638659712896, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 3765 + }, + { + "epoch": 0.032844359944881475, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 3766 + }, + { + "epoch": 0.03285308123005006, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 3767 + }, + { + "epoch": 0.03286180251521864, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 3768 + }, + { + "epoch": 0.03287052380038723, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 3769 + }, + { + "epoch": 0.03287924508555581, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0779, + "step": 3770 + }, + { + "epoch": 0.03288796637072439, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 3771 + }, + { + "epoch": 0.03289668765589297, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 3772 + }, + { + "epoch": 0.03290540894106155, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 3773 + }, + { + "epoch": 0.03291413022623014, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 3774 + }, + { + "epoch": 0.03292285151139872, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 3775 + }, + { + "epoch": 0.032931572796567304, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 3776 + }, + { + "epoch": 0.032940294081735884, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 3777 + }, + { + "epoch": 0.03294901536690447, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3778 + }, + { + "epoch": 0.03295773665207305, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 3779 + }, + { + "epoch": 0.03296645793724163, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 3780 + }, + { + "epoch": 0.032975179222410216, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 3781 + }, + { + "epoch": 0.032983900507578795, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3782 + }, + { + "epoch": 0.03299262179274738, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 3783 + }, + { + "epoch": 0.03300134307791596, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0975, + "step": 3784 + }, + { + "epoch": 0.03301006436308455, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 3785 + }, + { + "epoch": 0.03301878564825313, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 3786 + }, + { + "epoch": 0.033027506933421706, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0867, + "step": 3787 + }, + { + "epoch": 0.03303622821859029, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 3788 + }, + { + "epoch": 0.03304494950375887, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0948, + "step": 3789 + }, + { + "epoch": 0.03305367078892746, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 3790 + }, + { + "epoch": 0.03306239207409604, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3791 + }, + { + "epoch": 0.033071113359264624, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 3792 + }, + { + "epoch": 0.033079834644433204, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 3793 + }, + { + "epoch": 0.03308855592960178, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 3794 + }, + { + "epoch": 0.03309727721477037, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 3795 + }, + { + "epoch": 0.03310599849993895, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 3796 + }, + { + "epoch": 0.033114719785107535, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 3797 + }, + { + "epoch": 0.033123441070276115, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 3798 + }, + { + "epoch": 0.0331321623554447, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 3799 + }, + { + "epoch": 0.03314088364061328, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 3800 + }, + { + "epoch": 0.03314960492578186, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 3801 + }, + { + "epoch": 0.033158326210950446, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3802 + }, + { + "epoch": 0.033167047496119026, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 3803 + }, + { + "epoch": 0.03317576878128761, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 3804 + }, + { + "epoch": 0.03318449006645619, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 3805 + }, + { + "epoch": 0.03319321135162478, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 3806 + }, + { + "epoch": 0.03320193263679336, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3807 + }, + { + "epoch": 0.03321065392196194, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0875, + "step": 3808 + }, + { + "epoch": 0.03321937520713052, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 3809 + }, + { + "epoch": 0.0332280964922991, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 3810 + }, + { + "epoch": 0.03323681777746769, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 3811 + }, + { + "epoch": 0.03324553906263627, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 3812 + }, + { + "epoch": 0.033254260347804855, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 3813 + }, + { + "epoch": 0.033262981632973435, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 3814 + }, + { + "epoch": 0.033271702918142014, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 3815 + }, + { + "epoch": 0.0332804242033106, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 3816 + }, + { + "epoch": 0.03328914548847918, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 3817 + }, + { + "epoch": 0.033297866773647766, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 3818 + }, + { + "epoch": 0.033306588058816346, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 3819 + }, + { + "epoch": 0.03331530934398493, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 3820 + }, + { + "epoch": 0.03332403062915351, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3821 + }, + { + "epoch": 0.0333327519143221, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 3822 + }, + { + "epoch": 0.03334147319949068, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 3823 + }, + { + "epoch": 0.03335019448465926, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3824 + }, + { + "epoch": 0.03335891576982784, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 3825 + }, + { + "epoch": 0.03336763705499642, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 3826 + }, + { + "epoch": 0.03337635834016501, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3827 + }, + { + "epoch": 0.03338507962533359, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3828 + }, + { + "epoch": 0.033393800910502175, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 3829 + }, + { + "epoch": 0.033402522195670754, + "grad_norm": 0.3828125, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 3830 + }, + { + "epoch": 0.033411243480839334, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 3831 + }, + { + "epoch": 0.03341996476600792, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 3832 + }, + { + "epoch": 0.0334286860511765, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 3833 + }, + { + "epoch": 0.033437407336345086, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 3834 + }, + { + "epoch": 0.033446128621513666, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 3835 + }, + { + "epoch": 0.03345484990668225, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 3836 + }, + { + "epoch": 0.03346357119185083, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 3837 + }, + { + "epoch": 0.03347229247701941, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 3838 + }, + { + "epoch": 0.033481013762188, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 3839 + }, + { + "epoch": 0.03348973504735658, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 3840 + }, + { + "epoch": 0.03349845633252516, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3841 + }, + { + "epoch": 0.03350717761769374, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 3842 + }, + { + "epoch": 0.03351589890286233, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3843 + }, + { + "epoch": 0.03352462018803091, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 3844 + }, + { + "epoch": 0.03353334147319949, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0749, + "step": 3845 + }, + { + "epoch": 0.033542062758368074, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 3846 + }, + { + "epoch": 0.033550784043536654, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 3847 + }, + { + "epoch": 0.03355950532870524, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 3848 + }, + { + "epoch": 0.03356822661387382, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 3849 + }, + { + "epoch": 0.033576947899042406, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 3850 + }, + { + "epoch": 0.033585669184210985, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 3851 + }, + { + "epoch": 0.033594390469379565, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 3852 + }, + { + "epoch": 0.03360311175454815, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 3853 + }, + { + "epoch": 0.03361183303971673, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3854 + }, + { + "epoch": 0.03362055432488532, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 3855 + }, + { + "epoch": 0.033629275610053896, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 3856 + }, + { + "epoch": 0.03363799689522248, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0825, + "step": 3857 + }, + { + "epoch": 0.03364671818039106, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 3858 + }, + { + "epoch": 0.03365543946555964, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 3859 + }, + { + "epoch": 0.03366416075072823, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3860 + }, + { + "epoch": 0.03367288203589681, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 3861 + }, + { + "epoch": 0.033681603321065394, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 3862 + }, + { + "epoch": 0.03369032460623397, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3863 + }, + { + "epoch": 0.03369904589140256, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3864 + }, + { + "epoch": 0.03370776717657114, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 3865 + }, + { + "epoch": 0.03371648846173972, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3866 + }, + { + "epoch": 0.033725209746908305, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 3867 + }, + { + "epoch": 0.033733931032076885, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0779, + "step": 3868 + }, + { + "epoch": 0.03374265231724547, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 3869 + }, + { + "epoch": 0.03375137360241405, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3870 + }, + { + "epoch": 0.03376009488758264, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 3871 + }, + { + "epoch": 0.033768816172751216, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 3872 + }, + { + "epoch": 0.033777537457919796, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3873 + }, + { + "epoch": 0.03378625874308838, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3874 + }, + { + "epoch": 0.03379498002825696, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 3875 + }, + { + "epoch": 0.03380370131342555, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 3876 + }, + { + "epoch": 0.03381242259859413, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 3877 + }, + { + "epoch": 0.033821143883762714, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 3878 + }, + { + "epoch": 0.03382986516893129, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 3879 + }, + { + "epoch": 0.03383858645409988, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 3880 + }, + { + "epoch": 0.03384730773926846, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 3881 + }, + { + "epoch": 0.03385602902443704, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 3882 + }, + { + "epoch": 0.033864750309605625, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 3883 + }, + { + "epoch": 0.033873471594774204, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3884 + }, + { + "epoch": 0.03388219287994279, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3885 + }, + { + "epoch": 0.03389091416511137, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 3886 + }, + { + "epoch": 0.03389963545027996, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 3887 + }, + { + "epoch": 0.033908356735448536, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 3888 + }, + { + "epoch": 0.033917078020617115, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 3889 + }, + { + "epoch": 0.0339257993057857, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 3890 + }, + { + "epoch": 0.03393452059095428, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0775, + "step": 3891 + }, + { + "epoch": 0.03394324187612287, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 3892 + }, + { + "epoch": 0.03395196316129145, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 3893 + }, + { + "epoch": 0.033960684446460034, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 3894 + }, + { + "epoch": 0.03396940573162861, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 3895 + }, + { + "epoch": 0.03397812701679719, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 3896 + }, + { + "epoch": 0.03398684830196578, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 3897 + }, + { + "epoch": 0.03399556958713436, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 3898 + }, + { + "epoch": 0.034004290872302945, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 3899 + }, + { + "epoch": 0.034013012157471524, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 3900 + }, + { + "epoch": 0.03402173344264011, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 3901 + }, + { + "epoch": 0.03403045472780869, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3902 + }, + { + "epoch": 0.03403917601297727, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3903 + }, + { + "epoch": 0.034047897298145856, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 3904 + }, + { + "epoch": 0.034056618583314435, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 3905 + }, + { + "epoch": 0.03406533986848302, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 3906 + }, + { + "epoch": 0.0340740611536516, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 3907 + }, + { + "epoch": 0.03408278243882019, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 3908 + }, + { + "epoch": 0.03409150372398877, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 3909 + }, + { + "epoch": 0.034100225009157346, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 3910 + }, + { + "epoch": 0.03410894629432593, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 3911 + }, + { + "epoch": 0.03411766757949451, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 3912 + }, + { + "epoch": 0.0341263888646631, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 3913 + }, + { + "epoch": 0.03413511014983168, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 3914 + }, + { + "epoch": 0.034143831435000264, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 3915 + }, + { + "epoch": 0.034152552720168844, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 3916 + }, + { + "epoch": 0.03416127400533742, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 3917 + }, + { + "epoch": 0.03416999529050601, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 3918 + }, + { + "epoch": 0.03417871657567459, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 3919 + }, + { + "epoch": 0.034187437860843176, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0791, + "step": 3920 + }, + { + "epoch": 0.034196159146011755, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 3921 + }, + { + "epoch": 0.03420488043118034, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 3922 + }, + { + "epoch": 0.03421360171634892, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 3923 + }, + { + "epoch": 0.0342223230015175, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 3924 + }, + { + "epoch": 0.03423104428668609, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3925 + }, + { + "epoch": 0.034239765571854666, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 3926 + }, + { + "epoch": 0.03424848685702325, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 3927 + }, + { + "epoch": 0.03425720814219183, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 3928 + }, + { + "epoch": 0.03426592942736042, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3929 + }, + { + "epoch": 0.034274650712529, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0775, + "step": 3930 + }, + { + "epoch": 0.03428337199769758, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 3931 + }, + { + "epoch": 0.034292093282866164, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 3932 + }, + { + "epoch": 0.03430081456803474, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3933 + }, + { + "epoch": 0.03430953585320333, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3934 + }, + { + "epoch": 0.03431825713837191, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 3935 + }, + { + "epoch": 0.034326978423540495, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 3936 + }, + { + "epoch": 0.034335699708709075, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 3937 + }, + { + "epoch": 0.03434442099387766, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 3938 + }, + { + "epoch": 0.03435314227904624, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3939 + }, + { + "epoch": 0.03436186356421482, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 3940 + }, + { + "epoch": 0.034370584849383407, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 3941 + }, + { + "epoch": 0.034379306134551986, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 3942 + }, + { + "epoch": 0.03438802741972057, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 3943 + }, + { + "epoch": 0.03439674870488915, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 3944 + }, + { + "epoch": 0.03440546999005774, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 3945 + }, + { + "epoch": 0.03441419127522632, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 3946 + }, + { + "epoch": 0.0344229125603949, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 3947 + }, + { + "epoch": 0.034431633845563483, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 3948 + }, + { + "epoch": 0.03444035513073206, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 3949 + }, + { + "epoch": 0.03444907641590065, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 3950 + }, + { + "epoch": 0.03445779770106923, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 3951 + }, + { + "epoch": 0.034466518986237815, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 3952 + }, + { + "epoch": 0.034475240271406395, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 3953 + }, + { + "epoch": 0.034483961556574974, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 3954 + }, + { + "epoch": 0.03449268284174356, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 3955 + }, + { + "epoch": 0.03450140412691214, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 3956 + }, + { + "epoch": 0.034510125412080726, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 3957 + }, + { + "epoch": 0.034518846697249306, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 3958 + }, + { + "epoch": 0.03452756798241789, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3959 + }, + { + "epoch": 0.03453628926758647, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 3960 + }, + { + "epoch": 0.03454501055275505, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 3961 + }, + { + "epoch": 0.03455373183792364, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 3962 + }, + { + "epoch": 0.03456245312309222, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 3963 + }, + { + "epoch": 0.0345711744082608, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 3964 + }, + { + "epoch": 0.03457989569342938, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 3965 + }, + { + "epoch": 0.03458861697859797, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 3966 + }, + { + "epoch": 0.03459733826376655, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 3967 + }, + { + "epoch": 0.03460605954893513, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3968 + }, + { + "epoch": 0.034614780834103714, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 3969 + }, + { + "epoch": 0.034623502119272294, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3970 + }, + { + "epoch": 0.03463222340444088, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 3971 + }, + { + "epoch": 0.03464094468960946, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 3972 + }, + { + "epoch": 0.034649665974778046, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 3973 + }, + { + "epoch": 0.034658387259946626, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 3974 + }, + { + "epoch": 0.034667108545115205, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 3975 + }, + { + "epoch": 0.03467582983028379, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 3976 + }, + { + "epoch": 0.03468455111545237, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3977 + }, + { + "epoch": 0.03469327240062096, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 3978 + }, + { + "epoch": 0.03470199368578954, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 3979 + }, + { + "epoch": 0.03471071497095812, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 3980 + }, + { + "epoch": 0.0347194362561267, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 3981 + }, + { + "epoch": 0.03472815754129528, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0812, + "step": 3982 + }, + { + "epoch": 0.03473687882646387, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3983 + }, + { + "epoch": 0.03474560011163245, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 3984 + }, + { + "epoch": 0.034754321396801034, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 3985 + }, + { + "epoch": 0.034763042681969614, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 3986 + }, + { + "epoch": 0.0347717639671382, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 3987 + }, + { + "epoch": 0.03478048525230678, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 3988 + }, + { + "epoch": 0.03478920653747536, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 3989 + }, + { + "epoch": 0.034797927822643945, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 3990 + }, + { + "epoch": 0.034806649107812525, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 3991 + }, + { + "epoch": 0.03481537039298111, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 3992 + }, + { + "epoch": 0.03482409167814969, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 3993 + }, + { + "epoch": 0.03483281296331828, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3994 + }, + { + "epoch": 0.034841534248486856, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3995 + }, + { + "epoch": 0.03485025553365544, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 3996 + }, + { + "epoch": 0.03485897681882402, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 3997 + }, + { + "epoch": 0.0348676981039926, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 3998 + }, + { + "epoch": 0.03487641938916119, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3999 + }, + { + "epoch": 0.03488514067432977, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 4000 + }, + { + "epoch": 0.034893861959498354, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 4001 + }, + { + "epoch": 0.03490258324466693, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 4002 + }, + { + "epoch": 0.03491130452983552, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4003 + }, + { + "epoch": 0.0349200258150041, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 4004 + }, + { + "epoch": 0.03492874710017268, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 4005 + }, + { + "epoch": 0.034937468385341265, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4006 + }, + { + "epoch": 0.034946189670509845, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4007 + }, + { + "epoch": 0.03495491095567843, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4008 + }, + { + "epoch": 0.03496363224084701, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4009 + }, + { + "epoch": 0.0349723535260156, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 4010 + }, + { + "epoch": 0.034981074811184176, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4011 + }, + { + "epoch": 0.034989796096352756, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 4012 + }, + { + "epoch": 0.03499851738152134, + "grad_norm": 0.341796875, + "learning_rate": 0.0005, + "loss": 1.0921, + "step": 4013 + }, + { + "epoch": 0.03500723866668992, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4014 + }, + { + "epoch": 0.03501595995185851, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4015 + }, + { + "epoch": 0.03502468123702709, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4016 + }, + { + "epoch": 0.035033402522195674, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 4017 + }, + { + "epoch": 0.03504212380736425, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.0801, + "step": 4018 + }, + { + "epoch": 0.03505084509253283, + "grad_norm": 0.640625, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 4019 + }, + { + "epoch": 0.03505956637770142, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4020 + }, + { + "epoch": 0.03506828766287, + "grad_norm": 0.490234375, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 4021 + }, + { + "epoch": 0.035077008948038585, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 4022 + }, + { + "epoch": 0.035085730233207164, + "grad_norm": 0.69921875, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 4023 + }, + { + "epoch": 0.03509445151837575, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 4024 + }, + { + "epoch": 0.03510317280354433, + "grad_norm": 0.66015625, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 4025 + }, + { + "epoch": 0.03511189408871291, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 4026 + }, + { + "epoch": 0.035120615373881496, + "grad_norm": 0.765625, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 4027 + }, + { + "epoch": 0.035129336659050076, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 4028 + }, + { + "epoch": 0.03513805794421866, + "grad_norm": 0.578125, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4029 + }, + { + "epoch": 0.03514677922938724, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 4030 + }, + { + "epoch": 0.03515550051455583, + "grad_norm": 0.82421875, + "learning_rate": 0.0005, + "loss": 1.0989, + "step": 4031 + }, + { + "epoch": 0.03516422179972441, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 4032 + }, + { + "epoch": 0.03517294308489299, + "grad_norm": 0.71875, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 4033 + }, + { + "epoch": 0.03518166437006157, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 4034 + }, + { + "epoch": 0.03519038565523015, + "grad_norm": 0.7109375, + "learning_rate": 0.0005, + "loss": 1.0826, + "step": 4035 + }, + { + "epoch": 0.03519910694039874, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 4036 + }, + { + "epoch": 0.03520782822556732, + "grad_norm": 0.78515625, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 4037 + }, + { + "epoch": 0.035216549510735905, + "grad_norm": 0.392578125, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 4038 + }, + { + "epoch": 0.035225270795904484, + "grad_norm": 0.796875, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 4039 + }, + { + "epoch": 0.035233992081073064, + "grad_norm": 0.5078125, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 4040 + }, + { + "epoch": 0.03524271336624165, + "grad_norm": 0.74609375, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4041 + }, + { + "epoch": 0.03525143465141023, + "grad_norm": 0.69921875, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 4042 + }, + { + "epoch": 0.035260155936578816, + "grad_norm": 0.69921875, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 4043 + }, + { + "epoch": 0.035268877221747395, + "grad_norm": 0.6328125, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 4044 + }, + { + "epoch": 0.03527759850691598, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4045 + }, + { + "epoch": 0.03528631979208456, + "grad_norm": 0.6015625, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 4046 + }, + { + "epoch": 0.03529504107725314, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 4047 + }, + { + "epoch": 0.03530376236242173, + "grad_norm": 0.466796875, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 4048 + }, + { + "epoch": 0.035312483647590306, + "grad_norm": 0.447265625, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 4049 + }, + { + "epoch": 0.03532120493275889, + "grad_norm": 0.447265625, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 4050 + }, + { + "epoch": 0.03532992621792747, + "grad_norm": 0.462890625, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 4051 + }, + { + "epoch": 0.03533864750309606, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 4052 + }, + { + "epoch": 0.03534736878826464, + "grad_norm": 0.353515625, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4053 + }, + { + "epoch": 0.035356090073433225, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 4054 + }, + { + "epoch": 0.035364811358601804, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0805, + "step": 4055 + }, + { + "epoch": 0.03537353264377038, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4056 + }, + { + "epoch": 0.03538225392893897, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 4057 + }, + { + "epoch": 0.03539097521410755, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4058 + }, + { + "epoch": 0.035399696499276136, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 4059 + }, + { + "epoch": 0.035408417784444715, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4060 + }, + { + "epoch": 0.0354171390696133, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 4061 + }, + { + "epoch": 0.03542586035478188, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 4062 + }, + { + "epoch": 0.03543458163995046, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 4063 + }, + { + "epoch": 0.03544330292511905, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 4064 + }, + { + "epoch": 0.035452024210287626, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4065 + }, + { + "epoch": 0.03546074549545621, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4066 + }, + { + "epoch": 0.03546946678062479, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 4067 + }, + { + "epoch": 0.03547818806579338, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4068 + }, + { + "epoch": 0.03548690935096196, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 4069 + }, + { + "epoch": 0.03549563063613054, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4070 + }, + { + "epoch": 0.035504351921299124, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 4071 + }, + { + "epoch": 0.0355130732064677, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0937, + "step": 4072 + }, + { + "epoch": 0.03552179449163629, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4073 + }, + { + "epoch": 0.03553051577680487, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 4074 + }, + { + "epoch": 0.035539237061973455, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4075 + }, + { + "epoch": 0.035547958347142035, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4076 + }, + { + "epoch": 0.035556679632310614, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0791, + "step": 4077 + }, + { + "epoch": 0.0355654009174792, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 4078 + }, + { + "epoch": 0.03557412220264778, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4079 + }, + { + "epoch": 0.03558284348781637, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 4080 + }, + { + "epoch": 0.035591564772984946, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 4081 + }, + { + "epoch": 0.03560028605815353, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4082 + }, + { + "epoch": 0.03560900734332211, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4083 + }, + { + "epoch": 0.03561772862849069, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 4084 + }, + { + "epoch": 0.03562644991365928, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4085 + }, + { + "epoch": 0.03563517119882786, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 4086 + }, + { + "epoch": 0.035643892483996444, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4087 + }, + { + "epoch": 0.03565261376916502, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4088 + }, + { + "epoch": 0.03566133505433361, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 4089 + }, + { + "epoch": 0.03567005633950219, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 4090 + }, + { + "epoch": 0.03567877762467077, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4091 + }, + { + "epoch": 0.035687498909839355, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 4092 + }, + { + "epoch": 0.035696220195007934, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0836, + "step": 4093 + }, + { + "epoch": 0.03570494148017652, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 4094 + }, + { + "epoch": 0.0357136627653451, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 4095 + }, + { + "epoch": 0.035722384050513686, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4096 + }, + { + "epoch": 0.035731105335682266, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4097 + }, + { + "epoch": 0.035739826620850845, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 4098 + }, + { + "epoch": 0.03574854790601943, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 4099 + }, + { + "epoch": 0.03575726919118801, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 4100 + }, + { + "epoch": 0.0357659904763566, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4101 + }, + { + "epoch": 0.03577471176152518, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4102 + }, + { + "epoch": 0.03578343304669376, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 4103 + }, + { + "epoch": 0.03579215433186234, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0795, + "step": 4104 + }, + { + "epoch": 0.03580087561703092, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4105 + }, + { + "epoch": 0.03580959690219951, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 4106 + }, + { + "epoch": 0.03581831818736809, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4107 + }, + { + "epoch": 0.035827039472536674, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4108 + }, + { + "epoch": 0.035835760757705254, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 4109 + }, + { + "epoch": 0.03584448204287384, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 4110 + }, + { + "epoch": 0.03585320332804242, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 4111 + }, + { + "epoch": 0.035861924613211006, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4112 + }, + { + "epoch": 0.035870645898379586, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 4113 + }, + { + "epoch": 0.035879367183548165, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4114 + }, + { + "epoch": 0.03588808846871675, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4115 + }, + { + "epoch": 0.03589680975388533, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4116 + }, + { + "epoch": 0.03590553103905392, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4117 + }, + { + "epoch": 0.0359142523242225, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 4118 + }, + { + "epoch": 0.03592297360939108, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 4119 + }, + { + "epoch": 0.03593169489455966, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 4120 + }, + { + "epoch": 0.03594041617972824, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 4121 + }, + { + "epoch": 0.03594913746489683, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4122 + }, + { + "epoch": 0.03595785875006541, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 4123 + }, + { + "epoch": 0.035966580035233994, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 4124 + }, + { + "epoch": 0.035975301320402574, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 4125 + }, + { + "epoch": 0.03598402260557116, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4126 + }, + { + "epoch": 0.03599274389073974, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 4127 + }, + { + "epoch": 0.03600146517590832, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4128 + }, + { + "epoch": 0.036010186461076905, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 4129 + }, + { + "epoch": 0.036018907746245485, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 4130 + }, + { + "epoch": 0.03602762903141407, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 4131 + }, + { + "epoch": 0.03603635031658265, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4132 + }, + { + "epoch": 0.03604507160175124, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 4133 + }, + { + "epoch": 0.03605379288691982, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4134 + }, + { + "epoch": 0.036062514172088396, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4135 + }, + { + "epoch": 0.03607123545725698, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4136 + }, + { + "epoch": 0.03607995674242556, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 4137 + }, + { + "epoch": 0.03608867802759415, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4138 + }, + { + "epoch": 0.03609739931276273, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4139 + }, + { + "epoch": 0.036106120597931314, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4140 + }, + { + "epoch": 0.036114841883099894, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4141 + }, + { + "epoch": 0.03612356316826847, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 4142 + }, + { + "epoch": 0.03613228445343706, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 4143 + }, + { + "epoch": 0.03614100573860564, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4144 + }, + { + "epoch": 0.036149727023774225, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 4145 + }, + { + "epoch": 0.036158448308942805, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4146 + }, + { + "epoch": 0.03616716959411139, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4147 + }, + { + "epoch": 0.03617589087927997, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 4148 + }, + { + "epoch": 0.03618461216444855, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 4149 + }, + { + "epoch": 0.036193333449617136, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 4150 + }, + { + "epoch": 0.036202054734785716, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 4151 + }, + { + "epoch": 0.0362107760199543, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4152 + }, + { + "epoch": 0.03621949730512288, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4153 + }, + { + "epoch": 0.03622821859029147, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 4154 + }, + { + "epoch": 0.03623693987546005, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 4155 + }, + { + "epoch": 0.03624566116062863, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 4156 + }, + { + "epoch": 0.03625438244579721, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 4157 + }, + { + "epoch": 0.03626310373096579, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 4158 + }, + { + "epoch": 0.03627182501613438, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4159 + }, + { + "epoch": 0.03628054630130296, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 4160 + }, + { + "epoch": 0.036289267586471545, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4161 + }, + { + "epoch": 0.036297988871640124, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0709, + "step": 4162 + }, + { + "epoch": 0.036306710156808704, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 4163 + }, + { + "epoch": 0.03631543144197729, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 4164 + }, + { + "epoch": 0.03632415272714587, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4165 + }, + { + "epoch": 0.036332874012314456, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 4166 + }, + { + "epoch": 0.036341595297483036, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 4167 + }, + { + "epoch": 0.03635031658265162, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 4168 + }, + { + "epoch": 0.0363590378678202, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 4169 + }, + { + "epoch": 0.03636775915298879, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4170 + }, + { + "epoch": 0.03637648043815737, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 4171 + }, + { + "epoch": 0.03638520172332595, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4172 + }, + { + "epoch": 0.03639392300849453, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4173 + }, + { + "epoch": 0.03640264429366311, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 4174 + }, + { + "epoch": 0.0364113655788317, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 4175 + }, + { + "epoch": 0.03642008686400028, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 4176 + }, + { + "epoch": 0.036428808149168865, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 4177 + }, + { + "epoch": 0.036437529434337444, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 4178 + }, + { + "epoch": 0.036446250719506024, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 4179 + }, + { + "epoch": 0.03645497200467461, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 4180 + }, + { + "epoch": 0.03646369328984319, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.082, + "step": 4181 + }, + { + "epoch": 0.036472414575011776, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 4182 + }, + { + "epoch": 0.036481135860180355, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 4183 + }, + { + "epoch": 0.03648985714534894, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 4184 + }, + { + "epoch": 0.03649857843051752, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4185 + }, + { + "epoch": 0.0365072997156861, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4186 + }, + { + "epoch": 0.03651602100085469, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 4187 + }, + { + "epoch": 0.036524742286023267, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 4188 + }, + { + "epoch": 0.03653346357119185, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 4189 + }, + { + "epoch": 0.03654218485636043, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 4190 + }, + { + "epoch": 0.03655090614152902, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4191 + }, + { + "epoch": 0.0365596274266976, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 4192 + }, + { + "epoch": 0.03656834871186618, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4193 + }, + { + "epoch": 0.036577069997034764, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4194 + }, + { + "epoch": 0.036585791282203343, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 4195 + }, + { + "epoch": 0.03659451256737193, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 4196 + }, + { + "epoch": 0.03660323385254051, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4197 + }, + { + "epoch": 0.036611955137709096, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 4198 + }, + { + "epoch": 0.036620676422877675, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4199 + }, + { + "epoch": 0.036629397708046255, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4200 + }, + { + "epoch": 0.03663811899321484, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0766, + "step": 4201 + }, + { + "epoch": 0.03664684027838342, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 4202 + }, + { + "epoch": 0.03665556156355201, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0826, + "step": 4203 + }, + { + "epoch": 0.036664282848720586, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4204 + }, + { + "epoch": 0.03667300413388917, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 4205 + }, + { + "epoch": 0.03668172541905775, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 4206 + }, + { + "epoch": 0.03669044670422633, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 4207 + }, + { + "epoch": 0.03669916798939492, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 4208 + }, + { + "epoch": 0.0367078892745635, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4209 + }, + { + "epoch": 0.036716610559732084, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4210 + }, + { + "epoch": 0.03672533184490066, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4211 + }, + { + "epoch": 0.03673405313006925, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4212 + }, + { + "epoch": 0.03674277441523783, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4213 + }, + { + "epoch": 0.03675149570040641, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0749, + "step": 4214 + }, + { + "epoch": 0.036760216985574995, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 4215 + }, + { + "epoch": 0.036768938270743574, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4216 + }, + { + "epoch": 0.03677765955591216, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 4217 + }, + { + "epoch": 0.03678638084108074, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0862, + "step": 4218 + }, + { + "epoch": 0.03679510212624933, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 4219 + }, + { + "epoch": 0.036803823411417906, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4220 + }, + { + "epoch": 0.036812544696586486, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 4221 + }, + { + "epoch": 0.03682126598175507, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 4222 + }, + { + "epoch": 0.03682998726692365, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4223 + }, + { + "epoch": 0.03683870855209224, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4224 + }, + { + "epoch": 0.03684742983726082, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 4225 + }, + { + "epoch": 0.036856151122429404, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 4226 + }, + { + "epoch": 0.03686487240759798, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 4227 + }, + { + "epoch": 0.03687359369276657, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4228 + }, + { + "epoch": 0.03688231497793515, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4229 + }, + { + "epoch": 0.03689103626310373, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 4230 + }, + { + "epoch": 0.036899757548272315, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 4231 + }, + { + "epoch": 0.036908478833440894, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 4232 + }, + { + "epoch": 0.03691720011860948, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 4233 + }, + { + "epoch": 0.03692592140377806, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 4234 + }, + { + "epoch": 0.036934642688946646, + "grad_norm": 0.326171875, + "learning_rate": 0.0005, + "loss": 1.0844, + "step": 4235 + }, + { + "epoch": 0.036943363974115226, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4236 + }, + { + "epoch": 0.036952085259283805, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 4237 + }, + { + "epoch": 0.03696080654445239, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0843, + "step": 4238 + }, + { + "epoch": 0.03696952782962097, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4239 + }, + { + "epoch": 0.03697824911478956, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 4240 + }, + { + "epoch": 0.03698697039995814, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 4241 + }, + { + "epoch": 0.03699569168512672, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4242 + }, + { + "epoch": 0.0370044129702953, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 4243 + }, + { + "epoch": 0.03701313425546388, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4244 + }, + { + "epoch": 0.03702185554063247, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 4245 + }, + { + "epoch": 0.03703057682580105, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 4246 + }, + { + "epoch": 0.037039298110969635, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4247 + }, + { + "epoch": 0.037048019396138214, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 4248 + }, + { + "epoch": 0.0370567406813068, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 4249 + }, + { + "epoch": 0.03706546196647538, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 4250 + }, + { + "epoch": 0.03707418325164396, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4251 + }, + { + "epoch": 0.037082904536812546, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 4252 + }, + { + "epoch": 0.037091625821981125, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 4253 + }, + { + "epoch": 0.03710034710714971, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4254 + }, + { + "epoch": 0.03710906839231829, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4255 + }, + { + "epoch": 0.03711778967748688, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 4256 + }, + { + "epoch": 0.03712651096265546, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 4257 + }, + { + "epoch": 0.037135232247824036, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4258 + }, + { + "epoch": 0.03714395353299262, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 4259 + }, + { + "epoch": 0.0371526748181612, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 4260 + }, + { + "epoch": 0.03716139610332979, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 4261 + }, + { + "epoch": 0.03717011738849837, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 4262 + }, + { + "epoch": 0.037178838673666954, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4263 + }, + { + "epoch": 0.037187559958835534, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 4264 + }, + { + "epoch": 0.03719628124400411, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 4265 + }, + { + "epoch": 0.0372050025291727, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0915, + "step": 4266 + }, + { + "epoch": 0.03721372381434128, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 4267 + }, + { + "epoch": 0.037222445099509865, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 4268 + }, + { + "epoch": 0.037231166384678445, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4269 + }, + { + "epoch": 0.03723988766984703, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4270 + }, + { + "epoch": 0.03724860895501561, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 4271 + }, + { + "epoch": 0.03725733024018419, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 4272 + }, + { + "epoch": 0.03726605152535278, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4273 + }, + { + "epoch": 0.037274772810521356, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4274 + }, + { + "epoch": 0.03728349409568994, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 4275 + }, + { + "epoch": 0.03729221538085852, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4276 + }, + { + "epoch": 0.03730093666602711, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 4277 + }, + { + "epoch": 0.03730965795119569, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4278 + }, + { + "epoch": 0.03731837923636427, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4279 + }, + { + "epoch": 0.037327100521532854, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4280 + }, + { + "epoch": 0.03733582180670143, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 4281 + }, + { + "epoch": 0.03734454309187002, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 4282 + }, + { + "epoch": 0.0373532643770386, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 4283 + }, + { + "epoch": 0.037361985662207185, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 4284 + }, + { + "epoch": 0.037370706947375765, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 4285 + }, + { + "epoch": 0.03737942823254435, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 4286 + }, + { + "epoch": 0.03738814951771293, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4287 + }, + { + "epoch": 0.03739687080288151, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0812, + "step": 4288 + }, + { + "epoch": 0.037405592088050096, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4289 + }, + { + "epoch": 0.037414313373218676, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0709, + "step": 4290 + }, + { + "epoch": 0.03742303465838726, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4291 + }, + { + "epoch": 0.03743175594355584, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 4292 + }, + { + "epoch": 0.03744047722872443, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 4293 + }, + { + "epoch": 0.03744919851389301, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4294 + }, + { + "epoch": 0.03745791979906159, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 4295 + }, + { + "epoch": 0.03746664108423017, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 4296 + }, + { + "epoch": 0.03747536236939875, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 4297 + }, + { + "epoch": 0.03748408365456734, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4298 + }, + { + "epoch": 0.03749280493973592, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4299 + }, + { + "epoch": 0.037501526224904505, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 4300 + }, + { + "epoch": 0.037510247510073084, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4301 + }, + { + "epoch": 0.037518968795241664, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4302 + }, + { + "epoch": 0.03752769008041025, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4303 + }, + { + "epoch": 0.03753641136557883, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4304 + }, + { + "epoch": 0.037545132650747416, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 4305 + }, + { + "epoch": 0.037553853935915996, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 4306 + }, + { + "epoch": 0.03756257522108458, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 4307 + }, + { + "epoch": 0.03757129650625316, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 4308 + }, + { + "epoch": 0.03758001779142174, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 4309 + }, + { + "epoch": 0.03758873907659033, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 4310 + }, + { + "epoch": 0.03759746036175891, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 4311 + }, + { + "epoch": 0.03760618164692749, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 4312 + }, + { + "epoch": 0.03761490293209607, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4313 + }, + { + "epoch": 0.03762362421726466, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 4314 + }, + { + "epoch": 0.03763234550243324, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4315 + }, + { + "epoch": 0.03764106678760182, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4316 + }, + { + "epoch": 0.037649788072770404, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 4317 + }, + { + "epoch": 0.037658509357938984, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 4318 + }, + { + "epoch": 0.03766723064310757, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4319 + }, + { + "epoch": 0.03767595192827615, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 4320 + }, + { + "epoch": 0.037684673213444736, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 4321 + }, + { + "epoch": 0.037693394498613315, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4322 + }, + { + "epoch": 0.037702115783781895, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 4323 + }, + { + "epoch": 0.03771083706895048, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4324 + }, + { + "epoch": 0.03771955835411906, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4325 + }, + { + "epoch": 0.03772827963928765, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 4326 + }, + { + "epoch": 0.03773700092445623, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4327 + }, + { + "epoch": 0.03774572220962481, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4328 + }, + { + "epoch": 0.03775444349479339, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0829, + "step": 4329 + }, + { + "epoch": 0.03776316477996197, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 4330 + }, + { + "epoch": 0.03777188606513056, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4331 + }, + { + "epoch": 0.03778060735029914, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 4332 + }, + { + "epoch": 0.037789328635467724, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 4333 + }, + { + "epoch": 0.037798049920636304, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4334 + }, + { + "epoch": 0.03780677120580489, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 4335 + }, + { + "epoch": 0.03781549249097347, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 4336 + }, + { + "epoch": 0.03782421377614205, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 4337 + }, + { + "epoch": 0.037832935061310635, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4338 + }, + { + "epoch": 0.037841656346479215, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 4339 + }, + { + "epoch": 0.0378503776316478, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 4340 + }, + { + "epoch": 0.03785909891681638, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 4341 + }, + { + "epoch": 0.03786782020198497, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 4342 + }, + { + "epoch": 0.037876541487153546, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 4343 + }, + { + "epoch": 0.03788526277232213, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4344 + }, + { + "epoch": 0.03789398405749071, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 4345 + }, + { + "epoch": 0.03790270534265929, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 4346 + }, + { + "epoch": 0.03791142662782788, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 4347 + }, + { + "epoch": 0.03792014791299646, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 4348 + }, + { + "epoch": 0.037928869198165044, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4349 + }, + { + "epoch": 0.03793759048333362, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 4350 + }, + { + "epoch": 0.03794631176850221, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 4351 + }, + { + "epoch": 0.03795503305367079, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 4352 + }, + { + "epoch": 0.03796375433883937, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4353 + }, + { + "epoch": 0.037972475624007955, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4354 + }, + { + "epoch": 0.037981196909176534, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4355 + }, + { + "epoch": 0.03798991819434512, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 4356 + }, + { + "epoch": 0.0379986394795137, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 4357 + }, + { + "epoch": 0.03800736076468229, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 4358 + }, + { + "epoch": 0.038016082049850866, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4359 + }, + { + "epoch": 0.038024803335019446, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4360 + }, + { + "epoch": 0.03803352462018803, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4361 + }, + { + "epoch": 0.03804224590535661, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 4362 + }, + { + "epoch": 0.0380509671905252, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4363 + }, + { + "epoch": 0.03805968847569378, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4364 + }, + { + "epoch": 0.038068409760862364, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 4365 + }, + { + "epoch": 0.03807713104603094, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4366 + }, + { + "epoch": 0.03808585233119952, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 4367 + }, + { + "epoch": 0.03809457361636811, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4368 + }, + { + "epoch": 0.03810329490153669, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 4369 + }, + { + "epoch": 0.038112016186705275, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4370 + }, + { + "epoch": 0.038120737471873854, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 4371 + }, + { + "epoch": 0.03812945875704244, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4372 + }, + { + "epoch": 0.03813818004221102, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4373 + }, + { + "epoch": 0.0381469013273796, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 4374 + }, + { + "epoch": 0.038155622612548186, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 4375 + }, + { + "epoch": 0.038164343897716765, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4376 + }, + { + "epoch": 0.03817306518288535, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4377 + }, + { + "epoch": 0.03818178646805393, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 4378 + }, + { + "epoch": 0.03819050775322252, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4379 + }, + { + "epoch": 0.0381992290383911, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 4380 + }, + { + "epoch": 0.038207950323559677, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 4381 + }, + { + "epoch": 0.03821667160872826, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 4382 + }, + { + "epoch": 0.03822539289389684, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 4383 + }, + { + "epoch": 0.03823411417906543, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4384 + }, + { + "epoch": 0.03824283546423401, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4385 + }, + { + "epoch": 0.038251556749402595, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4386 + }, + { + "epoch": 0.038260278034571174, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0872, + "step": 4387 + }, + { + "epoch": 0.038268999319739753, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 4388 + }, + { + "epoch": 0.03827772060490834, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 4389 + }, + { + "epoch": 0.03828644189007692, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4390 + }, + { + "epoch": 0.038295163175245506, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4391 + }, + { + "epoch": 0.038303884460414085, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4392 + }, + { + "epoch": 0.03831260574558267, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4393 + }, + { + "epoch": 0.03832132703075125, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4394 + }, + { + "epoch": 0.03833004831591984, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 4395 + }, + { + "epoch": 0.03833876960108842, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4396 + }, + { + "epoch": 0.038347490886256996, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4397 + }, + { + "epoch": 0.03835621217142558, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4398 + }, + { + "epoch": 0.03836493345659416, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4399 + }, + { + "epoch": 0.03837365474176275, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0801, + "step": 4400 + }, + { + "epoch": 0.03838237602693133, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0845, + "step": 4401 + }, + { + "epoch": 0.038391097312099914, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 4402 + }, + { + "epoch": 0.038399818597268494, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 4403 + }, + { + "epoch": 0.03840853988243707, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 4404 + }, + { + "epoch": 0.03841726116760566, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 4405 + }, + { + "epoch": 0.03842598245277424, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 4406 + }, + { + "epoch": 0.038434703737942826, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4407 + }, + { + "epoch": 0.038443425023111405, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 4408 + }, + { + "epoch": 0.03845214630827999, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 4409 + }, + { + "epoch": 0.03846086759344857, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 4410 + }, + { + "epoch": 0.03846958887861715, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 4411 + }, + { + "epoch": 0.03847831016378574, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 4412 + }, + { + "epoch": 0.038487031448954316, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 4413 + }, + { + "epoch": 0.0384957527341229, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 4414 + }, + { + "epoch": 0.03850447401929148, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 4415 + }, + { + "epoch": 0.03851319530446007, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4416 + }, + { + "epoch": 0.03852191658962865, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 4417 + }, + { + "epoch": 0.03853063787479723, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4418 + }, + { + "epoch": 0.038539359159965814, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 4419 + }, + { + "epoch": 0.03854808044513439, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4420 + }, + { + "epoch": 0.03855680173030298, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 4421 + }, + { + "epoch": 0.03856552301547156, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 4422 + }, + { + "epoch": 0.038574244300640145, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4423 + }, + { + "epoch": 0.038582965585808725, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 4424 + }, + { + "epoch": 0.038591686870977304, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4425 + }, + { + "epoch": 0.03860040815614589, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 4426 + }, + { + "epoch": 0.03860912944131447, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4427 + }, + { + "epoch": 0.038617850726483056, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4428 + }, + { + "epoch": 0.038626572011651636, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 4429 + }, + { + "epoch": 0.03863529329682022, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 4430 + }, + { + "epoch": 0.0386440145819888, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 4431 + }, + { + "epoch": 0.03865273586715738, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 4432 + }, + { + "epoch": 0.03866145715232597, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4433 + }, + { + "epoch": 0.03867017843749455, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4434 + }, + { + "epoch": 0.03867889972266313, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4435 + }, + { + "epoch": 0.03868762100783171, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4436 + }, + { + "epoch": 0.0386963422930003, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 4437 + }, + { + "epoch": 0.03870506357816888, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 4438 + }, + { + "epoch": 0.03871378486333746, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 4439 + }, + { + "epoch": 0.038722506148506045, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 4440 + }, + { + "epoch": 0.038731227433674624, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4441 + }, + { + "epoch": 0.03873994871884321, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 4442 + }, + { + "epoch": 0.03874867000401179, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4443 + }, + { + "epoch": 0.038757391289180376, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4444 + }, + { + "epoch": 0.038766112574348956, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 4445 + }, + { + "epoch": 0.038774833859517535, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 4446 + }, + { + "epoch": 0.03878355514468612, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 4447 + }, + { + "epoch": 0.0387922764298547, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4448 + }, + { + "epoch": 0.03880099771502329, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4449 + }, + { + "epoch": 0.03880971900019187, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 4450 + }, + { + "epoch": 0.03881844028536045, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4451 + }, + { + "epoch": 0.03882716157052903, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 4452 + }, + { + "epoch": 0.03883588285569762, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4453 + }, + { + "epoch": 0.0388446041408662, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 4454 + }, + { + "epoch": 0.03885332542603478, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4455 + }, + { + "epoch": 0.038862046711203364, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4456 + }, + { + "epoch": 0.038870767996371944, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4457 + }, + { + "epoch": 0.03887948928154053, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4458 + }, + { + "epoch": 0.03888821056670911, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 4459 + }, + { + "epoch": 0.038896931851877696, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4460 + }, + { + "epoch": 0.038905653137046275, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 4461 + }, + { + "epoch": 0.038914374422214855, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 4462 + }, + { + "epoch": 0.03892309570738344, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4463 + }, + { + "epoch": 0.03893181699255202, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 4464 + }, + { + "epoch": 0.03894053827772061, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 4465 + }, + { + "epoch": 0.03894925956288919, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 4466 + }, + { + "epoch": 0.03895798084805777, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0775, + "step": 4467 + }, + { + "epoch": 0.03896670213322635, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 4468 + }, + { + "epoch": 0.03897542341839493, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4469 + }, + { + "epoch": 0.03898414470356352, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4470 + }, + { + "epoch": 0.0389928659887321, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4471 + }, + { + "epoch": 0.039001587273900684, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4472 + }, + { + "epoch": 0.039010308559069264, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4473 + }, + { + "epoch": 0.03901902984423785, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.093, + "step": 4474 + }, + { + "epoch": 0.03902775112940643, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0809, + "step": 4475 + }, + { + "epoch": 0.03903647241457501, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 4476 + }, + { + "epoch": 0.039045193699743595, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4477 + }, + { + "epoch": 0.039053914984912175, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4478 + }, + { + "epoch": 0.03906263627008076, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 4479 + }, + { + "epoch": 0.03907135755524934, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0775, + "step": 4480 + }, + { + "epoch": 0.03908007884041793, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 4481 + }, + { + "epoch": 0.039088800125586506, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 4482 + }, + { + "epoch": 0.039097521410755086, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 4483 + }, + { + "epoch": 0.03910624269592367, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 4484 + }, + { + "epoch": 0.03911496398109225, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 4485 + }, + { + "epoch": 0.03912368526626084, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4486 + }, + { + "epoch": 0.03913240655142942, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4487 + }, + { + "epoch": 0.039141127836598004, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 4488 + }, + { + "epoch": 0.03914984912176658, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0766, + "step": 4489 + }, + { + "epoch": 0.03915857040693516, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 4490 + }, + { + "epoch": 0.03916729169210375, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0791, + "step": 4491 + }, + { + "epoch": 0.03917601297727233, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4492 + }, + { + "epoch": 0.039184734262440915, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 4493 + }, + { + "epoch": 0.039193455547609495, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 4494 + }, + { + "epoch": 0.03920217683277808, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4495 + }, + { + "epoch": 0.03921089811794666, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 4496 + }, + { + "epoch": 0.03921961940311524, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 4497 + }, + { + "epoch": 0.039228340688283826, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 4498 + }, + { + "epoch": 0.039237061973452406, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4499 + }, + { + "epoch": 0.03924578325862099, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4500 + }, + { + "epoch": 0.03925450454378957, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 4501 + }, + { + "epoch": 0.03926322582895816, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 4502 + }, + { + "epoch": 0.03927194711412674, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4503 + }, + { + "epoch": 0.03928066839929532, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 4504 + }, + { + "epoch": 0.0392893896844639, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0799, + "step": 4505 + }, + { + "epoch": 0.03929811096963248, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 4506 + }, + { + "epoch": 0.03930683225480107, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 4507 + }, + { + "epoch": 0.03931555353996965, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 4508 + }, + { + "epoch": 0.039324274825138235, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4509 + }, + { + "epoch": 0.039332996110306814, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 4510 + }, + { + "epoch": 0.0393417173954754, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 4511 + }, + { + "epoch": 0.03935043868064398, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 4512 + }, + { + "epoch": 0.03935915996581256, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 4513 + }, + { + "epoch": 0.039367881250981146, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4514 + }, + { + "epoch": 0.039376602536149725, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4515 + }, + { + "epoch": 0.03938532382131831, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4516 + }, + { + "epoch": 0.03939404510648689, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 4517 + }, + { + "epoch": 0.03940276639165548, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4518 + }, + { + "epoch": 0.03941148767682406, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 4519 + }, + { + "epoch": 0.03942020896199264, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4520 + }, + { + "epoch": 0.03942893024716122, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 4521 + }, + { + "epoch": 0.0394376515323298, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 4522 + }, + { + "epoch": 0.03944637281749839, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4523 + }, + { + "epoch": 0.03945509410266697, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 4524 + }, + { + "epoch": 0.039463815387835555, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4525 + }, + { + "epoch": 0.039472536673004134, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 4526 + }, + { + "epoch": 0.039481257958172714, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 4527 + }, + { + "epoch": 0.0394899792433413, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 4528 + }, + { + "epoch": 0.03949870052850988, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 4529 + }, + { + "epoch": 0.039507421813678466, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4530 + }, + { + "epoch": 0.039516143098847045, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 4531 + }, + { + "epoch": 0.03952486438401563, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4532 + }, + { + "epoch": 0.03953358566918421, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0801, + "step": 4533 + }, + { + "epoch": 0.03954230695435279, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4534 + }, + { + "epoch": 0.03955102823952138, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 4535 + }, + { + "epoch": 0.039559749524689956, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4536 + }, + { + "epoch": 0.03956847080985854, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 4537 + }, + { + "epoch": 0.03957719209502712, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 4538 + }, + { + "epoch": 0.03958591338019571, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 4539 + }, + { + "epoch": 0.03959463466536429, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4540 + }, + { + "epoch": 0.03960335595053287, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 4541 + }, + { + "epoch": 0.039612077235701454, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4542 + }, + { + "epoch": 0.03962079852087003, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 4543 + }, + { + "epoch": 0.03962951980603862, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4544 + }, + { + "epoch": 0.0396382410912072, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 4545 + }, + { + "epoch": 0.039646962376375786, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4546 + }, + { + "epoch": 0.039655683661544365, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0809, + "step": 4547 + }, + { + "epoch": 0.039664404946712944, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4548 + }, + { + "epoch": 0.03967312623188153, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4549 + }, + { + "epoch": 0.03968184751705011, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4550 + }, + { + "epoch": 0.0396905688022187, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 4551 + }, + { + "epoch": 0.039699290087387276, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4552 + }, + { + "epoch": 0.03970801137255586, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 4553 + }, + { + "epoch": 0.03971673265772444, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4554 + }, + { + "epoch": 0.03972545394289302, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4555 + }, + { + "epoch": 0.03973417522806161, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4556 + }, + { + "epoch": 0.03974289651323019, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4557 + }, + { + "epoch": 0.039751617798398774, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4558 + }, + { + "epoch": 0.03976033908356735, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4559 + }, + { + "epoch": 0.03976906036873594, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 4560 + }, + { + "epoch": 0.03977778165390452, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4561 + }, + { + "epoch": 0.0397865029390731, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4562 + }, + { + "epoch": 0.039795224224241685, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 4563 + }, + { + "epoch": 0.039803945509410264, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4564 + }, + { + "epoch": 0.03981266679457885, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 4565 + }, + { + "epoch": 0.03982138807974743, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4566 + }, + { + "epoch": 0.039830109364916017, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0749, + "step": 4567 + }, + { + "epoch": 0.039838830650084596, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4568 + }, + { + "epoch": 0.03984755193525318, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4569 + }, + { + "epoch": 0.03985627322042176, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 4570 + }, + { + "epoch": 0.03986499450559034, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4571 + }, + { + "epoch": 0.03987371579075893, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4572 + }, + { + "epoch": 0.03988243707592751, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4573 + }, + { + "epoch": 0.039891158361096093, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4574 + }, + { + "epoch": 0.03989987964626467, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4575 + }, + { + "epoch": 0.03990860093143326, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 4576 + }, + { + "epoch": 0.03991732221660184, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 4577 + }, + { + "epoch": 0.03992604350177042, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 4578 + }, + { + "epoch": 0.039934764786939005, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 4579 + }, + { + "epoch": 0.039943486072107584, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4580 + }, + { + "epoch": 0.03995220735727617, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 4581 + }, + { + "epoch": 0.03996092864244475, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 4582 + }, + { + "epoch": 0.039969649927613336, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4583 + }, + { + "epoch": 0.039978371212781916, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4584 + }, + { + "epoch": 0.039987092497950495, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 4585 + }, + { + "epoch": 0.03999581378311908, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 4586 + }, + { + "epoch": 0.04000453506828766, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4587 + }, + { + "epoch": 0.04001325635345625, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 4588 + }, + { + "epoch": 0.04002197763862483, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4589 + }, + { + "epoch": 0.04003069892379341, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4590 + }, + { + "epoch": 0.04003942020896199, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 4591 + }, + { + "epoch": 0.04004814149413057, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 4592 + }, + { + "epoch": 0.04005686277929916, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 4593 + }, + { + "epoch": 0.04006558406446774, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 4594 + }, + { + "epoch": 0.040074305349636324, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 4595 + }, + { + "epoch": 0.040083026634804904, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4596 + }, + { + "epoch": 0.04009174791997349, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 4597 + }, + { + "epoch": 0.04010046920514207, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4598 + }, + { + "epoch": 0.04010919049031065, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0812, + "step": 4599 + }, + { + "epoch": 0.040117911775479236, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4600 + }, + { + "epoch": 0.040126633060647815, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 4601 + }, + { + "epoch": 0.0401353543458164, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4602 + }, + { + "epoch": 0.04014407563098498, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4603 + }, + { + "epoch": 0.04015279691615357, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4604 + }, + { + "epoch": 0.04016151820132215, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 4605 + }, + { + "epoch": 0.040170239486490726, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 4606 + }, + { + "epoch": 0.04017896077165931, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 4607 + }, + { + "epoch": 0.04018768205682789, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4608 + }, + { + "epoch": 0.04019640334199648, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4609 + }, + { + "epoch": 0.04020512462716506, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 4610 + }, + { + "epoch": 0.040213845912333644, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0914, + "step": 4611 + }, + { + "epoch": 0.040222567197502224, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 4612 + }, + { + "epoch": 0.0402312884826708, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4613 + }, + { + "epoch": 0.04024000976783939, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 4614 + }, + { + "epoch": 0.04024873105300797, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4615 + }, + { + "epoch": 0.040257452338176555, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4616 + }, + { + "epoch": 0.040266173623345135, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 4617 + }, + { + "epoch": 0.04027489490851372, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 4618 + }, + { + "epoch": 0.0402836161936823, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 4619 + }, + { + "epoch": 0.04029233747885088, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4620 + }, + { + "epoch": 0.040301058764019466, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 4621 + }, + { + "epoch": 0.040309780049188046, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 4622 + }, + { + "epoch": 0.04031850133435663, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 4623 + }, + { + "epoch": 0.04032722261952521, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4624 + }, + { + "epoch": 0.0403359439046938, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4625 + }, + { + "epoch": 0.04034466518986238, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4626 + }, + { + "epoch": 0.040353386475030964, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4627 + }, + { + "epoch": 0.04036210776019954, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4628 + }, + { + "epoch": 0.04037082904536812, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4629 + }, + { + "epoch": 0.04037955033053671, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4630 + }, + { + "epoch": 0.04038827161570529, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0749, + "step": 4631 + }, + { + "epoch": 0.040396992900873875, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 4632 + }, + { + "epoch": 0.040405714186042455, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4633 + }, + { + "epoch": 0.04041443547121104, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4634 + }, + { + "epoch": 0.04042315675637962, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 4635 + }, + { + "epoch": 0.0404318780415482, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 4636 + }, + { + "epoch": 0.040440599326716786, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 4637 + }, + { + "epoch": 0.040449320611885366, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 4638 + }, + { + "epoch": 0.04045804189705395, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4639 + }, + { + "epoch": 0.04046676318222253, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4640 + }, + { + "epoch": 0.04047548446739112, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4641 + }, + { + "epoch": 0.0404842057525597, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4642 + }, + { + "epoch": 0.04049292703772828, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 4643 + }, + { + "epoch": 0.04050164832289686, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 4644 + }, + { + "epoch": 0.04051036960806544, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4645 + }, + { + "epoch": 0.04051909089323403, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4646 + }, + { + "epoch": 0.04052781217840261, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4647 + }, + { + "epoch": 0.040536533463571195, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4648 + }, + { + "epoch": 0.040545254748739774, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 4649 + }, + { + "epoch": 0.040553976033908354, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 4650 + }, + { + "epoch": 0.04056269731907694, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4651 + }, + { + "epoch": 0.04057141860424552, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 4652 + }, + { + "epoch": 0.040580139889414106, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 4653 + }, + { + "epoch": 0.040588861174582685, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 4654 + }, + { + "epoch": 0.04059758245975127, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4655 + }, + { + "epoch": 0.04060630374491985, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 4656 + }, + { + "epoch": 0.04061502503008843, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 4657 + }, + { + "epoch": 0.04062374631525702, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4658 + }, + { + "epoch": 0.0406324676004256, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0788, + "step": 4659 + }, + { + "epoch": 0.04064118888559418, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0791, + "step": 4660 + }, + { + "epoch": 0.04064991017076276, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4661 + }, + { + "epoch": 0.04065863145593135, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 4662 + }, + { + "epoch": 0.04066735274109993, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 4663 + }, + { + "epoch": 0.04067607402626851, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4664 + }, + { + "epoch": 0.040684795311437094, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4665 + }, + { + "epoch": 0.040693516596605674, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 4666 + }, + { + "epoch": 0.04070223788177426, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4667 + }, + { + "epoch": 0.04071095916694284, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4668 + }, + { + "epoch": 0.040719680452111426, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4669 + }, + { + "epoch": 0.040728401737280005, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 4670 + }, + { + "epoch": 0.040737123022448585, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 4671 + }, + { + "epoch": 0.04074584430761717, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4672 + }, + { + "epoch": 0.04075456559278575, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 4673 + }, + { + "epoch": 0.04076328687795434, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4674 + }, + { + "epoch": 0.040772008163122916, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 4675 + }, + { + "epoch": 0.0407807294482915, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4676 + }, + { + "epoch": 0.04078945073346008, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 4677 + }, + { + "epoch": 0.04079817201862866, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4678 + }, + { + "epoch": 0.04080689330379725, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4679 + }, + { + "epoch": 0.04081561458896583, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 4680 + }, + { + "epoch": 0.040824335874134414, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 4681 + }, + { + "epoch": 0.04083305715930299, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 4682 + }, + { + "epoch": 0.04084177844447158, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 4683 + }, + { + "epoch": 0.04085049972964016, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 4684 + }, + { + "epoch": 0.040859221014808746, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 4685 + }, + { + "epoch": 0.040867942299977325, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4686 + }, + { + "epoch": 0.040876663585145905, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 4687 + }, + { + "epoch": 0.04088538487031449, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 4688 + }, + { + "epoch": 0.04089410615548307, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4689 + }, + { + "epoch": 0.04090282744065166, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4690 + }, + { + "epoch": 0.040911548725820236, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 4691 + }, + { + "epoch": 0.04092027001098882, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4692 + }, + { + "epoch": 0.0409289912961574, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 4693 + }, + { + "epoch": 0.04093771258132598, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4694 + }, + { + "epoch": 0.04094643386649457, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4695 + }, + { + "epoch": 0.04095515515166315, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 4696 + }, + { + "epoch": 0.040963876436831734, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 4697 + }, + { + "epoch": 0.04097259772200031, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 4698 + }, + { + "epoch": 0.0409813190071689, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4699 + }, + { + "epoch": 0.04099004029233748, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4700 + }, + { + "epoch": 0.04099876157750606, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 4701 + }, + { + "epoch": 0.041007482862674645, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 4702 + }, + { + "epoch": 0.041016204147843224, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 4703 + }, + { + "epoch": 0.04102492543301181, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 4704 + }, + { + "epoch": 0.04103364671818039, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0814, + "step": 4705 + }, + { + "epoch": 0.04104236800334898, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4706 + }, + { + "epoch": 0.041051089288517556, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4707 + }, + { + "epoch": 0.041059810573686135, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 4708 + }, + { + "epoch": 0.04106853185885472, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4709 + }, + { + "epoch": 0.0410772531440233, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 4710 + }, + { + "epoch": 0.04108597442919189, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4711 + }, + { + "epoch": 0.04109469571436047, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4712 + }, + { + "epoch": 0.041103416999529054, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0749, + "step": 4713 + }, + { + "epoch": 0.04111213828469763, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4714 + }, + { + "epoch": 0.04112085956986621, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4715 + }, + { + "epoch": 0.0411295808550348, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 4716 + }, + { + "epoch": 0.04113830214020338, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 4717 + }, + { + "epoch": 0.041147023425371965, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4718 + }, + { + "epoch": 0.041155744710540544, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4719 + }, + { + "epoch": 0.04116446599570913, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4720 + }, + { + "epoch": 0.04117318728087771, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 4721 + }, + { + "epoch": 0.04118190856604629, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4722 + }, + { + "epoch": 0.041190629851214876, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 4723 + }, + { + "epoch": 0.041199351136383455, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4724 + }, + { + "epoch": 0.04120807242155204, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4725 + }, + { + "epoch": 0.04121679370672062, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0856, + "step": 4726 + }, + { + "epoch": 0.04122551499188921, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 4727 + }, + { + "epoch": 0.04123423627705779, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4728 + }, + { + "epoch": 0.041242957562226366, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 4729 + }, + { + "epoch": 0.04125167884739495, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 4730 + }, + { + "epoch": 0.04126040013256353, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 4731 + }, + { + "epoch": 0.04126912141773212, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4732 + }, + { + "epoch": 0.0412778427029007, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 4733 + }, + { + "epoch": 0.041286563988069284, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4734 + }, + { + "epoch": 0.041295285273237864, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4735 + }, + { + "epoch": 0.04130400655840644, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4736 + }, + { + "epoch": 0.04131272784357503, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4737 + }, + { + "epoch": 0.04132144912874361, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4738 + }, + { + "epoch": 0.041330170413912196, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4739 + }, + { + "epoch": 0.041338891699080775, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 4740 + }, + { + "epoch": 0.04134761298424936, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4741 + }, + { + "epoch": 0.04135633426941794, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 4742 + }, + { + "epoch": 0.04136505555458653, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4743 + }, + { + "epoch": 0.04137377683975511, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 4744 + }, + { + "epoch": 0.041382498124923686, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4745 + }, + { + "epoch": 0.04139121941009227, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4746 + }, + { + "epoch": 0.04139994069526085, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4747 + }, + { + "epoch": 0.04140866198042944, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4748 + }, + { + "epoch": 0.04141738326559802, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 4749 + }, + { + "epoch": 0.041426104550766604, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4750 + }, + { + "epoch": 0.041434825835935184, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 4751 + }, + { + "epoch": 0.04144354712110376, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 4752 + }, + { + "epoch": 0.04145226840627235, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 4753 + }, + { + "epoch": 0.04146098969144093, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0885, + "step": 4754 + }, + { + "epoch": 0.041469710976609515, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4755 + }, + { + "epoch": 0.041478432261778095, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4756 + }, + { + "epoch": 0.04148715354694668, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4757 + }, + { + "epoch": 0.04149587483211526, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4758 + }, + { + "epoch": 0.04150459611728384, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 4759 + }, + { + "epoch": 0.041513317402452427, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4760 + }, + { + "epoch": 0.041522038687621006, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4761 + }, + { + "epoch": 0.04153075997278959, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4762 + }, + { + "epoch": 0.04153948125795817, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 4763 + }, + { + "epoch": 0.04154820254312676, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4764 + }, + { + "epoch": 0.04155692382829534, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 4765 + }, + { + "epoch": 0.04156564511346392, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 4766 + }, + { + "epoch": 0.041574366398632503, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 4767 + }, + { + "epoch": 0.04158308768380108, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4768 + }, + { + "epoch": 0.04159180896896967, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 4769 + }, + { + "epoch": 0.04160053025413825, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4770 + }, + { + "epoch": 0.041609251539306835, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4771 + }, + { + "epoch": 0.041617972824475415, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 4772 + }, + { + "epoch": 0.041626694109643994, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 4773 + }, + { + "epoch": 0.04163541539481258, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4774 + }, + { + "epoch": 0.04164413667998116, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 4775 + }, + { + "epoch": 0.041652857965149746, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 4776 + }, + { + "epoch": 0.041661579250318326, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4777 + }, + { + "epoch": 0.04167030053548691, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 4778 + }, + { + "epoch": 0.04167902182065549, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4779 + }, + { + "epoch": 0.04168774310582407, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 4780 + }, + { + "epoch": 0.04169646439099266, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4781 + }, + { + "epoch": 0.04170518567616124, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4782 + }, + { + "epoch": 0.04171390696132982, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 4783 + }, + { + "epoch": 0.0417226282464984, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4784 + }, + { + "epoch": 0.04173134953166699, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4785 + }, + { + "epoch": 0.04174007081683557, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 4786 + }, + { + "epoch": 0.04174879210200415, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4787 + }, + { + "epoch": 0.041757513387172734, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 4788 + }, + { + "epoch": 0.041766234672341314, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 4789 + }, + { + "epoch": 0.0417749559575099, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4790 + }, + { + "epoch": 0.04178367724267848, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4791 + }, + { + "epoch": 0.041792398527847066, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4792 + }, + { + "epoch": 0.041801119813015646, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4793 + }, + { + "epoch": 0.041809841098184225, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 4794 + }, + { + "epoch": 0.04181856238335281, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 4795 + }, + { + "epoch": 0.04182728366852139, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4796 + }, + { + "epoch": 0.04183600495368998, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 4797 + }, + { + "epoch": 0.04184472623885856, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 4798 + }, + { + "epoch": 0.04185344752402714, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 4799 + }, + { + "epoch": 0.04186216880919572, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4800 + }, + { + "epoch": 0.04187089009436431, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 4801 + }, + { + "epoch": 0.04187961137953289, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 4802 + }, + { + "epoch": 0.04188833266470147, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4803 + }, + { + "epoch": 0.041897053949870054, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 4804 + }, + { + "epoch": 0.041905775235038634, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4805 + }, + { + "epoch": 0.04191449652020722, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0805, + "step": 4806 + }, + { + "epoch": 0.0419232178053758, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4807 + }, + { + "epoch": 0.041931939090544386, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4808 + }, + { + "epoch": 0.041940660375712965, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4809 + }, + { + "epoch": 0.041949381660881545, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4810 + }, + { + "epoch": 0.04195810294605013, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 4811 + }, + { + "epoch": 0.04196682423121871, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 4812 + }, + { + "epoch": 0.0419755455163873, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4813 + }, + { + "epoch": 0.041984266801555876, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 4814 + }, + { + "epoch": 0.04199298808672446, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 4815 + }, + { + "epoch": 0.04200170937189304, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4816 + }, + { + "epoch": 0.04201043065706162, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 4817 + }, + { + "epoch": 0.04201915194223021, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 4818 + }, + { + "epoch": 0.04202787322739879, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 4819 + }, + { + "epoch": 0.042036594512567374, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 4820 + }, + { + "epoch": 0.04204531579773595, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4821 + }, + { + "epoch": 0.04205403708290454, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 4822 + }, + { + "epoch": 0.04206275836807312, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 4823 + }, + { + "epoch": 0.0420714796532417, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 4824 + }, + { + "epoch": 0.042080200938410285, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4825 + }, + { + "epoch": 0.042088922223578865, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 4826 + }, + { + "epoch": 0.04209764350874745, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4827 + }, + { + "epoch": 0.04210636479391603, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 4828 + }, + { + "epoch": 0.04211508607908462, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 4829 + }, + { + "epoch": 0.042123807364253196, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4830 + }, + { + "epoch": 0.042132528649421776, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 4831 + }, + { + "epoch": 0.04214124993459036, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 4832 + }, + { + "epoch": 0.04214997121975894, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 4833 + }, + { + "epoch": 0.04215869250492753, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 4834 + }, + { + "epoch": 0.04216741379009611, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4835 + }, + { + "epoch": 0.042176135075264694, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 4836 + }, + { + "epoch": 0.04218485636043327, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 4837 + }, + { + "epoch": 0.04219357764560185, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 4838 + }, + { + "epoch": 0.04220229893077044, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4839 + }, + { + "epoch": 0.04221102021593902, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4840 + }, + { + "epoch": 0.042219741501107605, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 4841 + }, + { + "epoch": 0.042228462786276184, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 4842 + }, + { + "epoch": 0.04223718407144477, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4843 + }, + { + "epoch": 0.04224590535661335, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4844 + }, + { + "epoch": 0.04225462664178193, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 4845 + }, + { + "epoch": 0.042263347926950516, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 4846 + }, + { + "epoch": 0.042272069212119096, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 4847 + }, + { + "epoch": 0.04228079049728768, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4848 + }, + { + "epoch": 0.04228951178245626, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4849 + }, + { + "epoch": 0.04229823306762485, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 4850 + }, + { + "epoch": 0.04230695435279343, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4851 + }, + { + "epoch": 0.04231567563796201, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4852 + }, + { + "epoch": 0.04232439692313059, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 4853 + }, + { + "epoch": 0.04233311820829917, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4854 + }, + { + "epoch": 0.04234183949346776, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 4855 + }, + { + "epoch": 0.04235056077863634, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4856 + }, + { + "epoch": 0.042359282063804925, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4857 + }, + { + "epoch": 0.042368003348973504, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4858 + }, + { + "epoch": 0.04237672463414209, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4859 + }, + { + "epoch": 0.04238544591931067, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 4860 + }, + { + "epoch": 0.04239416720447925, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 4861 + }, + { + "epoch": 0.042402888489647836, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 4862 + }, + { + "epoch": 0.042411609774816415, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4863 + }, + { + "epoch": 0.042420331059985, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4864 + }, + { + "epoch": 0.04242905234515358, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4865 + }, + { + "epoch": 0.04243777363032217, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4866 + }, + { + "epoch": 0.04244649491549075, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 4867 + }, + { + "epoch": 0.042455216200659326, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 4868 + }, + { + "epoch": 0.04246393748582791, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 4869 + }, + { + "epoch": 0.04247265877099649, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4870 + }, + { + "epoch": 0.04248138005616508, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4871 + }, + { + "epoch": 0.04249010134133366, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4872 + }, + { + "epoch": 0.042498822626502245, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 4873 + }, + { + "epoch": 0.042507543911670824, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4874 + }, + { + "epoch": 0.0425162651968394, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 4875 + }, + { + "epoch": 0.04252498648200799, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4876 + }, + { + "epoch": 0.04253370776717657, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 4877 + }, + { + "epoch": 0.042542429052345156, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 4878 + }, + { + "epoch": 0.042551150337513735, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0806, + "step": 4879 + }, + { + "epoch": 0.04255987162268232, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0709, + "step": 4880 + }, + { + "epoch": 0.0425685929078509, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 4881 + }, + { + "epoch": 0.04257731419301948, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 4882 + }, + { + "epoch": 0.04258603547818807, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4883 + }, + { + "epoch": 0.042594756763356646, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4884 + }, + { + "epoch": 0.04260347804852523, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 4885 + }, + { + "epoch": 0.04261219933369381, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4886 + }, + { + "epoch": 0.0426209206188624, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4887 + }, + { + "epoch": 0.04262964190403098, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4888 + }, + { + "epoch": 0.04263836318919956, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 4889 + }, + { + "epoch": 0.042647084474368144, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 4890 + }, + { + "epoch": 0.04265580575953672, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 4891 + }, + { + "epoch": 0.04266452704470531, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4892 + }, + { + "epoch": 0.04267324832987389, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4893 + }, + { + "epoch": 0.042681969615042475, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 4894 + }, + { + "epoch": 0.042690690900211055, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4895 + }, + { + "epoch": 0.042699412185379634, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 4896 + }, + { + "epoch": 0.04270813347054822, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4897 + }, + { + "epoch": 0.0427168547557168, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 4898 + }, + { + "epoch": 0.04272557604088539, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 4899 + }, + { + "epoch": 0.042734297326053966, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4900 + }, + { + "epoch": 0.04274301861122255, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4901 + }, + { + "epoch": 0.04275173989639113, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 4902 + }, + { + "epoch": 0.04276046118155971, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4903 + }, + { + "epoch": 0.0427691824667283, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.086, + "step": 4904 + }, + { + "epoch": 0.04277790375189688, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4905 + }, + { + "epoch": 0.042786625037065464, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4906 + }, + { + "epoch": 0.04279534632223404, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4907 + }, + { + "epoch": 0.04280406760740263, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 4908 + }, + { + "epoch": 0.04281278889257121, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4909 + }, + { + "epoch": 0.04282151017773979, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4910 + }, + { + "epoch": 0.042830231462908375, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 4911 + }, + { + "epoch": 0.042838952748076954, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4912 + }, + { + "epoch": 0.04284767403324554, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4913 + }, + { + "epoch": 0.04285639531841412, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4914 + }, + { + "epoch": 0.042865116603582706, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 4915 + }, + { + "epoch": 0.042873837888751286, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4916 + }, + { + "epoch": 0.04288255917391987, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 4917 + }, + { + "epoch": 0.04289128045908845, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 4918 + }, + { + "epoch": 0.04290000174425703, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 4919 + }, + { + "epoch": 0.04290872302942562, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4920 + }, + { + "epoch": 0.0429174443145942, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4921 + }, + { + "epoch": 0.04292616559976278, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 4922 + }, + { + "epoch": 0.04293488688493136, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4923 + }, + { + "epoch": 0.04294360817009995, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 4924 + }, + { + "epoch": 0.04295232945526853, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4925 + }, + { + "epoch": 0.04296105074043711, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0911, + "step": 4926 + }, + { + "epoch": 0.042969772025605694, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 4927 + }, + { + "epoch": 0.042978493310774274, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 4928 + }, + { + "epoch": 0.04298721459594286, + "grad_norm": 0.37109375, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4929 + }, + { + "epoch": 0.04299593588111144, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 4930 + }, + { + "epoch": 0.043004657166280026, + "grad_norm": 0.53515625, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 4931 + }, + { + "epoch": 0.043013378451448606, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 4932 + }, + { + "epoch": 0.043022099736617185, + "grad_norm": 0.390625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4933 + }, + { + "epoch": 0.04303082102178577, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4934 + }, + { + "epoch": 0.04303954230695435, + "grad_norm": 0.5, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 4935 + }, + { + "epoch": 0.04304826359212294, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 4936 + }, + { + "epoch": 0.04305698487729152, + "grad_norm": 0.439453125, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4937 + }, + { + "epoch": 0.0430657061624601, + "grad_norm": 0.38671875, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 4938 + }, + { + "epoch": 0.04307442744762868, + "grad_norm": 0.5859375, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 4939 + }, + { + "epoch": 0.04308314873279726, + "grad_norm": 0.419921875, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 4940 + }, + { + "epoch": 0.04309187001796585, + "grad_norm": 0.71484375, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4941 + }, + { + "epoch": 0.04310059130313443, + "grad_norm": 0.416015625, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 4942 + }, + { + "epoch": 0.043109312588303014, + "grad_norm": 0.7421875, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 4943 + }, + { + "epoch": 0.043118033873471594, + "grad_norm": 0.412109375, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 4944 + }, + { + "epoch": 0.04312675515864018, + "grad_norm": 0.66015625, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 4945 + }, + { + "epoch": 0.04313547644380876, + "grad_norm": 0.55078125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4946 + }, + { + "epoch": 0.04314419772897734, + "grad_norm": 1.0078125, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 4947 + }, + { + "epoch": 0.043152919014145925, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4948 + }, + { + "epoch": 0.043161640299314505, + "grad_norm": 0.69140625, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4949 + }, + { + "epoch": 0.04317036158448309, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 4950 + }, + { + "epoch": 0.04317908286965167, + "grad_norm": 0.625, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4951 + }, + { + "epoch": 0.04318780415482026, + "grad_norm": 0.60546875, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4952 + }, + { + "epoch": 0.04319652543998884, + "grad_norm": 0.54296875, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4953 + }, + { + "epoch": 0.043205246725157416, + "grad_norm": 0.6484375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 4954 + }, + { + "epoch": 0.043213968010326, + "grad_norm": 0.458984375, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 4955 + }, + { + "epoch": 0.04322268929549458, + "grad_norm": 0.5234375, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4956 + }, + { + "epoch": 0.04323141058066317, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4957 + }, + { + "epoch": 0.04324013186583175, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 4958 + }, + { + "epoch": 0.043248853151000334, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 4959 + }, + { + "epoch": 0.043257574436168914, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 4960 + }, + { + "epoch": 0.04326629572133749, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 4961 + }, + { + "epoch": 0.04327501700650608, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 4962 + }, + { + "epoch": 0.04328373829167466, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 4963 + }, + { + "epoch": 0.043292459576843245, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4964 + }, + { + "epoch": 0.043301180862011825, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4965 + }, + { + "epoch": 0.04330990214718041, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 4966 + }, + { + "epoch": 0.04331862343234899, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4967 + }, + { + "epoch": 0.04332734471751757, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0766, + "step": 4968 + }, + { + "epoch": 0.043336066002686156, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 4969 + }, + { + "epoch": 0.043344787287854736, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4970 + }, + { + "epoch": 0.04335350857302332, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4971 + }, + { + "epoch": 0.0433622298581919, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 4972 + }, + { + "epoch": 0.04337095114336049, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 4973 + }, + { + "epoch": 0.04337967242852907, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4974 + }, + { + "epoch": 0.043388393713697654, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0742, + "step": 4975 + }, + { + "epoch": 0.04339711499886623, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4976 + }, + { + "epoch": 0.04340583628403481, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4977 + }, + { + "epoch": 0.0434145575692034, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0777, + "step": 4978 + }, + { + "epoch": 0.04342327885437198, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4979 + }, + { + "epoch": 0.043432000139540565, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4980 + }, + { + "epoch": 0.043440721424709144, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 4981 + }, + { + "epoch": 0.04344944270987773, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 4982 + }, + { + "epoch": 0.04345816399504631, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4983 + }, + { + "epoch": 0.04346688528021489, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4984 + }, + { + "epoch": 0.043475606565383476, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4985 + }, + { + "epoch": 0.043484327850552056, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4986 + }, + { + "epoch": 0.04349304913572064, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 4987 + }, + { + "epoch": 0.04350177042088922, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4988 + }, + { + "epoch": 0.04351049170605781, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 4989 + }, + { + "epoch": 0.04351921299122639, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4990 + }, + { + "epoch": 0.04352793427639497, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4991 + }, + { + "epoch": 0.04353665556156355, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4992 + }, + { + "epoch": 0.04354537684673213, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4993 + }, + { + "epoch": 0.04355409813190072, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4994 + }, + { + "epoch": 0.0435628194170693, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 4995 + }, + { + "epoch": 0.043571540702237885, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 4996 + }, + { + "epoch": 0.043580261987406464, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4997 + }, + { + "epoch": 0.043588983272575044, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4998 + }, + { + "epoch": 0.04359770455774363, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 4999 + }, + { + "epoch": 0.04360642584291221, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5000 + }, + { + "epoch": 0.043615147128080796, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0935, + "step": 5001 + }, + { + "epoch": 0.043623868413249375, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5002 + }, + { + "epoch": 0.04363258969841796, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5003 + }, + { + "epoch": 0.04364131098358654, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 5004 + }, + { + "epoch": 0.04365003226875512, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 5005 + }, + { + "epoch": 0.04365875355392371, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 5006 + }, + { + "epoch": 0.043667474839092286, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5007 + }, + { + "epoch": 0.04367619612426087, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5008 + }, + { + "epoch": 0.04368491740942945, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 5009 + }, + { + "epoch": 0.04369363869459804, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5010 + }, + { + "epoch": 0.04370235997976662, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0881, + "step": 5011 + }, + { + "epoch": 0.0437110812649352, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 5012 + }, + { + "epoch": 0.043719802550103784, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5013 + }, + { + "epoch": 0.043728523835272363, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5014 + }, + { + "epoch": 0.04373724512044095, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 5015 + }, + { + "epoch": 0.04374596640560953, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 5016 + }, + { + "epoch": 0.043754687690778116, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5017 + }, + { + "epoch": 0.043763408975946695, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5018 + }, + { + "epoch": 0.043772130261115275, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5019 + }, + { + "epoch": 0.04378085154628386, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5020 + }, + { + "epoch": 0.04378957283145244, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5021 + }, + { + "epoch": 0.04379829411662103, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 5022 + }, + { + "epoch": 0.043807015401789606, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5023 + }, + { + "epoch": 0.04381573668695819, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 5024 + }, + { + "epoch": 0.04382445797212677, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5025 + }, + { + "epoch": 0.04383317925729535, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 5026 + }, + { + "epoch": 0.04384190054246394, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5027 + }, + { + "epoch": 0.04385062182763252, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5028 + }, + { + "epoch": 0.043859343112801104, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5029 + }, + { + "epoch": 0.04386806439796968, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5030 + }, + { + "epoch": 0.04387678568313827, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5031 + }, + { + "epoch": 0.04388550696830685, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5032 + }, + { + "epoch": 0.043894228253475435, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5033 + }, + { + "epoch": 0.043902949538644015, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5034 + }, + { + "epoch": 0.043911670823812594, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0924, + "step": 5035 + }, + { + "epoch": 0.04392039210898118, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 5036 + }, + { + "epoch": 0.04392911339414976, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5037 + }, + { + "epoch": 0.04393783467931835, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5038 + }, + { + "epoch": 0.043946555964486926, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5039 + }, + { + "epoch": 0.04395527724965551, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5040 + }, + { + "epoch": 0.04396399853482409, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5041 + }, + { + "epoch": 0.04397271981999267, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5042 + }, + { + "epoch": 0.04398144110516126, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 5043 + }, + { + "epoch": 0.04399016239032984, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5044 + }, + { + "epoch": 0.043998883675498424, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5045 + }, + { + "epoch": 0.044007604960667, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 5046 + }, + { + "epoch": 0.04401632624583559, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5047 + }, + { + "epoch": 0.04402504753100417, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5048 + }, + { + "epoch": 0.04403376881617275, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5049 + }, + { + "epoch": 0.044042490101341335, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5050 + }, + { + "epoch": 0.044051211386509914, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 5051 + }, + { + "epoch": 0.0440599326716785, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 5052 + }, + { + "epoch": 0.04406865395684708, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 5053 + }, + { + "epoch": 0.044077375242015666, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5054 + }, + { + "epoch": 0.044086096527184246, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5055 + }, + { + "epoch": 0.044094817812352825, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5056 + }, + { + "epoch": 0.04410353909752141, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5057 + }, + { + "epoch": 0.04411226038268999, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 5058 + }, + { + "epoch": 0.04412098166785858, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 5059 + }, + { + "epoch": 0.04412970295302716, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5060 + }, + { + "epoch": 0.04413842423819574, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5061 + }, + { + "epoch": 0.04414714552336432, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5062 + }, + { + "epoch": 0.0441558668085329, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 5063 + }, + { + "epoch": 0.04416458809370149, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5064 + }, + { + "epoch": 0.04417330937887007, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5065 + }, + { + "epoch": 0.044182030664038655, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 5066 + }, + { + "epoch": 0.044190751949207234, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 5067 + }, + { + "epoch": 0.04419947323437582, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 5068 + }, + { + "epoch": 0.0442081945195444, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5069 + }, + { + "epoch": 0.04421691580471298, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 5070 + }, + { + "epoch": 0.044225637089881566, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5071 + }, + { + "epoch": 0.044234358375050145, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5072 + }, + { + "epoch": 0.04424307966021873, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5073 + }, + { + "epoch": 0.04425180094538731, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5074 + }, + { + "epoch": 0.0442605222305559, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5075 + }, + { + "epoch": 0.04426924351572448, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5076 + }, + { + "epoch": 0.044277964800893056, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5077 + }, + { + "epoch": 0.04428668608606164, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 5078 + }, + { + "epoch": 0.04429540737123022, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 5079 + }, + { + "epoch": 0.04430412865639881, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5080 + }, + { + "epoch": 0.04431284994156739, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5081 + }, + { + "epoch": 0.044321571226735974, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 5082 + }, + { + "epoch": 0.044330292511904554, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5083 + }, + { + "epoch": 0.04433901379707314, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 5084 + }, + { + "epoch": 0.04434773508224172, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5085 + }, + { + "epoch": 0.0443564563674103, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5086 + }, + { + "epoch": 0.044365177652578885, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 5087 + }, + { + "epoch": 0.044373898937747465, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5088 + }, + { + "epoch": 0.04438262022291605, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 5089 + }, + { + "epoch": 0.04439134150808463, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 5090 + }, + { + "epoch": 0.04440006279325322, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5091 + }, + { + "epoch": 0.0444087840784218, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5092 + }, + { + "epoch": 0.044417505363590376, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 5093 + }, + { + "epoch": 0.04442622664875896, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5094 + }, + { + "epoch": 0.04443494793392754, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 5095 + }, + { + "epoch": 0.04444366921909613, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5096 + }, + { + "epoch": 0.04445239050426471, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5097 + }, + { + "epoch": 0.044461111789433294, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5098 + }, + { + "epoch": 0.044469833074601874, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5099 + }, + { + "epoch": 0.04447855435977045, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 5100 + }, + { + "epoch": 0.04448727564493904, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5101 + }, + { + "epoch": 0.04449599693010762, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 5102 + }, + { + "epoch": 0.044504718215276205, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5103 + }, + { + "epoch": 0.044513439500444785, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 5104 + }, + { + "epoch": 0.04452216078561337, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5105 + }, + { + "epoch": 0.04453088207078195, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5106 + }, + { + "epoch": 0.04453960335595053, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 5107 + }, + { + "epoch": 0.044548324641119116, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 5108 + }, + { + "epoch": 0.044557045926287696, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5109 + }, + { + "epoch": 0.04456576721145628, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 5110 + }, + { + "epoch": 0.04457448849662486, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5111 + }, + { + "epoch": 0.04458320978179345, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 5112 + }, + { + "epoch": 0.04459193106696203, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5113 + }, + { + "epoch": 0.04460065235213061, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5114 + }, + { + "epoch": 0.04460937363729919, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 5115 + }, + { + "epoch": 0.04461809492246777, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5116 + }, + { + "epoch": 0.04462681620763636, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 5117 + }, + { + "epoch": 0.04463553749280494, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5118 + }, + { + "epoch": 0.044644258777973525, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5119 + }, + { + "epoch": 0.044652980063142104, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 5120 + }, + { + "epoch": 0.044661701348310684, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5121 + }, + { + "epoch": 0.04467042263347927, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 5122 + }, + { + "epoch": 0.04467914391864785, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5123 + }, + { + "epoch": 0.044687865203816436, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 5124 + }, + { + "epoch": 0.044696586488985016, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 5125 + }, + { + "epoch": 0.0447053077741536, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5126 + }, + { + "epoch": 0.04471402905932218, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5127 + }, + { + "epoch": 0.04472275034449076, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5128 + }, + { + "epoch": 0.04473147162965935, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5129 + }, + { + "epoch": 0.04474019291482793, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 5130 + }, + { + "epoch": 0.04474891419999651, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5131 + }, + { + "epoch": 0.04475763548516509, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5132 + }, + { + "epoch": 0.04476635677033368, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5133 + }, + { + "epoch": 0.04477507805550226, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5134 + }, + { + "epoch": 0.04478379934067084, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5135 + }, + { + "epoch": 0.044792520625839424, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5136 + }, + { + "epoch": 0.044801241911008004, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5137 + }, + { + "epoch": 0.04480996319617659, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5138 + }, + { + "epoch": 0.04481868448134517, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5139 + }, + { + "epoch": 0.044827405766513756, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 5140 + }, + { + "epoch": 0.044836127051682335, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5141 + }, + { + "epoch": 0.04484484833685092, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0742, + "step": 5142 + }, + { + "epoch": 0.0448535696220195, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5143 + }, + { + "epoch": 0.04486229090718808, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 5144 + }, + { + "epoch": 0.04487101219235667, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 5145 + }, + { + "epoch": 0.04487973347752525, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5146 + }, + { + "epoch": 0.04488845476269383, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5147 + }, + { + "epoch": 0.04489717604786241, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5148 + }, + { + "epoch": 0.044905897333031, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5149 + }, + { + "epoch": 0.04491461861819958, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 5150 + }, + { + "epoch": 0.04492333990336816, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 5151 + }, + { + "epoch": 0.044932061188536744, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 5152 + }, + { + "epoch": 0.044940782473705324, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5153 + }, + { + "epoch": 0.04494950375887391, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 5154 + }, + { + "epoch": 0.04495822504404249, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5155 + }, + { + "epoch": 0.044966946329211076, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 5156 + }, + { + "epoch": 0.044975667614379655, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 5157 + }, + { + "epoch": 0.044984388899548235, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 5158 + }, + { + "epoch": 0.04499311018471682, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5159 + }, + { + "epoch": 0.0450018314698854, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5160 + }, + { + "epoch": 0.04501055275505399, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5161 + }, + { + "epoch": 0.045019274040222566, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 5162 + }, + { + "epoch": 0.04502799532539115, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5163 + }, + { + "epoch": 0.04503671661055973, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5164 + }, + { + "epoch": 0.04504543789572831, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 5165 + }, + { + "epoch": 0.0450541591808969, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5166 + }, + { + "epoch": 0.04506288046606548, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 5167 + }, + { + "epoch": 0.045071601751234064, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5168 + }, + { + "epoch": 0.04508032303640264, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5169 + }, + { + "epoch": 0.04508904432157123, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5170 + }, + { + "epoch": 0.04509776560673981, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5171 + }, + { + "epoch": 0.04510648689190839, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5172 + }, + { + "epoch": 0.045115208177076975, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5173 + }, + { + "epoch": 0.045123929462245554, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5174 + }, + { + "epoch": 0.04513265074741414, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5175 + }, + { + "epoch": 0.04514137203258272, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5176 + }, + { + "epoch": 0.04515009331775131, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5177 + }, + { + "epoch": 0.045158814602919886, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5178 + }, + { + "epoch": 0.045167535888088466, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5179 + }, + { + "epoch": 0.04517625717325705, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5180 + }, + { + "epoch": 0.04518497845842563, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5181 + }, + { + "epoch": 0.04519369974359422, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5182 + }, + { + "epoch": 0.0452024210287628, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 5183 + }, + { + "epoch": 0.045211142313931384, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 5184 + }, + { + "epoch": 0.04521986359909996, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5185 + }, + { + "epoch": 0.04522858488426854, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5186 + }, + { + "epoch": 0.04523730616943713, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5187 + }, + { + "epoch": 0.04524602745460571, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 5188 + }, + { + "epoch": 0.045254748739774295, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5189 + }, + { + "epoch": 0.045263470024942874, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5190 + }, + { + "epoch": 0.04527219131011146, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 5191 + }, + { + "epoch": 0.04528091259528004, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 5192 + }, + { + "epoch": 0.04528963388044862, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5193 + }, + { + "epoch": 0.045298355165617206, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 5194 + }, + { + "epoch": 0.045307076450785785, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 5195 + }, + { + "epoch": 0.04531579773595437, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 5196 + }, + { + "epoch": 0.04532451902112295, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 5197 + }, + { + "epoch": 0.04533324030629154, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 5198 + }, + { + "epoch": 0.04534196159146012, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 5199 + }, + { + "epoch": 0.0453506828766287, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5200 + }, + { + "epoch": 0.04535940416179728, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 5201 + }, + { + "epoch": 0.04536812544696586, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 5202 + }, + { + "epoch": 0.04537684673213445, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5203 + }, + { + "epoch": 0.04538556801730303, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 5204 + }, + { + "epoch": 0.045394289302471615, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 5205 + }, + { + "epoch": 0.045403010587640194, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 5206 + }, + { + "epoch": 0.04541173187280878, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5207 + }, + { + "epoch": 0.04542045315797736, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5208 + }, + { + "epoch": 0.04542917444314594, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5209 + }, + { + "epoch": 0.045437895728314526, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5210 + }, + { + "epoch": 0.045446617013483105, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5211 + }, + { + "epoch": 0.04545533829865169, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5212 + }, + { + "epoch": 0.04546405958382027, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 5213 + }, + { + "epoch": 0.04547278086898886, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5214 + }, + { + "epoch": 0.04548150215415744, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 5215 + }, + { + "epoch": 0.045490223439326016, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 5216 + }, + { + "epoch": 0.0454989447244946, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5217 + }, + { + "epoch": 0.04550766600966318, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 5218 + }, + { + "epoch": 0.04551638729483177, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0847, + "step": 5219 + }, + { + "epoch": 0.04552510858000035, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5220 + }, + { + "epoch": 0.045533829865168934, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 5221 + }, + { + "epoch": 0.045542551150337514, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5222 + }, + { + "epoch": 0.04555127243550609, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5223 + }, + { + "epoch": 0.04555999372067468, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5224 + }, + { + "epoch": 0.04556871500584326, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5225 + }, + { + "epoch": 0.045577436291011846, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 5226 + }, + { + "epoch": 0.045586157576180425, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5227 + }, + { + "epoch": 0.04559487886134901, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5228 + }, + { + "epoch": 0.04560360014651759, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5229 + }, + { + "epoch": 0.04561232143168617, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5230 + }, + { + "epoch": 0.04562104271685476, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5231 + }, + { + "epoch": 0.045629764002023336, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5232 + }, + { + "epoch": 0.04563848528719192, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5233 + }, + { + "epoch": 0.0456472065723605, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 5234 + }, + { + "epoch": 0.04565592785752909, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5235 + }, + { + "epoch": 0.04566464914269767, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 5236 + }, + { + "epoch": 0.04567337042786625, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5237 + }, + { + "epoch": 0.045682091713034834, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 5238 + }, + { + "epoch": 0.04569081299820341, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5239 + }, + { + "epoch": 0.045699534283372, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5240 + }, + { + "epoch": 0.04570825556854058, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5241 + }, + { + "epoch": 0.045716976853709165, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 5242 + }, + { + "epoch": 0.045725698138877745, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5243 + }, + { + "epoch": 0.045734419424046324, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5244 + }, + { + "epoch": 0.04574314070921491, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 5245 + }, + { + "epoch": 0.04575186199438349, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5246 + }, + { + "epoch": 0.045760583279552076, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 5247 + }, + { + "epoch": 0.045769304564720656, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5248 + }, + { + "epoch": 0.04577802584988924, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5249 + }, + { + "epoch": 0.04578674713505782, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5250 + }, + { + "epoch": 0.0457954684202264, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 5251 + }, + { + "epoch": 0.04580418970539499, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5252 + }, + { + "epoch": 0.04581291099056357, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5253 + }, + { + "epoch": 0.04582163227573215, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5254 + }, + { + "epoch": 0.04583035356090073, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 5255 + }, + { + "epoch": 0.04583907484606932, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 5256 + }, + { + "epoch": 0.0458477961312379, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5257 + }, + { + "epoch": 0.045856517416406485, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 5258 + }, + { + "epoch": 0.045865238701575065, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 5259 + }, + { + "epoch": 0.045873959986743644, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 5260 + }, + { + "epoch": 0.04588268127191223, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5261 + }, + { + "epoch": 0.04589140255708081, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5262 + }, + { + "epoch": 0.045900123842249396, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5263 + }, + { + "epoch": 0.045908845127417976, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0859, + "step": 5264 + }, + { + "epoch": 0.04591756641258656, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 5265 + }, + { + "epoch": 0.04592628769775514, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 5266 + }, + { + "epoch": 0.04593500898292372, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5267 + }, + { + "epoch": 0.04594373026809231, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5268 + }, + { + "epoch": 0.04595245155326089, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5269 + }, + { + "epoch": 0.04596117283842947, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 5270 + }, + { + "epoch": 0.04596989412359805, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 5271 + }, + { + "epoch": 0.04597861540876664, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 5272 + }, + { + "epoch": 0.04598733669393522, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5273 + }, + { + "epoch": 0.0459960579791038, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5274 + }, + { + "epoch": 0.046004779264272384, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5275 + }, + { + "epoch": 0.046013500549440964, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5276 + }, + { + "epoch": 0.04602222183460955, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5277 + }, + { + "epoch": 0.04603094311977813, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5278 + }, + { + "epoch": 0.046039664404946716, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 5279 + }, + { + "epoch": 0.046048385690115295, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5280 + }, + { + "epoch": 0.046057106975283875, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 5281 + }, + { + "epoch": 0.04606582826045246, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 5282 + }, + { + "epoch": 0.04607454954562104, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5283 + }, + { + "epoch": 0.04608327083078963, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 5284 + }, + { + "epoch": 0.04609199211595821, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 5285 + }, + { + "epoch": 0.04610071340112679, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5286 + }, + { + "epoch": 0.04610943468629537, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5287 + }, + { + "epoch": 0.04611815597146395, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5288 + }, + { + "epoch": 0.04612687725663254, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 5289 + }, + { + "epoch": 0.04613559854180112, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 5290 + }, + { + "epoch": 0.046144319826969704, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 5291 + }, + { + "epoch": 0.046153041112138284, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5292 + }, + { + "epoch": 0.04616176239730687, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5293 + }, + { + "epoch": 0.04617048368247545, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 5294 + }, + { + "epoch": 0.04617920496764403, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5295 + }, + { + "epoch": 0.046187926252812615, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5296 + }, + { + "epoch": 0.046196647537981195, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5297 + }, + { + "epoch": 0.04620536882314978, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 5298 + }, + { + "epoch": 0.04621409010831836, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5299 + }, + { + "epoch": 0.04622281139348695, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 5300 + }, + { + "epoch": 0.046231532678655526, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 5301 + }, + { + "epoch": 0.046240253963824106, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5302 + }, + { + "epoch": 0.04624897524899269, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 5303 + }, + { + "epoch": 0.04625769653416127, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5304 + }, + { + "epoch": 0.04626641781932986, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5305 + }, + { + "epoch": 0.04627513910449844, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5306 + }, + { + "epoch": 0.046283860389667024, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 5307 + }, + { + "epoch": 0.0462925816748356, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 5308 + }, + { + "epoch": 0.04630130296000418, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 5309 + }, + { + "epoch": 0.04631002424517277, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5310 + }, + { + "epoch": 0.04631874553034135, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 5311 + }, + { + "epoch": 0.046327466815509935, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5312 + }, + { + "epoch": 0.046336188100678515, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5313 + }, + { + "epoch": 0.0463449093858471, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 5314 + }, + { + "epoch": 0.04635363067101568, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 5315 + }, + { + "epoch": 0.04636235195618427, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 5316 + }, + { + "epoch": 0.046371073241352846, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5317 + }, + { + "epoch": 0.046379794526521426, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5318 + }, + { + "epoch": 0.04638851581169001, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 5319 + }, + { + "epoch": 0.04639723709685859, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5320 + }, + { + "epoch": 0.04640595838202718, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 5321 + }, + { + "epoch": 0.04641467966719576, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5322 + }, + { + "epoch": 0.046423400952364344, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5323 + }, + { + "epoch": 0.04643212223753292, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5324 + }, + { + "epoch": 0.0464408435227015, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 5325 + }, + { + "epoch": 0.04644956480787009, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5326 + }, + { + "epoch": 0.04645828609303867, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5327 + }, + { + "epoch": 0.046467007378207255, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5328 + }, + { + "epoch": 0.046475728663375834, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 5329 + }, + { + "epoch": 0.04648444994854442, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 5330 + }, + { + "epoch": 0.046493171233713, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5331 + }, + { + "epoch": 0.04650189251888158, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 5332 + }, + { + "epoch": 0.046510613804050166, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5333 + }, + { + "epoch": 0.046519335089218745, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 5334 + }, + { + "epoch": 0.04652805637438733, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 5335 + }, + { + "epoch": 0.04653677765955591, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 5336 + }, + { + "epoch": 0.0465454989447245, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 5337 + }, + { + "epoch": 0.04655422022989308, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5338 + }, + { + "epoch": 0.04656294151506166, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5339 + }, + { + "epoch": 0.04657166280023024, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5340 + }, + { + "epoch": 0.04658038408539882, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5341 + }, + { + "epoch": 0.04658910537056741, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 5342 + }, + { + "epoch": 0.04659782665573599, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5343 + }, + { + "epoch": 0.046606547940904575, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 5344 + }, + { + "epoch": 0.046615269226073154, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 5345 + }, + { + "epoch": 0.046623990511241734, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5346 + }, + { + "epoch": 0.04663271179641032, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 5347 + }, + { + "epoch": 0.0466414330815789, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5348 + }, + { + "epoch": 0.046650154366747486, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 5349 + }, + { + "epoch": 0.046658875651916065, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5350 + }, + { + "epoch": 0.04666759693708465, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 5351 + }, + { + "epoch": 0.04667631822225323, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5352 + }, + { + "epoch": 0.04668503950742181, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5353 + }, + { + "epoch": 0.0466937607925904, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 5354 + }, + { + "epoch": 0.046702482077758976, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5355 + }, + { + "epoch": 0.04671120336292756, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 5356 + }, + { + "epoch": 0.04671992464809614, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 5357 + }, + { + "epoch": 0.04672864593326473, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 5358 + }, + { + "epoch": 0.04673736721843331, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5359 + }, + { + "epoch": 0.04674608850360189, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 5360 + }, + { + "epoch": 0.046754809788770474, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5361 + }, + { + "epoch": 0.04676353107393905, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5362 + }, + { + "epoch": 0.04677225235910764, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5363 + }, + { + "epoch": 0.04678097364427622, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 5364 + }, + { + "epoch": 0.046789694929444806, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 5365 + }, + { + "epoch": 0.046798416214613385, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5366 + }, + { + "epoch": 0.046807137499781964, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5367 + }, + { + "epoch": 0.04681585878495055, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 5368 + }, + { + "epoch": 0.04682458007011913, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5369 + }, + { + "epoch": 0.04683330135528772, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 5370 + }, + { + "epoch": 0.046842022640456296, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 5371 + }, + { + "epoch": 0.04685074392562488, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5372 + }, + { + "epoch": 0.04685946521079346, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5373 + }, + { + "epoch": 0.04686818649596205, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5374 + }, + { + "epoch": 0.04687690778113063, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5375 + }, + { + "epoch": 0.04688562906629921, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 5376 + }, + { + "epoch": 0.046894350351467794, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 5377 + }, + { + "epoch": 0.04690307163663637, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5378 + }, + { + "epoch": 0.04691179292180496, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 5379 + }, + { + "epoch": 0.04692051420697354, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5380 + }, + { + "epoch": 0.046929235492142125, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5381 + }, + { + "epoch": 0.046937956777310705, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5382 + }, + { + "epoch": 0.046946678062479284, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 5383 + }, + { + "epoch": 0.04695539934764787, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5384 + }, + { + "epoch": 0.04696412063281645, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5385 + }, + { + "epoch": 0.046972841917985036, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 5386 + }, + { + "epoch": 0.046981563203153616, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 5387 + }, + { + "epoch": 0.0469902844883222, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5388 + }, + { + "epoch": 0.04699900577349078, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5389 + }, + { + "epoch": 0.04700772705865936, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5390 + }, + { + "epoch": 0.04701644834382795, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 5391 + }, + { + "epoch": 0.04702516962899653, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5392 + }, + { + "epoch": 0.047033890914165113, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5393 + }, + { + "epoch": 0.04704261219933369, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5394 + }, + { + "epoch": 0.04705133348450228, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5395 + }, + { + "epoch": 0.04706005476967086, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 5396 + }, + { + "epoch": 0.04706877605483944, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 5397 + }, + { + "epoch": 0.047077497340008025, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5398 + }, + { + "epoch": 0.047086218625176604, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 5399 + }, + { + "epoch": 0.04709493991034519, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5400 + }, + { + "epoch": 0.04710366119551377, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 5401 + }, + { + "epoch": 0.047112382480682356, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 5402 + }, + { + "epoch": 0.047121103765850936, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 5403 + }, + { + "epoch": 0.047129825051019515, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5404 + }, + { + "epoch": 0.0471385463361881, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 5405 + }, + { + "epoch": 0.04714726762135668, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5406 + }, + { + "epoch": 0.04715598890652527, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5407 + }, + { + "epoch": 0.04716471019169385, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5408 + }, + { + "epoch": 0.04717343147686243, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 5409 + }, + { + "epoch": 0.04718215276203101, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5410 + }, + { + "epoch": 0.04719087404719959, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5411 + }, + { + "epoch": 0.04719959533236818, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5412 + }, + { + "epoch": 0.04720831661753676, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5413 + }, + { + "epoch": 0.047217037902705344, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 5414 + }, + { + "epoch": 0.047225759187873924, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5415 + }, + { + "epoch": 0.04723448047304251, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 5416 + }, + { + "epoch": 0.04724320175821109, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 5417 + }, + { + "epoch": 0.04725192304337967, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5418 + }, + { + "epoch": 0.047260644328548256, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5419 + }, + { + "epoch": 0.047269365613716835, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5420 + }, + { + "epoch": 0.04727808689888542, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 5421 + }, + { + "epoch": 0.047286808184054, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5422 + }, + { + "epoch": 0.04729552946922259, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 5423 + }, + { + "epoch": 0.04730425075439117, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5424 + }, + { + "epoch": 0.047312972039559746, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 5425 + }, + { + "epoch": 0.04732169332472833, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5426 + }, + { + "epoch": 0.04733041460989691, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5427 + }, + { + "epoch": 0.0473391358950655, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5428 + }, + { + "epoch": 0.04734785718023408, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5429 + }, + { + "epoch": 0.047356578465402664, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5430 + }, + { + "epoch": 0.047365299750571244, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 5431 + }, + { + "epoch": 0.04737402103573983, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 5432 + }, + { + "epoch": 0.04738274232090841, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5433 + }, + { + "epoch": 0.04739146360607699, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 5434 + }, + { + "epoch": 0.047400184891245575, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5435 + }, + { + "epoch": 0.047408906176414155, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5436 + }, + { + "epoch": 0.04741762746158274, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5437 + }, + { + "epoch": 0.04742634874675132, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 5438 + }, + { + "epoch": 0.04743507003191991, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5439 + }, + { + "epoch": 0.047443791317088486, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 5440 + }, + { + "epoch": 0.047452512602257066, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 5441 + }, + { + "epoch": 0.04746123388742565, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 5442 + }, + { + "epoch": 0.04746995517259423, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5443 + }, + { + "epoch": 0.04747867645776282, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 5444 + }, + { + "epoch": 0.0474873977429314, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 5445 + }, + { + "epoch": 0.047496119028099984, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5446 + }, + { + "epoch": 0.04750484031326856, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5447 + }, + { + "epoch": 0.04751356159843714, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 5448 + }, + { + "epoch": 0.04752228288360573, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 5449 + }, + { + "epoch": 0.04753100416877431, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5450 + }, + { + "epoch": 0.047539725453942895, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 5451 + }, + { + "epoch": 0.047548446739111475, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5452 + }, + { + "epoch": 0.04755716802428006, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5453 + }, + { + "epoch": 0.04756588930944864, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5454 + }, + { + "epoch": 0.04757461059461722, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5455 + }, + { + "epoch": 0.047583331879785806, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 5456 + }, + { + "epoch": 0.047592053164954386, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 5457 + }, + { + "epoch": 0.04760077445012297, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5458 + }, + { + "epoch": 0.04760949573529155, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5459 + }, + { + "epoch": 0.04761821702046014, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5460 + }, + { + "epoch": 0.04762693830562872, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5461 + }, + { + "epoch": 0.0476356595907973, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 5462 + }, + { + "epoch": 0.04764438087596588, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5463 + }, + { + "epoch": 0.04765310216113446, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 5464 + }, + { + "epoch": 0.04766182344630305, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5465 + }, + { + "epoch": 0.04767054473147163, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5466 + }, + { + "epoch": 0.047679266016640215, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5467 + }, + { + "epoch": 0.047687987301808794, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5468 + }, + { + "epoch": 0.047696708586977374, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5469 + }, + { + "epoch": 0.04770542987214596, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 5470 + }, + { + "epoch": 0.04771415115731454, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5471 + }, + { + "epoch": 0.047722872442483126, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 5472 + }, + { + "epoch": 0.047731593727651705, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5473 + }, + { + "epoch": 0.04774031501282029, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 5474 + }, + { + "epoch": 0.04774903629798887, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5475 + }, + { + "epoch": 0.04775775758315745, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5476 + }, + { + "epoch": 0.04776647886832604, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5477 + }, + { + "epoch": 0.04777520015349462, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 5478 + }, + { + "epoch": 0.0477839214386632, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5479 + }, + { + "epoch": 0.04779264272383178, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5480 + }, + { + "epoch": 0.04780136400900037, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5481 + }, + { + "epoch": 0.04781008529416895, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5482 + }, + { + "epoch": 0.04781880657933753, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 5483 + }, + { + "epoch": 0.047827527864506114, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 5484 + }, + { + "epoch": 0.047836249149674694, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5485 + }, + { + "epoch": 0.04784497043484328, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5486 + }, + { + "epoch": 0.04785369172001186, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5487 + }, + { + "epoch": 0.047862413005180446, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5488 + }, + { + "epoch": 0.047871134290349025, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5489 + }, + { + "epoch": 0.04787985557551761, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 5490 + }, + { + "epoch": 0.04788857686068619, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5491 + }, + { + "epoch": 0.04789729814585477, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 5492 + }, + { + "epoch": 0.04790601943102336, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5493 + }, + { + "epoch": 0.047914740716191936, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 5494 + }, + { + "epoch": 0.04792346200136052, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5495 + }, + { + "epoch": 0.0479321832865291, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5496 + }, + { + "epoch": 0.04794090457169769, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5497 + }, + { + "epoch": 0.04794962585686627, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5498 + }, + { + "epoch": 0.04795834714203485, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 5499 + }, + { + "epoch": 0.047967068427203434, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 5500 + }, + { + "epoch": 0.04797578971237201, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5501 + }, + { + "epoch": 0.0479845109975406, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 5502 + }, + { + "epoch": 0.04799323228270918, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5503 + }, + { + "epoch": 0.048001953567877766, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5504 + }, + { + "epoch": 0.048010674853046345, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5505 + }, + { + "epoch": 0.048019396138214925, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5506 + }, + { + "epoch": 0.04802811742338351, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 5507 + }, + { + "epoch": 0.04803683870855209, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 5508 + }, + { + "epoch": 0.04804555999372068, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5509 + }, + { + "epoch": 0.048054281278889256, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5510 + }, + { + "epoch": 0.04806300256405784, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5511 + }, + { + "epoch": 0.04807172384922642, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5512 + }, + { + "epoch": 0.048080445134395, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5513 + }, + { + "epoch": 0.04808916641956359, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5514 + }, + { + "epoch": 0.04809788770473217, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5515 + }, + { + "epoch": 0.048106608989900754, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5516 + }, + { + "epoch": 0.04811533027506933, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5517 + }, + { + "epoch": 0.04812405156023792, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 5518 + }, + { + "epoch": 0.0481327728454065, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5519 + }, + { + "epoch": 0.04814149413057508, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 5520 + }, + { + "epoch": 0.048150215415743665, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5521 + }, + { + "epoch": 0.048158936700912244, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 5522 + }, + { + "epoch": 0.04816765798608083, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5523 + }, + { + "epoch": 0.04817637927124941, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5524 + }, + { + "epoch": 0.048185100556418, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 5525 + }, + { + "epoch": 0.048193821841586576, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5526 + }, + { + "epoch": 0.048202543126755155, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5527 + }, + { + "epoch": 0.04821126441192374, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5528 + }, + { + "epoch": 0.04821998569709232, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5529 + }, + { + "epoch": 0.04822870698226091, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 5530 + }, + { + "epoch": 0.04823742826742949, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 5531 + }, + { + "epoch": 0.048246149552598074, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5532 + }, + { + "epoch": 0.04825487083776665, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 5533 + }, + { + "epoch": 0.04826359212293523, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5534 + }, + { + "epoch": 0.04827231340810382, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 5535 + }, + { + "epoch": 0.0482810346932724, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5536 + }, + { + "epoch": 0.048289755978440985, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5537 + }, + { + "epoch": 0.048298477263609564, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5538 + }, + { + "epoch": 0.04830719854877815, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5539 + }, + { + "epoch": 0.04831591983394673, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5540 + }, + { + "epoch": 0.04832464111911531, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 5541 + }, + { + "epoch": 0.048333362404283896, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5542 + }, + { + "epoch": 0.048342083689452475, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 5543 + }, + { + "epoch": 0.04835080497462106, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5544 + }, + { + "epoch": 0.04835952625978964, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5545 + }, + { + "epoch": 0.04836824754495823, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5546 + }, + { + "epoch": 0.04837696883012681, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5547 + }, + { + "epoch": 0.04838569011529539, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 5548 + }, + { + "epoch": 0.04839441140046397, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 5549 + }, + { + "epoch": 0.04840313268563255, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 5550 + }, + { + "epoch": 0.04841185397080114, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 5551 + }, + { + "epoch": 0.04842057525596972, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5552 + }, + { + "epoch": 0.048429296541138304, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5553 + }, + { + "epoch": 0.048438017826306884, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5554 + }, + { + "epoch": 0.04844673911147547, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 5555 + }, + { + "epoch": 0.04845546039664405, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0836, + "step": 5556 + }, + { + "epoch": 0.04846418168181263, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 5557 + }, + { + "epoch": 0.048472902966981216, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5558 + }, + { + "epoch": 0.048481624252149795, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5559 + }, + { + "epoch": 0.04849034553731838, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 5560 + }, + { + "epoch": 0.04849906682248696, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 5561 + }, + { + "epoch": 0.04850778810765555, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5562 + }, + { + "epoch": 0.04851650939282413, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 5563 + }, + { + "epoch": 0.048525230677992706, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5564 + }, + { + "epoch": 0.04853395196316129, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 5565 + }, + { + "epoch": 0.04854267324832987, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5566 + }, + { + "epoch": 0.04855139453349846, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 5567 + }, + { + "epoch": 0.04856011581866704, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5568 + }, + { + "epoch": 0.048568837103835624, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5569 + }, + { + "epoch": 0.048577558389004204, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5570 + }, + { + "epoch": 0.04858627967417278, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0709, + "step": 5571 + }, + { + "epoch": 0.04859500095934137, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5572 + }, + { + "epoch": 0.04860372224450995, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 5573 + }, + { + "epoch": 0.048612443529678535, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 5574 + }, + { + "epoch": 0.048621164814847115, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5575 + }, + { + "epoch": 0.0486298861000157, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5576 + }, + { + "epoch": 0.04863860738518428, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5577 + }, + { + "epoch": 0.04864732867035286, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 5578 + }, + { + "epoch": 0.048656049955521447, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5579 + }, + { + "epoch": 0.048664771240690026, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5580 + }, + { + "epoch": 0.04867349252585861, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 5581 + }, + { + "epoch": 0.04868221381102719, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5582 + }, + { + "epoch": 0.04869093509619578, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 5583 + }, + { + "epoch": 0.04869965638136436, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 5584 + }, + { + "epoch": 0.04870837766653294, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5585 + }, + { + "epoch": 0.048717098951701523, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 5586 + }, + { + "epoch": 0.0487258202368701, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5587 + }, + { + "epoch": 0.04873454152203869, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 5588 + }, + { + "epoch": 0.04874326280720727, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5589 + }, + { + "epoch": 0.048751984092375855, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 5590 + }, + { + "epoch": 0.048760705377544435, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5591 + }, + { + "epoch": 0.048769426662713014, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5592 + }, + { + "epoch": 0.0487781479478816, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5593 + }, + { + "epoch": 0.04878686923305018, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5594 + }, + { + "epoch": 0.048795590518218766, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 5595 + }, + { + "epoch": 0.048804311803387346, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 5596 + }, + { + "epoch": 0.04881303308855593, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5597 + }, + { + "epoch": 0.04882175437372451, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 5598 + }, + { + "epoch": 0.04883047565889309, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5599 + }, + { + "epoch": 0.04883919694406168, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5600 + }, + { + "epoch": 0.04884791822923026, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 5601 + }, + { + "epoch": 0.04885663951439884, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5602 + }, + { + "epoch": 0.04886536079956742, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 5603 + }, + { + "epoch": 0.04887408208473601, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 5604 + }, + { + "epoch": 0.04888280336990459, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5605 + }, + { + "epoch": 0.048891524655073175, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 5606 + }, + { + "epoch": 0.048900245940241754, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 5607 + }, + { + "epoch": 0.048908967225410334, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5608 + }, + { + "epoch": 0.04891768851057892, + "grad_norm": 0.349609375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 5609 + }, + { + "epoch": 0.0489264097957475, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5610 + }, + { + "epoch": 0.048935131080916086, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5611 + }, + { + "epoch": 0.048943852366084666, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5612 + }, + { + "epoch": 0.04895257365125325, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5613 + }, + { + "epoch": 0.04896129493642183, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5614 + }, + { + "epoch": 0.04897001622159041, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5615 + }, + { + "epoch": 0.048978737506759, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5616 + }, + { + "epoch": 0.04898745879192758, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 5617 + }, + { + "epoch": 0.04899618007709616, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 5618 + }, + { + "epoch": 0.04900490136226474, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 5619 + }, + { + "epoch": 0.04901362264743333, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5620 + }, + { + "epoch": 0.04902234393260191, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 5621 + }, + { + "epoch": 0.04903106521777049, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5622 + }, + { + "epoch": 0.049039786502939074, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 5623 + }, + { + "epoch": 0.049048507788107654, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 5624 + }, + { + "epoch": 0.04905722907327624, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5625 + }, + { + "epoch": 0.04906595035844482, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5626 + }, + { + "epoch": 0.049074671643613406, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 5627 + }, + { + "epoch": 0.049083392928781985, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 5628 + }, + { + "epoch": 0.049092114213950565, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 5629 + }, + { + "epoch": 0.04910083549911915, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5630 + }, + { + "epoch": 0.04910955678428773, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 5631 + }, + { + "epoch": 0.04911827806945632, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 5632 + }, + { + "epoch": 0.049126999354624896, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 5633 + }, + { + "epoch": 0.04913572063979348, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 5634 + }, + { + "epoch": 0.04914444192496206, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 5635 + }, + { + "epoch": 0.04915316321013064, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5636 + }, + { + "epoch": 0.04916188449529923, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5637 + }, + { + "epoch": 0.04917060578046781, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5638 + }, + { + "epoch": 0.049179327065636394, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5639 + }, + { + "epoch": 0.04918804835080497, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 5640 + }, + { + "epoch": 0.04919676963597356, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5641 + }, + { + "epoch": 0.04920549092114214, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5642 + }, + { + "epoch": 0.04921421220631072, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 5643 + }, + { + "epoch": 0.049222933491479305, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5644 + }, + { + "epoch": 0.049231654776647885, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5645 + }, + { + "epoch": 0.04924037606181647, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 5646 + }, + { + "epoch": 0.04924909734698505, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5647 + }, + { + "epoch": 0.04925781863215364, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5648 + }, + { + "epoch": 0.049266539917322216, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 5649 + }, + { + "epoch": 0.049275261202490796, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 5650 + }, + { + "epoch": 0.04928398248765938, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5651 + }, + { + "epoch": 0.04929270377282796, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5652 + }, + { + "epoch": 0.04930142505799655, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 5653 + }, + { + "epoch": 0.04931014634316513, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 5654 + }, + { + "epoch": 0.049318867628333714, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 5655 + }, + { + "epoch": 0.04932758891350229, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5656 + }, + { + "epoch": 0.04933631019867087, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5657 + }, + { + "epoch": 0.04934503148383946, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 5658 + }, + { + "epoch": 0.04935375276900804, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 5659 + }, + { + "epoch": 0.049362474054176625, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5660 + }, + { + "epoch": 0.049371195339345204, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5661 + }, + { + "epoch": 0.04937991662451379, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 5662 + }, + { + "epoch": 0.04938863790968237, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5663 + }, + { + "epoch": 0.04939735919485096, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 5664 + }, + { + "epoch": 0.049406080480019536, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5665 + }, + { + "epoch": 0.049414801765188116, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 5666 + }, + { + "epoch": 0.0494235230503567, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5667 + }, + { + "epoch": 0.04943224433552528, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5668 + }, + { + "epoch": 0.04944096562069387, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5669 + }, + { + "epoch": 0.04944968690586245, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 5670 + }, + { + "epoch": 0.049458408191031034, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 5671 + }, + { + "epoch": 0.04946712947619961, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 5672 + }, + { + "epoch": 0.04947585076136819, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 5673 + }, + { + "epoch": 0.04948457204653678, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 5674 + }, + { + "epoch": 0.04949329333170536, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5675 + }, + { + "epoch": 0.049502014616873945, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5676 + }, + { + "epoch": 0.049510735902042524, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5677 + }, + { + "epoch": 0.04951945718721111, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 5678 + }, + { + "epoch": 0.04952817847237969, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 5679 + }, + { + "epoch": 0.04953689975754827, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5680 + }, + { + "epoch": 0.049545621042716856, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 5681 + }, + { + "epoch": 0.049554342327885435, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5682 + }, + { + "epoch": 0.04956306361305402, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 5683 + }, + { + "epoch": 0.0495717848982226, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 5684 + }, + { + "epoch": 0.04958050618339119, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5685 + }, + { + "epoch": 0.04958922746855977, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 5686 + }, + { + "epoch": 0.049597948753728346, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 5687 + }, + { + "epoch": 0.04960667003889693, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5688 + }, + { + "epoch": 0.04961539132406551, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5689 + }, + { + "epoch": 0.0496241126092341, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5690 + }, + { + "epoch": 0.04963283389440268, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5691 + }, + { + "epoch": 0.049641555179571265, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5692 + }, + { + "epoch": 0.049650276464739844, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5693 + }, + { + "epoch": 0.04965899774990842, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5694 + }, + { + "epoch": 0.04966771903507701, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 5695 + }, + { + "epoch": 0.04967644032024559, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 5696 + }, + { + "epoch": 0.049685161605414176, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 5697 + }, + { + "epoch": 0.049693882890582755, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5698 + }, + { + "epoch": 0.04970260417575134, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 5699 + }, + { + "epoch": 0.04971132546091992, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5700 + }, + { + "epoch": 0.0497200467460885, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 5701 + }, + { + "epoch": 0.04972876803125709, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 5702 + }, + { + "epoch": 0.049737489316425666, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5703 + }, + { + "epoch": 0.04974621060159425, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5704 + }, + { + "epoch": 0.04975493188676283, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5705 + }, + { + "epoch": 0.04976365317193142, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5706 + }, + { + "epoch": 0.0497723744571, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5707 + }, + { + "epoch": 0.04978109574226858, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 5708 + }, + { + "epoch": 0.049789817027437164, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 5709 + }, + { + "epoch": 0.04979853831260574, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 5710 + }, + { + "epoch": 0.04980725959777433, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5711 + }, + { + "epoch": 0.04981598088294291, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5712 + }, + { + "epoch": 0.049824702168111495, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5713 + }, + { + "epoch": 0.049833423453280075, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5714 + }, + { + "epoch": 0.049842144738448654, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 5715 + }, + { + "epoch": 0.04985086602361724, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5716 + }, + { + "epoch": 0.04985958730878582, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5717 + }, + { + "epoch": 0.04986830859395441, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 5718 + }, + { + "epoch": 0.049877029879122986, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5719 + }, + { + "epoch": 0.04988575116429157, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5720 + }, + { + "epoch": 0.04989447244946015, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 5721 + }, + { + "epoch": 0.04990319373462874, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5722 + }, + { + "epoch": 0.04991191501979732, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5723 + }, + { + "epoch": 0.0499206363049659, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5724 + }, + { + "epoch": 0.049929357590134484, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 5725 + }, + { + "epoch": 0.04993807887530306, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 5726 + }, + { + "epoch": 0.04994680016047165, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 5727 + }, + { + "epoch": 0.04995552144564023, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5728 + }, + { + "epoch": 0.049964242730808815, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 5729 + }, + { + "epoch": 0.049972964015977395, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 5730 + }, + { + "epoch": 0.049981685301145974, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5731 + }, + { + "epoch": 0.04999040658631456, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5732 + }, + { + "epoch": 0.04999912787148314, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5733 + }, + { + "epoch": 0.050007849156651726, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 5734 + }, + { + "epoch": 0.050016570441820306, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5735 + }, + { + "epoch": 0.05002529172698889, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 5736 + }, + { + "epoch": 0.05003401301215747, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5737 + }, + { + "epoch": 0.05004273429732605, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5738 + }, + { + "epoch": 0.05005145558249464, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5739 + }, + { + "epoch": 0.05006017686766322, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 5740 + }, + { + "epoch": 0.0500688981528318, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 5741 + }, + { + "epoch": 0.05007761943800038, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 5742 + }, + { + "epoch": 0.05008634072316897, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 5743 + }, + { + "epoch": 0.05009506200833755, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5744 + }, + { + "epoch": 0.05010378329350613, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5745 + }, + { + "epoch": 0.050112504578674714, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 5746 + }, + { + "epoch": 0.050121225863843294, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 5747 + }, + { + "epoch": 0.05012994714901188, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5748 + }, + { + "epoch": 0.05013866843418046, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5749 + }, + { + "epoch": 0.050147389719349046, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 5750 + }, + { + "epoch": 0.050156111004517626, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5751 + }, + { + "epoch": 0.050164832289686205, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 5752 + }, + { + "epoch": 0.05017355357485479, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 5753 + }, + { + "epoch": 0.05018227486002337, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5754 + }, + { + "epoch": 0.05019099614519196, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 5755 + }, + { + "epoch": 0.05019971743036054, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 5756 + }, + { + "epoch": 0.05020843871552912, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5757 + }, + { + "epoch": 0.0502171600006977, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 5758 + }, + { + "epoch": 0.05022588128586628, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 5759 + }, + { + "epoch": 0.05023460257103487, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 5760 + }, + { + "epoch": 0.05024332385620345, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5761 + }, + { + "epoch": 0.050252045141372034, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 5762 + }, + { + "epoch": 0.050260766426540614, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5763 + }, + { + "epoch": 0.0502694877117092, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0845, + "step": 5764 + }, + { + "epoch": 0.05027820899687778, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 5765 + }, + { + "epoch": 0.05028693028204636, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 5766 + }, + { + "epoch": 0.050295651567214945, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 5767 + }, + { + "epoch": 0.050304372852383525, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5768 + }, + { + "epoch": 0.05031309413755211, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5769 + }, + { + "epoch": 0.05032181542272069, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 5770 + }, + { + "epoch": 0.05033053670788928, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 5771 + }, + { + "epoch": 0.05033925799305786, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5772 + }, + { + "epoch": 0.050347979278226436, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5773 + }, + { + "epoch": 0.05035670056339502, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5774 + }, + { + "epoch": 0.0503654218485636, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 5775 + }, + { + "epoch": 0.05037414313373219, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 5776 + }, + { + "epoch": 0.05038286441890077, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 5777 + }, + { + "epoch": 0.050391585704069354, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5778 + }, + { + "epoch": 0.050400306989237934, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 5779 + }, + { + "epoch": 0.05040902827440652, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5780 + }, + { + "epoch": 0.0504177495595751, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5781 + }, + { + "epoch": 0.05042647084474368, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5782 + }, + { + "epoch": 0.050435192129912265, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5783 + }, + { + "epoch": 0.050443913415080845, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5784 + }, + { + "epoch": 0.05045263470024943, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5785 + }, + { + "epoch": 0.05046135598541801, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5786 + }, + { + "epoch": 0.0504700772705866, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5787 + }, + { + "epoch": 0.050478798555755176, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5788 + }, + { + "epoch": 0.050487519840923756, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5789 + }, + { + "epoch": 0.05049624112609234, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 5790 + }, + { + "epoch": 0.05050496241126092, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5791 + }, + { + "epoch": 0.05051368369642951, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5792 + }, + { + "epoch": 0.05052240498159809, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5793 + }, + { + "epoch": 0.050531126266766674, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 5794 + }, + { + "epoch": 0.05053984755193525, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5795 + }, + { + "epoch": 0.05054856883710383, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5796 + }, + { + "epoch": 0.05055729012227242, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5797 + }, + { + "epoch": 0.050566011407441, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 5798 + }, + { + "epoch": 0.050574732692609585, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 5799 + }, + { + "epoch": 0.050583453977778164, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 5800 + }, + { + "epoch": 0.05059217526294675, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5801 + }, + { + "epoch": 0.05060089654811533, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5802 + }, + { + "epoch": 0.05060961783328391, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 5803 + }, + { + "epoch": 0.050618339118452496, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5804 + }, + { + "epoch": 0.050627060403621076, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 5805 + }, + { + "epoch": 0.05063578168878966, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5806 + }, + { + "epoch": 0.05064450297395824, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 5807 + }, + { + "epoch": 0.05065322425912683, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 5808 + }, + { + "epoch": 0.05066194554429541, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5809 + }, + { + "epoch": 0.05067066682946399, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5810 + }, + { + "epoch": 0.05067938811463257, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 5811 + }, + { + "epoch": 0.05068810939980115, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5812 + }, + { + "epoch": 0.05069683068496974, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 5813 + }, + { + "epoch": 0.05070555197013832, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5814 + }, + { + "epoch": 0.050714273255306905, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5815 + }, + { + "epoch": 0.050722994540475484, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 5816 + }, + { + "epoch": 0.050731715825644064, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 5817 + }, + { + "epoch": 0.05074043711081265, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 5818 + }, + { + "epoch": 0.05074915839598123, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5819 + }, + { + "epoch": 0.050757879681149816, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 5820 + }, + { + "epoch": 0.050766600966318395, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 5821 + }, + { + "epoch": 0.05077532225148698, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 5822 + }, + { + "epoch": 0.05078404353665556, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5823 + }, + { + "epoch": 0.05079276482182414, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5824 + }, + { + "epoch": 0.05080148610699273, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5825 + }, + { + "epoch": 0.050810207392161306, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5826 + }, + { + "epoch": 0.05081892867732989, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 5827 + }, + { + "epoch": 0.05082764996249847, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 5828 + }, + { + "epoch": 0.05083637124766706, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 5829 + }, + { + "epoch": 0.05084509253283564, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5830 + }, + { + "epoch": 0.050853813818004225, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5831 + }, + { + "epoch": 0.050862535103172804, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5832 + }, + { + "epoch": 0.050871256388341383, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5833 + }, + { + "epoch": 0.05087997767350997, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5834 + }, + { + "epoch": 0.05088869895867855, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5835 + }, + { + "epoch": 0.050897420243847136, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 5836 + }, + { + "epoch": 0.050906141529015715, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5837 + }, + { + "epoch": 0.0509148628141843, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5838 + }, + { + "epoch": 0.05092358409935288, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5839 + }, + { + "epoch": 0.05093230538452146, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5840 + }, + { + "epoch": 0.05094102666969005, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 5841 + }, + { + "epoch": 0.050949747954858626, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 5842 + }, + { + "epoch": 0.05095846924002721, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 5843 + }, + { + "epoch": 0.05096719052519579, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 5844 + }, + { + "epoch": 0.05097591181036438, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 5845 + }, + { + "epoch": 0.05098463309553296, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5846 + }, + { + "epoch": 0.05099335438070154, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5847 + }, + { + "epoch": 0.051002075665870124, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 5848 + }, + { + "epoch": 0.0510107969510387, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 5849 + }, + { + "epoch": 0.05101951823620729, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 5850 + }, + { + "epoch": 0.05102823952137587, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5851 + }, + { + "epoch": 0.051036960806544455, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 5852 + }, + { + "epoch": 0.051045682091713035, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 5853 + }, + { + "epoch": 0.051054403376881614, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5854 + }, + { + "epoch": 0.0510631246620502, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5855 + }, + { + "epoch": 0.05107184594721878, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 5856 + }, + { + "epoch": 0.05108056723238737, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5857 + }, + { + "epoch": 0.051089288517555946, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 5858 + }, + { + "epoch": 0.05109800980272453, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 5859 + }, + { + "epoch": 0.05110673108789311, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 5860 + }, + { + "epoch": 0.05111545237306169, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 5861 + }, + { + "epoch": 0.05112417365823028, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 5862 + }, + { + "epoch": 0.05113289494339886, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 5863 + }, + { + "epoch": 0.051141616228567444, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 5864 + }, + { + "epoch": 0.05115033751373602, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5865 + }, + { + "epoch": 0.05115905879890461, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5866 + }, + { + "epoch": 0.05116778008407319, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 5867 + }, + { + "epoch": 0.05117650136924177, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5868 + }, + { + "epoch": 0.051185222654410355, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5869 + }, + { + "epoch": 0.051193943939578934, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 5870 + }, + { + "epoch": 0.05120266522474752, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 5871 + }, + { + "epoch": 0.0512113865099161, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5872 + }, + { + "epoch": 0.051220107795084686, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5873 + }, + { + "epoch": 0.051228829080253266, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5874 + }, + { + "epoch": 0.051237550365421845, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5875 + }, + { + "epoch": 0.05124627165059043, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5876 + }, + { + "epoch": 0.05125499293575901, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 5877 + }, + { + "epoch": 0.0512637142209276, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 5878 + }, + { + "epoch": 0.05127243550609618, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 5879 + }, + { + "epoch": 0.05128115679126476, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 5880 + }, + { + "epoch": 0.05128987807643334, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 5881 + }, + { + "epoch": 0.05129859936160192, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5882 + }, + { + "epoch": 0.05130732064677051, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 5883 + }, + { + "epoch": 0.05131604193193909, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 5884 + }, + { + "epoch": 0.051324763217107675, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5885 + }, + { + "epoch": 0.051333484502276254, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5886 + }, + { + "epoch": 0.05134220578744484, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5887 + }, + { + "epoch": 0.05135092707261342, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5888 + }, + { + "epoch": 0.051359648357782006, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5889 + }, + { + "epoch": 0.051368369642950586, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5890 + }, + { + "epoch": 0.051377090928119165, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 5891 + }, + { + "epoch": 0.05138581221328775, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5892 + }, + { + "epoch": 0.05139453349845633, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5893 + }, + { + "epoch": 0.05140325478362492, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 5894 + }, + { + "epoch": 0.0514119760687935, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5895 + }, + { + "epoch": 0.05142069735396208, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5896 + }, + { + "epoch": 0.05142941863913066, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5897 + }, + { + "epoch": 0.05143813992429924, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5898 + }, + { + "epoch": 0.05144686120946783, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5899 + }, + { + "epoch": 0.05145558249463641, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5900 + }, + { + "epoch": 0.051464303779804994, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5901 + }, + { + "epoch": 0.051473025064973574, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 5902 + }, + { + "epoch": 0.05148174635014216, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 5903 + }, + { + "epoch": 0.05149046763531074, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 5904 + }, + { + "epoch": 0.05149918892047932, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5905 + }, + { + "epoch": 0.051507910205647905, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 5906 + }, + { + "epoch": 0.051516631490816485, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 5907 + }, + { + "epoch": 0.05152535277598507, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5908 + }, + { + "epoch": 0.05153407406115365, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 5909 + }, + { + "epoch": 0.05154279534632224, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 5910 + }, + { + "epoch": 0.05155151663149082, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5911 + }, + { + "epoch": 0.051560237916659396, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5912 + }, + { + "epoch": 0.05156895920182798, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 5913 + }, + { + "epoch": 0.05157768048699656, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5914 + }, + { + "epoch": 0.05158640177216515, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5915 + }, + { + "epoch": 0.05159512305733373, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 5916 + }, + { + "epoch": 0.051603844342502314, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 5917 + }, + { + "epoch": 0.051612565627670894, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5918 + }, + { + "epoch": 0.05162128691283947, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 5919 + }, + { + "epoch": 0.05163000819800806, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 5920 + }, + { + "epoch": 0.05163872948317664, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5921 + }, + { + "epoch": 0.051647450768345225, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5922 + }, + { + "epoch": 0.051656172053513805, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 5923 + }, + { + "epoch": 0.05166489333868239, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5924 + }, + { + "epoch": 0.05167361462385097, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 5925 + }, + { + "epoch": 0.05168233590901955, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 5926 + }, + { + "epoch": 0.051691057194188136, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 5927 + }, + { + "epoch": 0.051699778479356716, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 5928 + }, + { + "epoch": 0.0517084997645253, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 5929 + }, + { + "epoch": 0.05171722104969388, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5930 + }, + { + "epoch": 0.05172594233486247, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5931 + }, + { + "epoch": 0.05173466362003105, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5932 + }, + { + "epoch": 0.05174338490519963, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5933 + }, + { + "epoch": 0.05175210619036821, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 5934 + }, + { + "epoch": 0.05176082747553679, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 5935 + }, + { + "epoch": 0.05176954876070538, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 5936 + }, + { + "epoch": 0.05177827004587396, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 5937 + }, + { + "epoch": 0.051786991331042545, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5938 + }, + { + "epoch": 0.051795712616211124, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5939 + }, + { + "epoch": 0.051804433901379704, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 5940 + }, + { + "epoch": 0.05181315518654829, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5941 + }, + { + "epoch": 0.05182187647171687, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5942 + }, + { + "epoch": 0.051830597756885456, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 5943 + }, + { + "epoch": 0.051839319042054036, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5944 + }, + { + "epoch": 0.05184804032722262, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 5945 + }, + { + "epoch": 0.0518567616123912, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 5946 + }, + { + "epoch": 0.05186548289755979, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5947 + }, + { + "epoch": 0.05187420418272837, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 5948 + }, + { + "epoch": 0.05188292546789695, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5949 + }, + { + "epoch": 0.05189164675306553, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5950 + }, + { + "epoch": 0.05190036803823411, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 5951 + }, + { + "epoch": 0.0519090893234027, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5952 + }, + { + "epoch": 0.05191781060857128, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 5953 + }, + { + "epoch": 0.051926531893739865, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5954 + }, + { + "epoch": 0.051935253178908444, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 5955 + }, + { + "epoch": 0.051943974464077024, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 5956 + }, + { + "epoch": 0.05195269574924561, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5957 + }, + { + "epoch": 0.05196141703441419, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5958 + }, + { + "epoch": 0.051970138319582776, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5959 + }, + { + "epoch": 0.051978859604751355, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5960 + }, + { + "epoch": 0.05198758088991994, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5961 + }, + { + "epoch": 0.05199630217508852, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 5962 + }, + { + "epoch": 0.0520050234602571, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5963 + }, + { + "epoch": 0.05201374474542569, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5964 + }, + { + "epoch": 0.05202246603059427, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5965 + }, + { + "epoch": 0.05203118731576285, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5966 + }, + { + "epoch": 0.05203990860093143, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5967 + }, + { + "epoch": 0.05204862988610002, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5968 + }, + { + "epoch": 0.0520573511712686, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 5969 + }, + { + "epoch": 0.05206607245643718, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 5970 + }, + { + "epoch": 0.052074793741605764, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 5971 + }, + { + "epoch": 0.052083515026774344, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 5972 + }, + { + "epoch": 0.05209223631194293, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5973 + }, + { + "epoch": 0.05210095759711151, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5974 + }, + { + "epoch": 0.052109678882280096, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 5975 + }, + { + "epoch": 0.052118400167448675, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 5976 + }, + { + "epoch": 0.052127121452617255, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 5977 + }, + { + "epoch": 0.05213584273778584, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5978 + }, + { + "epoch": 0.05214456402295442, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 5979 + }, + { + "epoch": 0.05215328530812301, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5980 + }, + { + "epoch": 0.052162006593291586, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 5981 + }, + { + "epoch": 0.05217072787846017, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 5982 + }, + { + "epoch": 0.05217944916362875, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 5983 + }, + { + "epoch": 0.05218817044879733, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 5984 + }, + { + "epoch": 0.05219689173396592, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 5985 + }, + { + "epoch": 0.0522056130191345, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 5986 + }, + { + "epoch": 0.052214334304303084, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5987 + }, + { + "epoch": 0.05222305558947166, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 5988 + }, + { + "epoch": 0.05223177687464025, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 5989 + }, + { + "epoch": 0.05224049815980883, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 5990 + }, + { + "epoch": 0.05224921944497741, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5991 + }, + { + "epoch": 0.052257940730145995, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5992 + }, + { + "epoch": 0.052266662015314574, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5993 + }, + { + "epoch": 0.05227538330048316, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 5994 + }, + { + "epoch": 0.05228410458565174, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5995 + }, + { + "epoch": 0.05229282587082033, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5996 + }, + { + "epoch": 0.052301547155988906, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5997 + }, + { + "epoch": 0.052310268441157486, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5998 + }, + { + "epoch": 0.05231898972632607, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 5999 + }, + { + "epoch": 0.05232771101149465, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 6000 + }, + { + "epoch": 0.05233643229666324, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 6001 + }, + { + "epoch": 0.05234515358183182, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 6002 + }, + { + "epoch": 0.052353874867000404, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6003 + }, + { + "epoch": 0.05236259615216898, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 6004 + }, + { + "epoch": 0.05237131743733757, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6005 + }, + { + "epoch": 0.05238003872250615, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 6006 + }, + { + "epoch": 0.05238876000767473, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6007 + }, + { + "epoch": 0.052397481292843315, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 6008 + }, + { + "epoch": 0.052406202578011894, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 6009 + }, + { + "epoch": 0.05241492386318048, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6010 + }, + { + "epoch": 0.05242364514834906, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6011 + }, + { + "epoch": 0.052432366433517646, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6012 + }, + { + "epoch": 0.052441087718686226, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6013 + }, + { + "epoch": 0.052449809003854805, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 6014 + }, + { + "epoch": 0.05245853028902339, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 6015 + }, + { + "epoch": 0.05246725157419197, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 6016 + }, + { + "epoch": 0.05247597285936056, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 6017 + }, + { + "epoch": 0.05248469414452914, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6018 + }, + { + "epoch": 0.05249341542969772, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 6019 + }, + { + "epoch": 0.0525021367148663, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 6020 + }, + { + "epoch": 0.05251085800003488, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6021 + }, + { + "epoch": 0.05251957928520347, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 6022 + }, + { + "epoch": 0.05252830057037205, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 6023 + }, + { + "epoch": 0.052537021855540635, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 6024 + }, + { + "epoch": 0.052545743140709214, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 6025 + }, + { + "epoch": 0.0525544644258778, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 6026 + }, + { + "epoch": 0.05256318571104638, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 6027 + }, + { + "epoch": 0.05257190699621496, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6028 + }, + { + "epoch": 0.052580628281383546, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 6029 + }, + { + "epoch": 0.052589349566552125, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6030 + }, + { + "epoch": 0.05259807085172071, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 6031 + }, + { + "epoch": 0.05260679213688929, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 6032 + }, + { + "epoch": 0.05261551342205788, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 6033 + }, + { + "epoch": 0.05262423470722646, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 6034 + }, + { + "epoch": 0.052632955992395036, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 6035 + }, + { + "epoch": 0.05264167727756362, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 6036 + }, + { + "epoch": 0.0526503985627322, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6037 + }, + { + "epoch": 0.05265911984790079, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 6038 + }, + { + "epoch": 0.05266784113306937, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6039 + }, + { + "epoch": 0.052676562418237954, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6040 + }, + { + "epoch": 0.052685283703406534, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 6041 + }, + { + "epoch": 0.05269400498857511, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 6042 + }, + { + "epoch": 0.0527027262737437, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 6043 + }, + { + "epoch": 0.05271144755891228, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6044 + }, + { + "epoch": 0.052720168844080866, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 6045 + }, + { + "epoch": 0.052728890129249445, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6046 + }, + { + "epoch": 0.05273761141441803, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6047 + }, + { + "epoch": 0.05274633269958661, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6048 + }, + { + "epoch": 0.05275505398475519, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6049 + }, + { + "epoch": 0.05276377526992378, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 6050 + }, + { + "epoch": 0.052772496555092356, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6051 + }, + { + "epoch": 0.05278121784026094, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6052 + }, + { + "epoch": 0.05278993912542952, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 6053 + }, + { + "epoch": 0.05279866041059811, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 6054 + }, + { + "epoch": 0.05280738169576669, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 6055 + }, + { + "epoch": 0.05281610298093527, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 6056 + }, + { + "epoch": 0.052824824266103854, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 6057 + }, + { + "epoch": 0.05283354555127243, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 6058 + }, + { + "epoch": 0.05284226683644102, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 6059 + }, + { + "epoch": 0.0528509881216096, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 6060 + }, + { + "epoch": 0.052859709406778185, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6061 + }, + { + "epoch": 0.052868430691946765, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 6062 + }, + { + "epoch": 0.05287715197711535, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6063 + }, + { + "epoch": 0.05288587326228393, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6064 + }, + { + "epoch": 0.05289459454745251, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 6065 + }, + { + "epoch": 0.052903315832621096, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6066 + }, + { + "epoch": 0.052912037117789676, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 6067 + }, + { + "epoch": 0.05292075840295826, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 6068 + }, + { + "epoch": 0.05292947968812684, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 6069 + }, + { + "epoch": 0.05293820097329543, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6070 + }, + { + "epoch": 0.05294692225846401, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6071 + }, + { + "epoch": 0.05295564354363259, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 6072 + }, + { + "epoch": 0.05296436482880117, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 6073 + }, + { + "epoch": 0.05297308611396975, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 6074 + }, + { + "epoch": 0.05298180739913834, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6075 + }, + { + "epoch": 0.05299052868430692, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 6076 + }, + { + "epoch": 0.052999249969475505, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 6077 + }, + { + "epoch": 0.053007971254644085, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 6078 + }, + { + "epoch": 0.053016692539812664, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 6079 + }, + { + "epoch": 0.05302541382498125, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 6080 + }, + { + "epoch": 0.05303413511014983, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6081 + }, + { + "epoch": 0.053042856395318416, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 6082 + }, + { + "epoch": 0.053051577680486996, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6083 + }, + { + "epoch": 0.05306029896565558, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 6084 + }, + { + "epoch": 0.05306902025082416, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 6085 + }, + { + "epoch": 0.05307774153599274, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6086 + }, + { + "epoch": 0.05308646282116133, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6087 + }, + { + "epoch": 0.05309518410632991, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 6088 + }, + { + "epoch": 0.05310390539149849, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 6089 + }, + { + "epoch": 0.05311262667666707, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 6090 + }, + { + "epoch": 0.05312134796183566, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 6091 + }, + { + "epoch": 0.05313006924700424, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6092 + }, + { + "epoch": 0.05313879053217282, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 6093 + }, + { + "epoch": 0.053147511817341404, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 6094 + }, + { + "epoch": 0.053156233102509984, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0829, + "step": 6095 + }, + { + "epoch": 0.05316495438767857, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 6096 + }, + { + "epoch": 0.05317367567284715, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 6097 + }, + { + "epoch": 0.053182396958015736, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 6098 + }, + { + "epoch": 0.053191118243184315, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6099 + }, + { + "epoch": 0.053199839528352895, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6100 + }, + { + "epoch": 0.05320856081352148, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6101 + }, + { + "epoch": 0.05321728209869006, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 6102 + }, + { + "epoch": 0.05322600338385865, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 6103 + }, + { + "epoch": 0.05323472466902723, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 6104 + }, + { + "epoch": 0.05324344595419581, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 6105 + }, + { + "epoch": 0.05325216723936439, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6106 + }, + { + "epoch": 0.05326088852453297, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 6107 + }, + { + "epoch": 0.05326960980970156, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 6108 + }, + { + "epoch": 0.05327833109487014, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 6109 + }, + { + "epoch": 0.053287052380038724, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 6110 + }, + { + "epoch": 0.053295773665207304, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 6111 + }, + { + "epoch": 0.05330449495037589, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 6112 + }, + { + "epoch": 0.05331321623554447, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 6113 + }, + { + "epoch": 0.05332193752071305, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6114 + }, + { + "epoch": 0.053330658805881635, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6115 + }, + { + "epoch": 0.053339380091050215, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6116 + }, + { + "epoch": 0.0533481013762188, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6117 + }, + { + "epoch": 0.05335682266138738, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 6118 + }, + { + "epoch": 0.05336554394655597, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 6119 + }, + { + "epoch": 0.053374265231724546, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 6120 + }, + { + "epoch": 0.05338298651689313, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6121 + }, + { + "epoch": 0.05339170780206171, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 6122 + }, + { + "epoch": 0.05340042908723029, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6123 + }, + { + "epoch": 0.05340915037239888, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 6124 + }, + { + "epoch": 0.05341787165756746, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 6125 + }, + { + "epoch": 0.053426592942736044, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6126 + }, + { + "epoch": 0.05343531422790462, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0765, + "step": 6127 + }, + { + "epoch": 0.05344403551307321, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 6128 + }, + { + "epoch": 0.05345275679824179, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 6129 + }, + { + "epoch": 0.05346147808341037, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 6130 + }, + { + "epoch": 0.053470199368578955, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6131 + }, + { + "epoch": 0.053478920653747535, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6132 + }, + { + "epoch": 0.05348764193891612, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6133 + }, + { + "epoch": 0.0534963632240847, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 6134 + }, + { + "epoch": 0.05350508450925329, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6135 + }, + { + "epoch": 0.053513805794421866, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 6136 + }, + { + "epoch": 0.053522527079590446, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6137 + }, + { + "epoch": 0.05353124836475903, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6138 + }, + { + "epoch": 0.05353996964992761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 6139 + }, + { + "epoch": 0.0535486909350962, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6140 + }, + { + "epoch": 0.05355741222026478, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 6141 + }, + { + "epoch": 0.053566133505433364, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 6142 + }, + { + "epoch": 0.05357485479060194, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 6143 + }, + { + "epoch": 0.05358357607577052, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6144 + }, + { + "epoch": 0.05359229736093911, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 6145 + }, + { + "epoch": 0.05360101864610769, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6146 + }, + { + "epoch": 0.053609739931276275, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 6147 + }, + { + "epoch": 0.053618461216444854, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6148 + }, + { + "epoch": 0.05362718250161344, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6149 + }, + { + "epoch": 0.05363590378678202, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 6150 + }, + { + "epoch": 0.0536446250719506, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 6151 + }, + { + "epoch": 0.053653346357119186, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 6152 + }, + { + "epoch": 0.053662067642287765, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6153 + }, + { + "epoch": 0.05367078892745635, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 6154 + }, + { + "epoch": 0.05367951021262493, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6155 + }, + { + "epoch": 0.05368823149779352, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6156 + }, + { + "epoch": 0.0536969527829621, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 6157 + }, + { + "epoch": 0.05370567406813068, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 6158 + }, + { + "epoch": 0.05371439535329926, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6159 + }, + { + "epoch": 0.05372311663846784, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 6160 + }, + { + "epoch": 0.05373183792363643, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 6161 + }, + { + "epoch": 0.05374055920880501, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6162 + }, + { + "epoch": 0.053749280493973595, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 6163 + }, + { + "epoch": 0.053758001779142174, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 6164 + }, + { + "epoch": 0.053766723064310754, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 6165 + }, + { + "epoch": 0.05377544434947934, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 6166 + }, + { + "epoch": 0.05378416563464792, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6167 + }, + { + "epoch": 0.053792886919816506, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6168 + }, + { + "epoch": 0.053801608204985085, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6169 + }, + { + "epoch": 0.05381032949015367, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6170 + }, + { + "epoch": 0.05381905077532225, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 6171 + }, + { + "epoch": 0.05382777206049083, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 6172 + }, + { + "epoch": 0.05383649334565942, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6173 + }, + { + "epoch": 0.053845214630827996, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6174 + }, + { + "epoch": 0.05385393591599658, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 6175 + }, + { + "epoch": 0.05386265720116516, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6176 + }, + { + "epoch": 0.05387137848633375, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6177 + }, + { + "epoch": 0.05388009977150233, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6178 + }, + { + "epoch": 0.053888821056670914, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 6179 + }, + { + "epoch": 0.053897542341839494, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6180 + }, + { + "epoch": 0.05390626362700807, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 6181 + }, + { + "epoch": 0.05391498491217666, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 6182 + }, + { + "epoch": 0.05392370619734524, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 6183 + }, + { + "epoch": 0.053932427482513826, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 6184 + }, + { + "epoch": 0.053941148767682405, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 6185 + }, + { + "epoch": 0.05394987005285099, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 6186 + }, + { + "epoch": 0.05395859133801957, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6187 + }, + { + "epoch": 0.05396731262318815, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 6188 + }, + { + "epoch": 0.05397603390835674, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6189 + }, + { + "epoch": 0.053984755193525316, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6190 + }, + { + "epoch": 0.0539934764786939, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 6191 + }, + { + "epoch": 0.05400219776386248, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 6192 + }, + { + "epoch": 0.05401091904903107, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 6193 + }, + { + "epoch": 0.05401964033419965, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6194 + }, + { + "epoch": 0.05402836161936823, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 6195 + }, + { + "epoch": 0.054037082904536814, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 6196 + }, + { + "epoch": 0.05404580418970539, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 6197 + }, + { + "epoch": 0.05405452547487398, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6198 + }, + { + "epoch": 0.05406324676004256, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 6199 + }, + { + "epoch": 0.054071968045211145, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6200 + }, + { + "epoch": 0.054080689330379725, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6201 + }, + { + "epoch": 0.054089410615548304, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 6202 + }, + { + "epoch": 0.05409813190071689, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 6203 + }, + { + "epoch": 0.05410685318588547, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 6204 + }, + { + "epoch": 0.054115574471054056, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6205 + }, + { + "epoch": 0.054124295756222636, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6206 + }, + { + "epoch": 0.05413301704139122, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 6207 + }, + { + "epoch": 0.0541417383265598, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 6208 + }, + { + "epoch": 0.05415045961172838, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 6209 + }, + { + "epoch": 0.05415918089689697, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6210 + }, + { + "epoch": 0.05416790218206555, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6211 + }, + { + "epoch": 0.05417662346723413, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6212 + }, + { + "epoch": 0.05418534475240271, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6213 + }, + { + "epoch": 0.0541940660375713, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 6214 + }, + { + "epoch": 0.05420278732273988, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 6215 + }, + { + "epoch": 0.05421150860790846, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 6216 + }, + { + "epoch": 0.054220229893077045, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6217 + }, + { + "epoch": 0.054228951178245624, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 6218 + }, + { + "epoch": 0.05423767246341421, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 6219 + }, + { + "epoch": 0.05424639374858279, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6220 + }, + { + "epoch": 0.054255115033751376, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 6221 + }, + { + "epoch": 0.054263836318919956, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 6222 + }, + { + "epoch": 0.054272557604088535, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 6223 + }, + { + "epoch": 0.05428127888925712, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 6224 + }, + { + "epoch": 0.0542900001744257, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6225 + }, + { + "epoch": 0.05429872145959429, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 6226 + }, + { + "epoch": 0.05430744274476287, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 6227 + }, + { + "epoch": 0.05431616402993145, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6228 + }, + { + "epoch": 0.05432488531510003, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6229 + }, + { + "epoch": 0.05433360660026861, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 6230 + }, + { + "epoch": 0.0543423278854372, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 6231 + }, + { + "epoch": 0.05435104917060578, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 6232 + }, + { + "epoch": 0.054359770455774364, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6233 + }, + { + "epoch": 0.054368491740942944, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 6234 + }, + { + "epoch": 0.05437721302611153, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 6235 + }, + { + "epoch": 0.05438593431128011, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 6236 + }, + { + "epoch": 0.054394655596448696, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 6237 + }, + { + "epoch": 0.054403376881617276, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6238 + }, + { + "epoch": 0.054412098166785855, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 6239 + }, + { + "epoch": 0.05442081945195444, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 6240 + }, + { + "epoch": 0.05442954073712302, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 6241 + }, + { + "epoch": 0.05443826202229161, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 6242 + }, + { + "epoch": 0.05444698330746019, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 6243 + }, + { + "epoch": 0.05445570459262877, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 6244 + }, + { + "epoch": 0.05446442587779735, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 6245 + }, + { + "epoch": 0.05447314716296593, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6246 + }, + { + "epoch": 0.05448186844813452, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 6247 + }, + { + "epoch": 0.0544905897333031, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6248 + }, + { + "epoch": 0.054499311018471684, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6249 + }, + { + "epoch": 0.054508032303640264, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 6250 + }, + { + "epoch": 0.05451675358880885, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 6251 + }, + { + "epoch": 0.05452547487397743, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 6252 + }, + { + "epoch": 0.05453419615914601, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6253 + }, + { + "epoch": 0.054542917444314595, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6254 + }, + { + "epoch": 0.054551638729483175, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6255 + }, + { + "epoch": 0.05456036001465176, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 6256 + }, + { + "epoch": 0.05456908129982034, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6257 + }, + { + "epoch": 0.05457780258498893, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 6258 + }, + { + "epoch": 0.054586523870157506, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 6259 + }, + { + "epoch": 0.054595245155326086, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6260 + }, + { + "epoch": 0.05460396644049467, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 6261 + }, + { + "epoch": 0.05461268772566325, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6262 + }, + { + "epoch": 0.05462140901083184, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 6263 + }, + { + "epoch": 0.05463013029600042, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 6264 + }, + { + "epoch": 0.054638851581169004, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6265 + }, + { + "epoch": 0.05464757286633758, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 6266 + }, + { + "epoch": 0.05465629415150616, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 6267 + }, + { + "epoch": 0.05466501543667475, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6268 + }, + { + "epoch": 0.05467373672184333, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 6269 + }, + { + "epoch": 0.054682458007011915, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 6270 + }, + { + "epoch": 0.054691179292180495, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6271 + }, + { + "epoch": 0.05469990057734908, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6272 + }, + { + "epoch": 0.05470862186251766, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 6273 + }, + { + "epoch": 0.05471734314768624, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 6274 + }, + { + "epoch": 0.054726064432854826, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 6275 + }, + { + "epoch": 0.054734785718023406, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 6276 + }, + { + "epoch": 0.05474350700319199, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 6277 + }, + { + "epoch": 0.05475222828836057, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6278 + }, + { + "epoch": 0.05476094957352916, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 6279 + }, + { + "epoch": 0.05476967085869774, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6280 + }, + { + "epoch": 0.05477839214386632, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 6281 + }, + { + "epoch": 0.0547871134290349, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 6282 + }, + { + "epoch": 0.05479583471420348, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6283 + }, + { + "epoch": 0.05480455599937207, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 6284 + }, + { + "epoch": 0.05481327728454065, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6285 + }, + { + "epoch": 0.054821998569709235, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 6286 + }, + { + "epoch": 0.054830719854877814, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 6287 + }, + { + "epoch": 0.054839441140046394, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 6288 + }, + { + "epoch": 0.05484816242521498, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 6289 + }, + { + "epoch": 0.05485688371038356, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 6290 + }, + { + "epoch": 0.054865604995552146, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 6291 + }, + { + "epoch": 0.054874326280720725, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6292 + }, + { + "epoch": 0.05488304756588931, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6293 + }, + { + "epoch": 0.05489176885105789, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6294 + }, + { + "epoch": 0.05490049013622648, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6295 + }, + { + "epoch": 0.05490921142139506, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 6296 + }, + { + "epoch": 0.05491793270656364, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 6297 + }, + { + "epoch": 0.05492665399173222, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 6298 + }, + { + "epoch": 0.0549353752769008, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 6299 + }, + { + "epoch": 0.05494409656206939, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6300 + }, + { + "epoch": 0.05495281784723797, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6301 + }, + { + "epoch": 0.054961539132406555, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 6302 + }, + { + "epoch": 0.054970260417575134, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 6303 + }, + { + "epoch": 0.054978981702743714, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 6304 + }, + { + "epoch": 0.0549877029879123, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6305 + }, + { + "epoch": 0.05499642427308088, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 6306 + }, + { + "epoch": 0.055005145558249466, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6307 + }, + { + "epoch": 0.055013866843418045, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 6308 + }, + { + "epoch": 0.05502258812858663, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6309 + }, + { + "epoch": 0.05503130941375521, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 6310 + }, + { + "epoch": 0.05504003069892379, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6311 + }, + { + "epoch": 0.05504875198409238, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6312 + }, + { + "epoch": 0.055057473269260956, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 6313 + }, + { + "epoch": 0.05506619455442954, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 6314 + }, + { + "epoch": 0.05507491583959812, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 6315 + }, + { + "epoch": 0.05508363712476671, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6316 + }, + { + "epoch": 0.05509235840993529, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 6317 + }, + { + "epoch": 0.05510107969510387, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 6318 + }, + { + "epoch": 0.055109800980272454, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 6319 + }, + { + "epoch": 0.05511852226544103, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6320 + }, + { + "epoch": 0.05512724355060962, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6321 + }, + { + "epoch": 0.0551359648357782, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 6322 + }, + { + "epoch": 0.055144686120946786, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6323 + }, + { + "epoch": 0.055153407406115365, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 6324 + }, + { + "epoch": 0.055162128691283945, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 6325 + }, + { + "epoch": 0.05517084997645253, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 6326 + }, + { + "epoch": 0.05517957126162111, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 6327 + }, + { + "epoch": 0.0551882925467897, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6328 + }, + { + "epoch": 0.055197013831958276, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6329 + }, + { + "epoch": 0.05520573511712686, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6330 + }, + { + "epoch": 0.05521445640229544, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6331 + }, + { + "epoch": 0.05522317768746402, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6332 + }, + { + "epoch": 0.05523189897263261, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 6333 + }, + { + "epoch": 0.05524062025780119, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 6334 + }, + { + "epoch": 0.055249341542969774, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6335 + }, + { + "epoch": 0.05525806282813835, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6336 + }, + { + "epoch": 0.05526678411330694, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6337 + }, + { + "epoch": 0.05527550539847552, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6338 + }, + { + "epoch": 0.0552842266836441, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6339 + }, + { + "epoch": 0.055292947968812685, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6340 + }, + { + "epoch": 0.055301669253981264, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 6341 + }, + { + "epoch": 0.05531039053914985, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6342 + }, + { + "epoch": 0.05531911182431843, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6343 + }, + { + "epoch": 0.05532783310948702, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6344 + }, + { + "epoch": 0.055336554394655596, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 6345 + }, + { + "epoch": 0.055345275679824175, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 6346 + }, + { + "epoch": 0.05535399696499276, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6347 + }, + { + "epoch": 0.05536271825016134, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 6348 + }, + { + "epoch": 0.05537143953532993, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 6349 + }, + { + "epoch": 0.05538016082049851, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6350 + }, + { + "epoch": 0.055388882105667094, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6351 + }, + { + "epoch": 0.05539760339083567, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 6352 + }, + { + "epoch": 0.05540632467600426, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6353 + }, + { + "epoch": 0.05541504596117284, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 6354 + }, + { + "epoch": 0.05542376724634142, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 6355 + }, + { + "epoch": 0.055432488531510005, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 6356 + }, + { + "epoch": 0.055441209816678584, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6357 + }, + { + "epoch": 0.05544993110184717, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6358 + }, + { + "epoch": 0.05545865238701575, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 6359 + }, + { + "epoch": 0.055467373672184336, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 6360 + }, + { + "epoch": 0.055476094957352916, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6361 + }, + { + "epoch": 0.055484816242521495, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 6362 + }, + { + "epoch": 0.05549353752769008, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 6363 + }, + { + "epoch": 0.05550225881285866, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6364 + }, + { + "epoch": 0.05551098009802725, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 6365 + }, + { + "epoch": 0.05551970138319583, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6366 + }, + { + "epoch": 0.05552842266836441, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 6367 + }, + { + "epoch": 0.05553714395353299, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6368 + }, + { + "epoch": 0.05554586523870157, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 6369 + }, + { + "epoch": 0.05555458652387016, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 6370 + }, + { + "epoch": 0.05556330780903874, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6371 + }, + { + "epoch": 0.055572029094207324, + "grad_norm": 0.37890625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 6372 + }, + { + "epoch": 0.055580750379375904, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6373 + }, + { + "epoch": 0.05558947166454449, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6374 + }, + { + "epoch": 0.05559819294971307, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 6375 + }, + { + "epoch": 0.05560691423488165, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6376 + }, + { + "epoch": 0.055615635520050236, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6377 + }, + { + "epoch": 0.055624356805218815, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6378 + }, + { + "epoch": 0.0556330780903874, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6379 + }, + { + "epoch": 0.05564179937555598, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6380 + }, + { + "epoch": 0.05565052066072457, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6381 + }, + { + "epoch": 0.05565924194589315, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6382 + }, + { + "epoch": 0.055667963231061726, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 6383 + }, + { + "epoch": 0.05567668451623031, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 6384 + }, + { + "epoch": 0.05568540580139889, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 6385 + }, + { + "epoch": 0.05569412708656748, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 6386 + }, + { + "epoch": 0.05570284837173606, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6387 + }, + { + "epoch": 0.055711569656904644, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6388 + }, + { + "epoch": 0.055720290942073224, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 6389 + }, + { + "epoch": 0.0557290122272418, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6390 + }, + { + "epoch": 0.05573773351241039, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6391 + }, + { + "epoch": 0.05574645479757897, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 6392 + }, + { + "epoch": 0.055755176082747555, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 6393 + }, + { + "epoch": 0.055763897367916135, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6394 + }, + { + "epoch": 0.05577261865308472, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6395 + }, + { + "epoch": 0.0557813399382533, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 6396 + }, + { + "epoch": 0.05579006122342188, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6397 + }, + { + "epoch": 0.055798782508590467, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 6398 + }, + { + "epoch": 0.055807503793759046, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6399 + }, + { + "epoch": 0.05581622507892763, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6400 + }, + { + "epoch": 0.05582494636409621, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6401 + }, + { + "epoch": 0.0558336676492648, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 6402 + }, + { + "epoch": 0.05584238893443338, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6403 + }, + { + "epoch": 0.05585111021960196, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6404 + }, + { + "epoch": 0.055859831504770543, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 6405 + }, + { + "epoch": 0.05586855278993912, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 6406 + }, + { + "epoch": 0.05587727407510771, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6407 + }, + { + "epoch": 0.05588599536027629, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6408 + }, + { + "epoch": 0.055894716645444875, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 6409 + }, + { + "epoch": 0.055903437930613455, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 6410 + }, + { + "epoch": 0.05591215921578204, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 6411 + }, + { + "epoch": 0.05592088050095062, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6412 + }, + { + "epoch": 0.0559296017861192, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6413 + }, + { + "epoch": 0.055938323071287786, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6414 + }, + { + "epoch": 0.055947044356456366, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 6415 + }, + { + "epoch": 0.05595576564162495, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6416 + }, + { + "epoch": 0.05596448692679353, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 6417 + }, + { + "epoch": 0.05597320821196212, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6418 + }, + { + "epoch": 0.0559819294971307, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6419 + }, + { + "epoch": 0.05599065078229928, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 6420 + }, + { + "epoch": 0.05599937206746786, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 6421 + }, + { + "epoch": 0.05600809335263644, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 6422 + }, + { + "epoch": 0.05601681463780503, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6423 + }, + { + "epoch": 0.05602553592297361, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 6424 + }, + { + "epoch": 0.056034257208142195, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6425 + }, + { + "epoch": 0.056042978493310774, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6426 + }, + { + "epoch": 0.056051699778479354, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6427 + }, + { + "epoch": 0.05606042106364794, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6428 + }, + { + "epoch": 0.05606914234881652, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 6429 + }, + { + "epoch": 0.056077863633985106, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 6430 + }, + { + "epoch": 0.056086584919153686, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6431 + }, + { + "epoch": 0.05609530620432227, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 6432 + }, + { + "epoch": 0.05610402748949085, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6433 + }, + { + "epoch": 0.05611274877465943, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 6434 + }, + { + "epoch": 0.05612147005982802, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 6435 + }, + { + "epoch": 0.0561301913449966, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6436 + }, + { + "epoch": 0.05613891263016518, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6437 + }, + { + "epoch": 0.05614763391533376, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 6438 + }, + { + "epoch": 0.05615635520050235, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6439 + }, + { + "epoch": 0.05616507648567093, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6440 + }, + { + "epoch": 0.05617379777083951, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 6441 + }, + { + "epoch": 0.056182519056008094, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6442 + }, + { + "epoch": 0.056191240341176674, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 6443 + }, + { + "epoch": 0.05619996162634526, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 6444 + }, + { + "epoch": 0.05620868291151384, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6445 + }, + { + "epoch": 0.056217404196682426, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 6446 + }, + { + "epoch": 0.056226125481851005, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6447 + }, + { + "epoch": 0.056234846767019585, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 6448 + }, + { + "epoch": 0.05624356805218817, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 6449 + }, + { + "epoch": 0.05625228933735675, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6450 + }, + { + "epoch": 0.05626101062252534, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 6451 + }, + { + "epoch": 0.056269731907693916, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 6452 + }, + { + "epoch": 0.0562784531928625, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 6453 + }, + { + "epoch": 0.05628717447803108, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 6454 + }, + { + "epoch": 0.05629589576319966, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6455 + }, + { + "epoch": 0.05630461704836825, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 6456 + }, + { + "epoch": 0.05631333833353683, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 6457 + }, + { + "epoch": 0.056322059618705414, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 6458 + }, + { + "epoch": 0.05633078090387399, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 6459 + }, + { + "epoch": 0.05633950218904258, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 6460 + }, + { + "epoch": 0.05634822347421116, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 6461 + }, + { + "epoch": 0.05635694475937974, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 6462 + }, + { + "epoch": 0.056365666044548325, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 6463 + }, + { + "epoch": 0.056374387329716905, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 6464 + }, + { + "epoch": 0.05638310861488549, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 6465 + }, + { + "epoch": 0.05639182990005407, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 6466 + }, + { + "epoch": 0.05640055118522266, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 6467 + }, + { + "epoch": 0.056409272470391236, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 6468 + }, + { + "epoch": 0.05641799375555982, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 6469 + }, + { + "epoch": 0.0564267150407284, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 6470 + }, + { + "epoch": 0.05643543632589698, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 6471 + }, + { + "epoch": 0.05644415761106557, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6472 + }, + { + "epoch": 0.05645287889623415, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 6473 + }, + { + "epoch": 0.056461600181402734, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 6474 + }, + { + "epoch": 0.05647032146657131, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6475 + }, + { + "epoch": 0.0564790427517399, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 6476 + }, + { + "epoch": 0.05648776403690848, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6477 + }, + { + "epoch": 0.05649648532207706, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6478 + }, + { + "epoch": 0.056505206607245645, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 6479 + }, + { + "epoch": 0.056513927892414224, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6480 + }, + { + "epoch": 0.05652264917758281, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 6481 + }, + { + "epoch": 0.05653137046275139, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 6482 + }, + { + "epoch": 0.05654009174791998, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 6483 + }, + { + "epoch": 0.056548813033088556, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 6484 + }, + { + "epoch": 0.056557534318257136, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6485 + }, + { + "epoch": 0.05656625560342572, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 6486 + }, + { + "epoch": 0.0565749768885943, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 6487 + }, + { + "epoch": 0.05658369817376289, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6488 + }, + { + "epoch": 0.05659241945893147, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 6489 + }, + { + "epoch": 0.056601140744100054, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6490 + }, + { + "epoch": 0.05660986202926863, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6491 + }, + { + "epoch": 0.05661858331443721, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 6492 + }, + { + "epoch": 0.0566273045996058, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6493 + }, + { + "epoch": 0.05663602588477438, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 6494 + }, + { + "epoch": 0.056644747169942965, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 6495 + }, + { + "epoch": 0.056653468455111544, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6496 + }, + { + "epoch": 0.05666218974028013, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6497 + }, + { + "epoch": 0.05667091102544871, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6498 + }, + { + "epoch": 0.05667963231061729, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 6499 + }, + { + "epoch": 0.056688353595785876, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 6500 + }, + { + "epoch": 0.056697074880954455, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 6501 + }, + { + "epoch": 0.05670579616612304, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6502 + }, + { + "epoch": 0.05671451745129162, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 6503 + }, + { + "epoch": 0.05672323873646021, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 6504 + }, + { + "epoch": 0.05673196002162879, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6505 + }, + { + "epoch": 0.056740681306797366, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 6506 + }, + { + "epoch": 0.05674940259196595, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 6507 + }, + { + "epoch": 0.05675812387713453, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6508 + }, + { + "epoch": 0.05676684516230312, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 6509 + }, + { + "epoch": 0.0567755664474717, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6510 + }, + { + "epoch": 0.056784287732640285, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6511 + }, + { + "epoch": 0.056793009017808864, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6512 + }, + { + "epoch": 0.05680173030297744, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 6513 + }, + { + "epoch": 0.05681045158814603, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6514 + }, + { + "epoch": 0.05681917287331461, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6515 + }, + { + "epoch": 0.056827894158483196, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 6516 + }, + { + "epoch": 0.056836615443651775, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6517 + }, + { + "epoch": 0.05684533672882036, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 6518 + }, + { + "epoch": 0.05685405801398894, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6519 + }, + { + "epoch": 0.05686277929915752, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 6520 + }, + { + "epoch": 0.05687150058432611, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 6521 + }, + { + "epoch": 0.056880221869494686, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 6522 + }, + { + "epoch": 0.05688894315466327, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6523 + }, + { + "epoch": 0.05689766443983185, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6524 + }, + { + "epoch": 0.05690638572500044, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 6525 + }, + { + "epoch": 0.05691510701016902, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6526 + }, + { + "epoch": 0.056923828295337604, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 6527 + }, + { + "epoch": 0.056932549580506184, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 6528 + }, + { + "epoch": 0.05694127086567476, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6529 + }, + { + "epoch": 0.05694999215084335, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6530 + }, + { + "epoch": 0.05695871343601193, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 6531 + }, + { + "epoch": 0.056967434721180515, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6532 + }, + { + "epoch": 0.056976156006349095, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6533 + }, + { + "epoch": 0.05698487729151768, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 6534 + }, + { + "epoch": 0.05699359857668626, + "grad_norm": 0.359375, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 6535 + }, + { + "epoch": 0.05700231986185484, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 6536 + }, + { + "epoch": 0.05701104114702343, + "grad_norm": 0.400390625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 6537 + }, + { + "epoch": 0.057019762432192006, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 6538 + }, + { + "epoch": 0.05702848371736059, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6539 + }, + { + "epoch": 0.05703720500252917, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 6540 + }, + { + "epoch": 0.05704592628769776, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 6541 + }, + { + "epoch": 0.05705464757286634, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 6542 + }, + { + "epoch": 0.05706336885803492, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6543 + }, + { + "epoch": 0.057072090143203504, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6544 + }, + { + "epoch": 0.05708081142837208, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6545 + }, + { + "epoch": 0.05708953271354067, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6546 + }, + { + "epoch": 0.05709825399870925, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6547 + }, + { + "epoch": 0.057106975283877835, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6548 + }, + { + "epoch": 0.057115696569046415, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 6549 + }, + { + "epoch": 0.057124417854214994, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 6550 + }, + { + "epoch": 0.05713313913938358, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 6551 + }, + { + "epoch": 0.05714186042455216, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6552 + }, + { + "epoch": 0.057150581709720746, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6553 + }, + { + "epoch": 0.057159302994889326, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 6554 + }, + { + "epoch": 0.05716802428005791, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 6555 + }, + { + "epoch": 0.05717674556522649, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 6556 + }, + { + "epoch": 0.05718546685039507, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 6557 + }, + { + "epoch": 0.05719418813556366, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 6558 + }, + { + "epoch": 0.05720290942073224, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 6559 + }, + { + "epoch": 0.05721163070590082, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 6560 + }, + { + "epoch": 0.0572203519910694, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 6561 + }, + { + "epoch": 0.05722907327623799, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 6562 + }, + { + "epoch": 0.05723779456140657, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6563 + }, + { + "epoch": 0.05724651584657515, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 6564 + }, + { + "epoch": 0.057255237131743734, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 6565 + }, + { + "epoch": 0.057263958416912314, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 6566 + }, + { + "epoch": 0.0572726797020809, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 6567 + }, + { + "epoch": 0.05728140098724948, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6568 + }, + { + "epoch": 0.057290122272418066, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 6569 + }, + { + "epoch": 0.057298843557586646, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6570 + }, + { + "epoch": 0.057307564842755225, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6571 + }, + { + "epoch": 0.05731628612792381, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 6572 + }, + { + "epoch": 0.05732500741309239, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6573 + }, + { + "epoch": 0.05733372869826098, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 6574 + }, + { + "epoch": 0.05734244998342956, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 6575 + }, + { + "epoch": 0.05735117126859814, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6576 + }, + { + "epoch": 0.05735989255376672, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6577 + }, + { + "epoch": 0.05736861383893531, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 6578 + }, + { + "epoch": 0.05737733512410389, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6579 + }, + { + "epoch": 0.05738605640927247, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 6580 + }, + { + "epoch": 0.057394777694441054, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 6581 + }, + { + "epoch": 0.057403498979609634, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 6582 + }, + { + "epoch": 0.05741222026477822, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 6583 + }, + { + "epoch": 0.0574209415499468, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 6584 + }, + { + "epoch": 0.057429662835115386, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 6585 + }, + { + "epoch": 0.057438384120283965, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 6586 + }, + { + "epoch": 0.057447105405452545, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6587 + }, + { + "epoch": 0.05745582669062113, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 6588 + }, + { + "epoch": 0.05746454797578971, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6589 + }, + { + "epoch": 0.0574732692609583, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 6590 + }, + { + "epoch": 0.057481990546126877, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 6591 + }, + { + "epoch": 0.05749071183129546, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 6592 + }, + { + "epoch": 0.05749943311646404, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 6593 + }, + { + "epoch": 0.05750815440163262, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 6594 + }, + { + "epoch": 0.05751687568680121, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 6595 + }, + { + "epoch": 0.05752559697196979, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 6596 + }, + { + "epoch": 0.057534318257138374, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 6597 + }, + { + "epoch": 0.057543039542306954, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 6598 + }, + { + "epoch": 0.05755176082747554, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 6599 + }, + { + "epoch": 0.05756048211264412, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 6600 + }, + { + "epoch": 0.0575692033978127, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 6601 + }, + { + "epoch": 0.057577924682981285, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6602 + }, + { + "epoch": 0.057586645968149865, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6603 + }, + { + "epoch": 0.05759536725331845, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 6604 + }, + { + "epoch": 0.05760408853848703, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 6605 + }, + { + "epoch": 0.05761280982365562, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 6606 + }, + { + "epoch": 0.057621531108824196, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 6607 + }, + { + "epoch": 0.057630252393992776, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 6608 + }, + { + "epoch": 0.05763897367916136, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 6609 + }, + { + "epoch": 0.05764769496432994, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6610 + }, + { + "epoch": 0.05765641624949853, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 6611 + }, + { + "epoch": 0.05766513753466711, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 6612 + }, + { + "epoch": 0.057673858819835694, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 6613 + }, + { + "epoch": 0.05768258010500427, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 6614 + }, + { + "epoch": 0.05769130139017285, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 6615 + }, + { + "epoch": 0.05770002267534144, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 6616 + }, + { + "epoch": 0.05770874396051002, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6617 + }, + { + "epoch": 0.057717465245678605, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6618 + }, + { + "epoch": 0.057726186530847184, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 6619 + }, + { + "epoch": 0.05773490781601577, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6620 + }, + { + "epoch": 0.05774362910118435, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 6621 + }, + { + "epoch": 0.05775235038635293, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 6622 + }, + { + "epoch": 0.057761071671521516, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6623 + }, + { + "epoch": 0.057769792956690096, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6624 + }, + { + "epoch": 0.05777851424185868, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6625 + }, + { + "epoch": 0.05778723552702726, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 6626 + }, + { + "epoch": 0.05779595681219585, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6627 + }, + { + "epoch": 0.05780467809736443, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6628 + }, + { + "epoch": 0.05781339938253301, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 6629 + }, + { + "epoch": 0.05782212066770159, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 6630 + }, + { + "epoch": 0.05783084195287017, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 6631 + }, + { + "epoch": 0.05783956323803876, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6632 + }, + { + "epoch": 0.05784828452320734, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6633 + }, + { + "epoch": 0.057857005808375925, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 6634 + }, + { + "epoch": 0.057865727093544504, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 6635 + }, + { + "epoch": 0.05787444837871309, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6636 + }, + { + "epoch": 0.05788316966388167, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 6637 + }, + { + "epoch": 0.05789189094905025, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 6638 + }, + { + "epoch": 0.057900612234218836, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6639 + }, + { + "epoch": 0.057909333519387415, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 6640 + }, + { + "epoch": 0.057918054804556, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6641 + }, + { + "epoch": 0.05792677608972458, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6642 + }, + { + "epoch": 0.05793549737489317, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6643 + }, + { + "epoch": 0.05794421866006175, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6644 + }, + { + "epoch": 0.057952939945230326, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 6645 + }, + { + "epoch": 0.05796166123039891, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6646 + }, + { + "epoch": 0.05797038251556749, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 6647 + }, + { + "epoch": 0.05797910380073608, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 6648 + }, + { + "epoch": 0.05798782508590466, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 6649 + }, + { + "epoch": 0.057996546371073245, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 6650 + }, + { + "epoch": 0.058005267656241824, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6651 + }, + { + "epoch": 0.0580139889414104, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 6652 + }, + { + "epoch": 0.05802271022657899, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 6653 + }, + { + "epoch": 0.05803143151174757, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6654 + }, + { + "epoch": 0.058040152796916156, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6655 + }, + { + "epoch": 0.058048874082084735, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 6656 + }, + { + "epoch": 0.05805759536725332, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6657 + }, + { + "epoch": 0.0580663166524219, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6658 + }, + { + "epoch": 0.05807503793759048, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6659 + }, + { + "epoch": 0.05808375922275907, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 6660 + }, + { + "epoch": 0.058092480507927646, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 6661 + }, + { + "epoch": 0.05810120179309623, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 6662 + }, + { + "epoch": 0.05810992307826481, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 6663 + }, + { + "epoch": 0.0581186443634334, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 6664 + }, + { + "epoch": 0.05812736564860198, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 6665 + }, + { + "epoch": 0.05813608693377056, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 6666 + }, + { + "epoch": 0.058144808218939144, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 6667 + }, + { + "epoch": 0.05815352950410772, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6668 + }, + { + "epoch": 0.05816225078927631, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6669 + }, + { + "epoch": 0.05817097207444489, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 6670 + }, + { + "epoch": 0.058179693359613475, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 6671 + }, + { + "epoch": 0.058188414644782055, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 6672 + }, + { + "epoch": 0.058197135929950634, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6673 + }, + { + "epoch": 0.05820585721511922, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 6674 + }, + { + "epoch": 0.0582145785002878, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 6675 + }, + { + "epoch": 0.05822329978545639, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 6676 + }, + { + "epoch": 0.058232021070624966, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 6677 + }, + { + "epoch": 0.05824074235579355, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 6678 + }, + { + "epoch": 0.05824946364096213, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 6679 + }, + { + "epoch": 0.05825818492613071, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 6680 + }, + { + "epoch": 0.0582669062112993, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 6681 + }, + { + "epoch": 0.05827562749646788, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6682 + }, + { + "epoch": 0.058284348781636464, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6683 + }, + { + "epoch": 0.05829307006680504, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6684 + }, + { + "epoch": 0.05830179135197363, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6685 + }, + { + "epoch": 0.05831051263714221, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 6686 + }, + { + "epoch": 0.05831923392231079, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 6687 + }, + { + "epoch": 0.058327955207479375, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 6688 + }, + { + "epoch": 0.058336676492647954, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 6689 + }, + { + "epoch": 0.05834539777781654, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6690 + }, + { + "epoch": 0.05835411906298512, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6691 + }, + { + "epoch": 0.058362840348153706, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 6692 + }, + { + "epoch": 0.058371561633322286, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6693 + }, + { + "epoch": 0.05838028291849087, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 6694 + }, + { + "epoch": 0.05838900420365945, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6695 + }, + { + "epoch": 0.05839772548882803, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 6696 + }, + { + "epoch": 0.05840644677399662, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6697 + }, + { + "epoch": 0.0584151680591652, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6698 + }, + { + "epoch": 0.05842388934433378, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 6699 + }, + { + "epoch": 0.05843261062950236, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 6700 + }, + { + "epoch": 0.05844133191467095, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6701 + }, + { + "epoch": 0.05845005319983953, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 6702 + }, + { + "epoch": 0.05845877448500811, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6703 + }, + { + "epoch": 0.058467495770176695, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 6704 + }, + { + "epoch": 0.058476217055345274, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 6705 + }, + { + "epoch": 0.05848493834051386, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6706 + }, + { + "epoch": 0.05849365962568244, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 6707 + }, + { + "epoch": 0.058502380910851026, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 6708 + }, + { + "epoch": 0.058511102196019606, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6709 + }, + { + "epoch": 0.058519823481188185, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 6710 + }, + { + "epoch": 0.05852854476635677, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 6711 + }, + { + "epoch": 0.05853726605152535, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 6712 + }, + { + "epoch": 0.05854598733669394, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 6713 + }, + { + "epoch": 0.05855470862186252, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6714 + }, + { + "epoch": 0.0585634299070311, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 6715 + }, + { + "epoch": 0.05857215119219968, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6716 + }, + { + "epoch": 0.05858087247736826, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 6717 + }, + { + "epoch": 0.05858959376253685, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 6718 + }, + { + "epoch": 0.05859831504770543, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6719 + }, + { + "epoch": 0.058607036332874014, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 6720 + }, + { + "epoch": 0.058615757618042594, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 6721 + }, + { + "epoch": 0.05862447890321118, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6722 + }, + { + "epoch": 0.05863320018837976, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 6723 + }, + { + "epoch": 0.05864192147354834, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 6724 + }, + { + "epoch": 0.058650642758716925, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 6725 + }, + { + "epoch": 0.058659364043885505, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 6726 + }, + { + "epoch": 0.05866808532905409, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6727 + }, + { + "epoch": 0.05867680661422267, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 6728 + }, + { + "epoch": 0.05868552789939126, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 6729 + }, + { + "epoch": 0.05869424918455984, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 6730 + }, + { + "epoch": 0.058702970469728416, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 6731 + }, + { + "epoch": 0.058711691754897, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6732 + }, + { + "epoch": 0.05872041304006558, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 6733 + }, + { + "epoch": 0.05872913432523417, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 6734 + }, + { + "epoch": 0.05873785561040275, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 6735 + }, + { + "epoch": 0.058746576895571334, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6736 + }, + { + "epoch": 0.058755298180739914, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6737 + }, + { + "epoch": 0.05876401946590849, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 6738 + }, + { + "epoch": 0.05877274075107708, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6739 + }, + { + "epoch": 0.05878146203624566, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6740 + }, + { + "epoch": 0.058790183321414245, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 6741 + }, + { + "epoch": 0.058798904606582825, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 6742 + }, + { + "epoch": 0.05880762589175141, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6743 + }, + { + "epoch": 0.05881634717691999, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 6744 + }, + { + "epoch": 0.05882506846208857, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6745 + }, + { + "epoch": 0.058833789747257156, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6746 + }, + { + "epoch": 0.058842511032425736, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 6747 + }, + { + "epoch": 0.05885123231759432, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 6748 + }, + { + "epoch": 0.0588599536027629, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6749 + }, + { + "epoch": 0.05886867488793149, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6750 + }, + { + "epoch": 0.05887739617310007, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 6751 + }, + { + "epoch": 0.058886117458268654, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 6752 + }, + { + "epoch": 0.05889483874343723, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6753 + }, + { + "epoch": 0.05890356002860581, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 6754 + }, + { + "epoch": 0.0589122813137744, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 6755 + }, + { + "epoch": 0.05892100259894298, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6756 + }, + { + "epoch": 0.058929723884111565, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6757 + }, + { + "epoch": 0.058938445169280144, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 6758 + }, + { + "epoch": 0.05894716645444873, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 6759 + }, + { + "epoch": 0.05895588773961731, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 6760 + }, + { + "epoch": 0.05896460902478589, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6761 + }, + { + "epoch": 0.058973330309954476, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 6762 + }, + { + "epoch": 0.058982051595123056, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6763 + }, + { + "epoch": 0.05899077288029164, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 6764 + }, + { + "epoch": 0.05899949416546022, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6765 + }, + { + "epoch": 0.05900821545062881, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 6766 + }, + { + "epoch": 0.05901693673579739, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 6767 + }, + { + "epoch": 0.05902565802096597, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6768 + }, + { + "epoch": 0.05903437930613455, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 6769 + }, + { + "epoch": 0.05904310059130313, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6770 + }, + { + "epoch": 0.05905182187647172, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 6771 + }, + { + "epoch": 0.0590605431616403, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6772 + }, + { + "epoch": 0.059069264446808885, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6773 + }, + { + "epoch": 0.059077985731977464, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 6774 + }, + { + "epoch": 0.059086707017146044, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 6775 + }, + { + "epoch": 0.05909542830231463, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 6776 + }, + { + "epoch": 0.05910414958748321, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 6777 + }, + { + "epoch": 0.059112870872651796, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6778 + }, + { + "epoch": 0.059121592157820375, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 6779 + }, + { + "epoch": 0.05913031344298896, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6780 + }, + { + "epoch": 0.05913903472815754, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 6781 + }, + { + "epoch": 0.05914775601332612, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 6782 + }, + { + "epoch": 0.05915647729849471, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 6783 + }, + { + "epoch": 0.05916519858366329, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 6784 + }, + { + "epoch": 0.05917391986883187, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 6785 + }, + { + "epoch": 0.05918264115400045, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 6786 + }, + { + "epoch": 0.05919136243916904, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 6787 + }, + { + "epoch": 0.05920008372433762, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 6788 + }, + { + "epoch": 0.0592088050095062, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 6789 + }, + { + "epoch": 0.059217526294674784, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 6790 + }, + { + "epoch": 0.059226247579843364, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 6791 + }, + { + "epoch": 0.05923496886501195, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6792 + }, + { + "epoch": 0.05924369015018053, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 6793 + }, + { + "epoch": 0.059252411435349116, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 6794 + }, + { + "epoch": 0.059261132720517695, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 6795 + }, + { + "epoch": 0.059269854005686275, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6796 + }, + { + "epoch": 0.05927857529085486, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 6797 + }, + { + "epoch": 0.05928729657602344, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 6798 + }, + { + "epoch": 0.05929601786119203, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 6799 + }, + { + "epoch": 0.059304739146360606, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 6800 + }, + { + "epoch": 0.05931346043152919, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6801 + }, + { + "epoch": 0.05932218171669777, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 6802 + }, + { + "epoch": 0.05933090300186635, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 6803 + }, + { + "epoch": 0.05933962428703494, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 6804 + }, + { + "epoch": 0.05934834557220352, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6805 + }, + { + "epoch": 0.059357066857372104, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6806 + }, + { + "epoch": 0.05936578814254068, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 6807 + }, + { + "epoch": 0.05937450942770927, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6808 + }, + { + "epoch": 0.05938323071287785, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 6809 + }, + { + "epoch": 0.059391951998046436, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6810 + }, + { + "epoch": 0.059400673283215015, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 6811 + }, + { + "epoch": 0.059409394568383594, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 6812 + }, + { + "epoch": 0.05941811585355218, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6813 + }, + { + "epoch": 0.05942683713872076, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 6814 + }, + { + "epoch": 0.05943555842388935, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6815 + }, + { + "epoch": 0.059444279709057926, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 6816 + }, + { + "epoch": 0.05945300099422651, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6817 + }, + { + "epoch": 0.05946172227939509, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6818 + }, + { + "epoch": 0.05947044356456367, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 6819 + }, + { + "epoch": 0.05947916484973226, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6820 + }, + { + "epoch": 0.05948788613490084, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 6821 + }, + { + "epoch": 0.059496607420069424, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6822 + }, + { + "epoch": 0.059505328705238, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 6823 + }, + { + "epoch": 0.05951404999040659, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 6824 + }, + { + "epoch": 0.05952277127557517, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 6825 + }, + { + "epoch": 0.05953149256074375, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 6826 + }, + { + "epoch": 0.059540213845912335, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6827 + }, + { + "epoch": 0.059548935131080914, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 6828 + }, + { + "epoch": 0.0595576564162495, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 6829 + }, + { + "epoch": 0.05956637770141808, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6830 + }, + { + "epoch": 0.059575098986586666, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 6831 + }, + { + "epoch": 0.059583820271755246, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 6832 + }, + { + "epoch": 0.059592541556923825, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 6833 + }, + { + "epoch": 0.05960126284209241, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 6834 + }, + { + "epoch": 0.05960998412726099, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6835 + }, + { + "epoch": 0.05961870541242958, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6836 + }, + { + "epoch": 0.05962742669759816, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6837 + }, + { + "epoch": 0.05963614798276674, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 6838 + }, + { + "epoch": 0.05964486926793532, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 6839 + }, + { + "epoch": 0.0596535905531039, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 6840 + }, + { + "epoch": 0.05966231183827249, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 6841 + }, + { + "epoch": 0.05967103312344107, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6842 + }, + { + "epoch": 0.059679754408609655, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 6843 + }, + { + "epoch": 0.059688475693778234, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 6844 + }, + { + "epoch": 0.05969719697894682, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6845 + }, + { + "epoch": 0.0597059182641154, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 6846 + }, + { + "epoch": 0.05971463954928398, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 6847 + }, + { + "epoch": 0.059723360834452566, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6848 + }, + { + "epoch": 0.059732082119621145, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6849 + }, + { + "epoch": 0.05974080340478973, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 6850 + }, + { + "epoch": 0.05974952468995831, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 6851 + }, + { + "epoch": 0.0597582459751269, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6852 + }, + { + "epoch": 0.05976696726029548, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 6853 + }, + { + "epoch": 0.059775688545464056, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6854 + }, + { + "epoch": 0.05978440983063264, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 6855 + }, + { + "epoch": 0.05979313111580122, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 6856 + }, + { + "epoch": 0.05980185240096981, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 6857 + }, + { + "epoch": 0.05981057368613839, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6858 + }, + { + "epoch": 0.059819294971306974, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6859 + }, + { + "epoch": 0.059828016256475554, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6860 + }, + { + "epoch": 0.05983673754164413, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 6861 + }, + { + "epoch": 0.05984545882681272, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6862 + }, + { + "epoch": 0.0598541801119813, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6863 + }, + { + "epoch": 0.059862901397149886, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6864 + }, + { + "epoch": 0.059871622682318465, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 6865 + }, + { + "epoch": 0.05988034396748705, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6866 + }, + { + "epoch": 0.05988906525265563, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 6867 + }, + { + "epoch": 0.05989778653782422, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 6868 + }, + { + "epoch": 0.0599065078229928, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6869 + }, + { + "epoch": 0.059915229108161376, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 6870 + }, + { + "epoch": 0.05992395039332996, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 6871 + }, + { + "epoch": 0.05993267167849854, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6872 + }, + { + "epoch": 0.05994139296366713, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 6873 + }, + { + "epoch": 0.05995011424883571, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 6874 + }, + { + "epoch": 0.059958835534004294, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 6875 + }, + { + "epoch": 0.059967556819172874, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6876 + }, + { + "epoch": 0.05997627810434145, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 6877 + }, + { + "epoch": 0.05998499938951004, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 6878 + }, + { + "epoch": 0.05999372067467862, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6879 + }, + { + "epoch": 0.060002441959847205, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 6880 + }, + { + "epoch": 0.060011163245015785, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6881 + }, + { + "epoch": 0.06001988453018437, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6882 + }, + { + "epoch": 0.06002860581535295, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 6883 + }, + { + "epoch": 0.06003732710052153, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 6884 + }, + { + "epoch": 0.060046048385690116, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 6885 + }, + { + "epoch": 0.060054769670858696, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6886 + }, + { + "epoch": 0.06006349095602728, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6887 + }, + { + "epoch": 0.06007221224119586, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 6888 + }, + { + "epoch": 0.06008093352636445, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 6889 + }, + { + "epoch": 0.06008965481153303, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 6890 + }, + { + "epoch": 0.06009837609670161, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6891 + }, + { + "epoch": 0.06010709738187019, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 6892 + }, + { + "epoch": 0.06011581866703877, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6893 + }, + { + "epoch": 0.06012453995220736, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6894 + }, + { + "epoch": 0.06013326123737594, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6895 + }, + { + "epoch": 0.060141982522544525, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 6896 + }, + { + "epoch": 0.060150703807713105, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 6897 + }, + { + "epoch": 0.060159425092881684, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 6898 + }, + { + "epoch": 0.06016814637805027, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 6899 + }, + { + "epoch": 0.06017686766321885, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 6900 + }, + { + "epoch": 0.060185588948387436, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6901 + }, + { + "epoch": 0.060194310233556016, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6902 + }, + { + "epoch": 0.0602030315187246, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6903 + }, + { + "epoch": 0.06021175280389318, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 6904 + }, + { + "epoch": 0.06022047408906176, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 6905 + }, + { + "epoch": 0.06022919537423035, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 6906 + }, + { + "epoch": 0.06023791665939893, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6907 + }, + { + "epoch": 0.06024663794456751, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6908 + }, + { + "epoch": 0.06025535922973609, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 6909 + }, + { + "epoch": 0.06026408051490468, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 6910 + }, + { + "epoch": 0.06027280180007326, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 6911 + }, + { + "epoch": 0.06028152308524184, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6912 + }, + { + "epoch": 0.060290244370410424, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6913 + }, + { + "epoch": 0.060298965655579004, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6914 + }, + { + "epoch": 0.06030768694074759, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 6915 + }, + { + "epoch": 0.06031640822591617, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 6916 + }, + { + "epoch": 0.060325129511084756, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 6917 + }, + { + "epoch": 0.060333850796253335, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 6918 + }, + { + "epoch": 0.060342572081421915, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 6919 + }, + { + "epoch": 0.0603512933665905, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6920 + }, + { + "epoch": 0.06036001465175908, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 6921 + }, + { + "epoch": 0.06036873593692767, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 6922 + }, + { + "epoch": 0.06037745722209625, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 6923 + }, + { + "epoch": 0.06038617850726483, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 6924 + }, + { + "epoch": 0.06039489979243341, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6925 + }, + { + "epoch": 0.060403621077602, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 6926 + }, + { + "epoch": 0.06041234236277058, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6927 + }, + { + "epoch": 0.06042106364793916, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6928 + }, + { + "epoch": 0.060429784933107744, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 6929 + }, + { + "epoch": 0.060438506218276324, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 6930 + }, + { + "epoch": 0.06044722750344491, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 6931 + }, + { + "epoch": 0.06045594878861349, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 6932 + }, + { + "epoch": 0.060464670073782076, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 6933 + }, + { + "epoch": 0.060473391358950655, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 6934 + }, + { + "epoch": 0.060482112644119235, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6935 + }, + { + "epoch": 0.06049083392928782, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 6936 + }, + { + "epoch": 0.0604995552144564, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 6937 + }, + { + "epoch": 0.06050827649962499, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 6938 + }, + { + "epoch": 0.060516997784793566, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 6939 + }, + { + "epoch": 0.06052571906996215, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 6940 + }, + { + "epoch": 0.06053444035513073, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 6941 + }, + { + "epoch": 0.06054316164029931, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 6942 + }, + { + "epoch": 0.0605518829254679, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 6943 + }, + { + "epoch": 0.06056060421063648, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 6944 + }, + { + "epoch": 0.060569325495805064, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 6945 + }, + { + "epoch": 0.06057804678097364, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6946 + }, + { + "epoch": 0.06058676806614223, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6947 + }, + { + "epoch": 0.06059548935131081, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 6948 + }, + { + "epoch": 0.06060421063647939, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6949 + }, + { + "epoch": 0.060612931921647975, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6950 + }, + { + "epoch": 0.060621653206816555, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 6951 + }, + { + "epoch": 0.06063037449198514, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 6952 + }, + { + "epoch": 0.06063909577715372, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 6953 + }, + { + "epoch": 0.06064781706232231, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 6954 + }, + { + "epoch": 0.060656538347490886, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 6955 + }, + { + "epoch": 0.060665259632659466, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 6956 + }, + { + "epoch": 0.06067398091782805, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6957 + }, + { + "epoch": 0.06068270220299663, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 6958 + }, + { + "epoch": 0.06069142348816522, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 6959 + }, + { + "epoch": 0.0607001447733338, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 6960 + }, + { + "epoch": 0.060708866058502384, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 6961 + }, + { + "epoch": 0.06071758734367096, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 6962 + }, + { + "epoch": 0.06072630862883954, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 6963 + }, + { + "epoch": 0.06073502991400813, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 6964 + }, + { + "epoch": 0.06074375119917671, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 6965 + }, + { + "epoch": 0.060752472484345295, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6966 + }, + { + "epoch": 0.060761193769513874, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6967 + }, + { + "epoch": 0.06076991505468246, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 6968 + }, + { + "epoch": 0.06077863633985104, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6969 + }, + { + "epoch": 0.06078735762501962, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 6970 + }, + { + "epoch": 0.060796078910188206, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6971 + }, + { + "epoch": 0.060804800195356785, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 6972 + }, + { + "epoch": 0.06081352148052537, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6973 + }, + { + "epoch": 0.06082224276569395, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 6974 + }, + { + "epoch": 0.06083096405086254, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 6975 + }, + { + "epoch": 0.06083968533603112, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6976 + }, + { + "epoch": 0.0608484066211997, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 6977 + }, + { + "epoch": 0.06085712790636828, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6978 + }, + { + "epoch": 0.06086584919153686, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6979 + }, + { + "epoch": 0.06087457047670545, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 6980 + }, + { + "epoch": 0.06088329176187403, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 6981 + }, + { + "epoch": 0.060892013047042615, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 6982 + }, + { + "epoch": 0.060900734332211194, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 6983 + }, + { + "epoch": 0.06090945561737978, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 6984 + }, + { + "epoch": 0.06091817690254836, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 6985 + }, + { + "epoch": 0.06092689818771694, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 6986 + }, + { + "epoch": 0.060935619472885526, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6987 + }, + { + "epoch": 0.060944340758054105, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6988 + }, + { + "epoch": 0.06095306204322269, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6989 + }, + { + "epoch": 0.06096178332839127, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 6990 + }, + { + "epoch": 0.06097050461355986, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6991 + }, + { + "epoch": 0.06097922589872844, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 6992 + }, + { + "epoch": 0.060987947183897016, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 6993 + }, + { + "epoch": 0.0609966684690656, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 6994 + }, + { + "epoch": 0.06100538975423418, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 6995 + }, + { + "epoch": 0.06101411103940277, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 6996 + }, + { + "epoch": 0.06102283232457135, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 6997 + }, + { + "epoch": 0.061031553609739934, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 6998 + }, + { + "epoch": 0.061040274894908514, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 6999 + }, + { + "epoch": 0.06104899618007709, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 7000 + }, + { + "epoch": 0.06105771746524568, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7001 + }, + { + "epoch": 0.06106643875041426, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 7002 + }, + { + "epoch": 0.061075160035582846, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 7003 + }, + { + "epoch": 0.061083881320751425, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7004 + }, + { + "epoch": 0.06109260260592001, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7005 + }, + { + "epoch": 0.06110132389108859, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 7006 + }, + { + "epoch": 0.06111004517625717, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7007 + }, + { + "epoch": 0.06111876646142576, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7008 + }, + { + "epoch": 0.061127487746594336, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7009 + }, + { + "epoch": 0.06113620903176292, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 7010 + }, + { + "epoch": 0.0611449303169315, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 7011 + }, + { + "epoch": 0.06115365160210009, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 7012 + }, + { + "epoch": 0.06116237288726867, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7013 + }, + { + "epoch": 0.06117109417243725, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 7014 + }, + { + "epoch": 0.061179815457605834, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 7015 + }, + { + "epoch": 0.06118853674277441, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7016 + }, + { + "epoch": 0.061197258027943, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 7017 + }, + { + "epoch": 0.06120597931311158, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7018 + }, + { + "epoch": 0.061214700598280165, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7019 + }, + { + "epoch": 0.061223421883448745, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7020 + }, + { + "epoch": 0.061232143168617324, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 7021 + }, + { + "epoch": 0.06124086445378591, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 7022 + }, + { + "epoch": 0.06124958573895449, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 7023 + }, + { + "epoch": 0.061258307024123076, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7024 + }, + { + "epoch": 0.061267028309291656, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7025 + }, + { + "epoch": 0.06127574959446024, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7026 + }, + { + "epoch": 0.06128447087962882, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 7027 + }, + { + "epoch": 0.0612931921647974, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 7028 + }, + { + "epoch": 0.06130191344996599, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7029 + }, + { + "epoch": 0.06131063473513457, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 7030 + }, + { + "epoch": 0.06131935602030315, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 7031 + }, + { + "epoch": 0.06132807730547173, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7032 + }, + { + "epoch": 0.06133679859064032, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7033 + }, + { + "epoch": 0.0613455198758089, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 7034 + }, + { + "epoch": 0.06135424116097748, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 7035 + }, + { + "epoch": 0.061362962446146065, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7036 + }, + { + "epoch": 0.061371683731314644, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 7037 + }, + { + "epoch": 0.06138040501648323, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7038 + }, + { + "epoch": 0.06138912630165181, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7039 + }, + { + "epoch": 0.061397847586820396, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 7040 + }, + { + "epoch": 0.061406568871988976, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7041 + }, + { + "epoch": 0.06141529015715756, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 7042 + }, + { + "epoch": 0.06142401144232614, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7043 + }, + { + "epoch": 0.06143273272749472, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 7044 + }, + { + "epoch": 0.06144145401266331, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7045 + }, + { + "epoch": 0.06145017529783189, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 7046 + }, + { + "epoch": 0.06145889658300047, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 7047 + }, + { + "epoch": 0.06146761786816905, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 7048 + }, + { + "epoch": 0.06147633915333764, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 7049 + }, + { + "epoch": 0.06148506043850622, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7050 + }, + { + "epoch": 0.0614937817236748, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7051 + }, + { + "epoch": 0.061502503008843384, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 7052 + }, + { + "epoch": 0.061511224294011964, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 7053 + }, + { + "epoch": 0.06151994557918055, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7054 + }, + { + "epoch": 0.06152866686434913, + "grad_norm": 0.34375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 7055 + }, + { + "epoch": 0.061537388149517716, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7056 + }, + { + "epoch": 0.061546109434686296, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 7057 + }, + { + "epoch": 0.061554830719854875, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7058 + }, + { + "epoch": 0.06156355200502346, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7059 + }, + { + "epoch": 0.06157227329019204, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 7060 + }, + { + "epoch": 0.06158099457536063, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 7061 + }, + { + "epoch": 0.06158971586052921, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7062 + }, + { + "epoch": 0.06159843714569779, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 7063 + }, + { + "epoch": 0.06160715843086637, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 7064 + }, + { + "epoch": 0.06161587971603495, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 7065 + }, + { + "epoch": 0.06162460100120354, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 7066 + }, + { + "epoch": 0.06163332228637212, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7067 + }, + { + "epoch": 0.061642043571540704, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7068 + }, + { + "epoch": 0.061650764856709284, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7069 + }, + { + "epoch": 0.06165948614187787, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 7070 + }, + { + "epoch": 0.06166820742704645, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7071 + }, + { + "epoch": 0.06167692871221503, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 7072 + }, + { + "epoch": 0.061685649997383615, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 7073 + }, + { + "epoch": 0.061694371282552195, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 7074 + }, + { + "epoch": 0.06170309256772078, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 7075 + }, + { + "epoch": 0.06171181385288936, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7076 + }, + { + "epoch": 0.06172053513805795, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7077 + }, + { + "epoch": 0.061729256423226526, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 7078 + }, + { + "epoch": 0.061737977708395106, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7079 + }, + { + "epoch": 0.06174669899356369, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 7080 + }, + { + "epoch": 0.06175542027873227, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7081 + }, + { + "epoch": 0.06176414156390086, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7082 + }, + { + "epoch": 0.06177286284906944, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 7083 + }, + { + "epoch": 0.061781584134238024, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7084 + }, + { + "epoch": 0.0617903054194066, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 7085 + }, + { + "epoch": 0.06179902670457518, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7086 + }, + { + "epoch": 0.06180774798974377, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7087 + }, + { + "epoch": 0.06181646927491235, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7088 + }, + { + "epoch": 0.061825190560080935, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 7089 + }, + { + "epoch": 0.061833911845249515, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7090 + }, + { + "epoch": 0.0618426331304181, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7091 + }, + { + "epoch": 0.06185135441558668, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 7092 + }, + { + "epoch": 0.06186007570075526, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 7093 + }, + { + "epoch": 0.061868796985923846, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7094 + }, + { + "epoch": 0.061877518271092426, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7095 + }, + { + "epoch": 0.06188623955626101, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7096 + }, + { + "epoch": 0.06189496084142959, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7097 + }, + { + "epoch": 0.06190368212659818, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 7098 + }, + { + "epoch": 0.06191240341176676, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 7099 + }, + { + "epoch": 0.061921124696935344, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 7100 + }, + { + "epoch": 0.06192984598210392, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7101 + }, + { + "epoch": 0.0619385672672725, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7102 + }, + { + "epoch": 0.06194728855244109, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7103 + }, + { + "epoch": 0.06195600983760967, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 7104 + }, + { + "epoch": 0.061964731122778255, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 7105 + }, + { + "epoch": 0.061973452407946834, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 7106 + }, + { + "epoch": 0.06198217369311542, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 7107 + }, + { + "epoch": 0.061990894978284, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 7108 + }, + { + "epoch": 0.06199961626345258, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 7109 + }, + { + "epoch": 0.062008337548621166, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 7110 + }, + { + "epoch": 0.062017058833789745, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7111 + }, + { + "epoch": 0.06202578011895833, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 7112 + }, + { + "epoch": 0.06203450140412691, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 7113 + }, + { + "epoch": 0.0620432226892955, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 7114 + }, + { + "epoch": 0.06205194397446408, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 7115 + }, + { + "epoch": 0.06206066525963266, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 7116 + }, + { + "epoch": 0.06206938654480124, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 7117 + }, + { + "epoch": 0.06207810782996982, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 7118 + }, + { + "epoch": 0.06208682911513841, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7119 + }, + { + "epoch": 0.06209555040030699, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7120 + }, + { + "epoch": 0.062104271685475575, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7121 + }, + { + "epoch": 0.062112992970644154, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7122 + }, + { + "epoch": 0.062121714255812734, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 7123 + }, + { + "epoch": 0.06213043554098132, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 7124 + }, + { + "epoch": 0.0621391568261499, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 7125 + }, + { + "epoch": 0.062147878111318486, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 7126 + }, + { + "epoch": 0.062156599396487065, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 7127 + }, + { + "epoch": 0.06216532068165565, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 7128 + }, + { + "epoch": 0.06217404196682423, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 7129 + }, + { + "epoch": 0.06218276325199281, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 7130 + }, + { + "epoch": 0.0621914845371614, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 7131 + }, + { + "epoch": 0.062200205822329976, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7132 + }, + { + "epoch": 0.06220892710749856, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7133 + }, + { + "epoch": 0.06221764839266714, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7134 + }, + { + "epoch": 0.06222636967783573, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 7135 + }, + { + "epoch": 0.06223509096300431, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 7136 + }, + { + "epoch": 0.06224381224817289, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7137 + }, + { + "epoch": 0.062252533533341474, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 7138 + }, + { + "epoch": 0.06226125481851005, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 7139 + }, + { + "epoch": 0.06226997610367864, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7140 + }, + { + "epoch": 0.06227869738884722, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7141 + }, + { + "epoch": 0.062287418674015806, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 7142 + }, + { + "epoch": 0.062296139959184385, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 7143 + }, + { + "epoch": 0.062304861244352965, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 7144 + }, + { + "epoch": 0.06231358252952155, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 7145 + }, + { + "epoch": 0.06232230381469013, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7146 + }, + { + "epoch": 0.06233102509985872, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 7147 + }, + { + "epoch": 0.062339746385027296, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7148 + }, + { + "epoch": 0.06234846767019588, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7149 + }, + { + "epoch": 0.06235718895536446, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 7150 + }, + { + "epoch": 0.06236591024053304, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7151 + }, + { + "epoch": 0.06237463152570163, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7152 + }, + { + "epoch": 0.06238335281087021, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 7153 + }, + { + "epoch": 0.062392074096038794, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 7154 + }, + { + "epoch": 0.06240079538120737, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7155 + }, + { + "epoch": 0.06240951666637596, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7156 + }, + { + "epoch": 0.06241823795154454, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7157 + }, + { + "epoch": 0.062426959236713125, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 7158 + }, + { + "epoch": 0.062435680521881705, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 7159 + }, + { + "epoch": 0.062444401807050284, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7160 + }, + { + "epoch": 0.06245312309221887, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7161 + }, + { + "epoch": 0.06246184437738745, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 7162 + }, + { + "epoch": 0.06247056566255604, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 7163 + }, + { + "epoch": 0.062479286947724616, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 7164 + }, + { + "epoch": 0.0624880082328932, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7165 + }, + { + "epoch": 0.06249672951806178, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 7166 + }, + { + "epoch": 0.06250545080323036, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 7167 + }, + { + "epoch": 0.06251417208839895, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 7168 + }, + { + "epoch": 0.06252289337356753, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 7169 + }, + { + "epoch": 0.0625316146587361, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7170 + }, + { + "epoch": 0.06254033594390469, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7171 + }, + { + "epoch": 0.06254905722907328, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 7172 + }, + { + "epoch": 0.06255777851424185, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 7173 + }, + { + "epoch": 0.06256649979941044, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 7174 + }, + { + "epoch": 0.06257522108457902, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7175 + }, + { + "epoch": 0.06258394236974761, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 7176 + }, + { + "epoch": 0.06259266365491618, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 7177 + }, + { + "epoch": 0.06260138494008477, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7178 + }, + { + "epoch": 0.06261010622525336, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 7179 + }, + { + "epoch": 0.06261882751042193, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7180 + }, + { + "epoch": 0.06262754879559052, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 7181 + }, + { + "epoch": 0.0626362700807591, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 7182 + }, + { + "epoch": 0.06264499136592769, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7183 + }, + { + "epoch": 0.06265371265109626, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 7184 + }, + { + "epoch": 0.06266243393626485, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 7185 + }, + { + "epoch": 0.06267115522143343, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7186 + }, + { + "epoch": 0.06267987650660202, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 7187 + }, + { + "epoch": 0.06268859779177059, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 7188 + }, + { + "epoch": 0.06269731907693918, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 7189 + }, + { + "epoch": 0.06270604036210776, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7190 + }, + { + "epoch": 0.06271476164727634, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 7191 + }, + { + "epoch": 0.06272348293244492, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7192 + }, + { + "epoch": 0.06273220421761351, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 7193 + }, + { + "epoch": 0.0627409255027821, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7194 + }, + { + "epoch": 0.06274964678795067, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7195 + }, + { + "epoch": 0.06275836807311926, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 7196 + }, + { + "epoch": 0.06276708935828784, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7197 + }, + { + "epoch": 0.06277581064345641, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7198 + }, + { + "epoch": 0.062784531928625, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7199 + }, + { + "epoch": 0.06279325321379359, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 7200 + }, + { + "epoch": 0.06280197449896217, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7201 + }, + { + "epoch": 0.06281069578413075, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 7202 + }, + { + "epoch": 0.06281941706929933, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7203 + }, + { + "epoch": 0.06282813835446792, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7204 + }, + { + "epoch": 0.06283685963963649, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 7205 + }, + { + "epoch": 0.06284558092480508, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 7206 + }, + { + "epoch": 0.06285430220997366, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 7207 + }, + { + "epoch": 0.06286302349514225, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 7208 + }, + { + "epoch": 0.06287174478031082, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 7209 + }, + { + "epoch": 0.06288046606547941, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7210 + }, + { + "epoch": 0.062889187350648, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7211 + }, + { + "epoch": 0.06289790863581657, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 7212 + }, + { + "epoch": 0.06290662992098515, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7213 + }, + { + "epoch": 0.06291535120615374, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7214 + }, + { + "epoch": 0.06292407249132233, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7215 + }, + { + "epoch": 0.0629327937764909, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 7216 + }, + { + "epoch": 0.06294151506165949, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 7217 + }, + { + "epoch": 0.06295023634682807, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 7218 + }, + { + "epoch": 0.06295895763199665, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 7219 + }, + { + "epoch": 0.06296767891716523, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7220 + }, + { + "epoch": 0.06297640020233382, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7221 + }, + { + "epoch": 0.0629851214875024, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 7222 + }, + { + "epoch": 0.06299384277267098, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 7223 + }, + { + "epoch": 0.06300256405783956, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 7224 + }, + { + "epoch": 0.06301128534300815, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 7225 + }, + { + "epoch": 0.06302000662817672, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 7226 + }, + { + "epoch": 0.06302872791334531, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7227 + }, + { + "epoch": 0.0630374491985139, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 7228 + }, + { + "epoch": 0.06304617048368248, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 7229 + }, + { + "epoch": 0.06305489176885105, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 7230 + }, + { + "epoch": 0.06306361305401964, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 7231 + }, + { + "epoch": 0.06307233433918823, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 7232 + }, + { + "epoch": 0.0630810556243568, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7233 + }, + { + "epoch": 0.06308977690952539, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 7234 + }, + { + "epoch": 0.06309849819469397, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7235 + }, + { + "epoch": 0.06310721947986256, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7236 + }, + { + "epoch": 0.06311594076503113, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7237 + }, + { + "epoch": 0.06312466205019972, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 7238 + }, + { + "epoch": 0.0631333833353683, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 7239 + }, + { + "epoch": 0.06314210462053688, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7240 + }, + { + "epoch": 0.06315082590570546, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7241 + }, + { + "epoch": 0.06315954719087405, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 7242 + }, + { + "epoch": 0.06316826847604264, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 7243 + }, + { + "epoch": 0.06317698976121121, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 7244 + }, + { + "epoch": 0.0631857110463798, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7245 + }, + { + "epoch": 0.06319443233154838, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7246 + }, + { + "epoch": 0.06320315361671695, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 7247 + }, + { + "epoch": 0.06321187490188554, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7248 + }, + { + "epoch": 0.06322059618705413, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 7249 + }, + { + "epoch": 0.06322931747222271, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 7250 + }, + { + "epoch": 0.06323803875739128, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 7251 + }, + { + "epoch": 0.06324676004255987, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 7252 + }, + { + "epoch": 0.06325548132772846, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 7253 + }, + { + "epoch": 0.06326420261289703, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7254 + }, + { + "epoch": 0.06327292389806562, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7255 + }, + { + "epoch": 0.0632816451832342, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7256 + }, + { + "epoch": 0.06329036646840279, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7257 + }, + { + "epoch": 0.06329908775357136, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 7258 + }, + { + "epoch": 0.06330780903873995, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 7259 + }, + { + "epoch": 0.06331653032390853, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7260 + }, + { + "epoch": 0.06332525160907711, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 7261 + }, + { + "epoch": 0.0633339728942457, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 7262 + }, + { + "epoch": 0.06334269417941428, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7263 + }, + { + "epoch": 0.06335141546458287, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7264 + }, + { + "epoch": 0.06336013674975144, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7265 + }, + { + "epoch": 0.06336885803492003, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 7266 + }, + { + "epoch": 0.06337757932008861, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 7267 + }, + { + "epoch": 0.06338630060525718, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7268 + }, + { + "epoch": 0.06339502189042577, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 7269 + }, + { + "epoch": 0.06340374317559436, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 7270 + }, + { + "epoch": 0.06341246446076294, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 7271 + }, + { + "epoch": 0.06342118574593152, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 7272 + }, + { + "epoch": 0.0634299070311001, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 7273 + }, + { + "epoch": 0.06343862831626869, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7274 + }, + { + "epoch": 0.06344734960143726, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7275 + }, + { + "epoch": 0.06345607088660585, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7276 + }, + { + "epoch": 0.06346479217177443, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 7277 + }, + { + "epoch": 0.06347351345694302, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 7278 + }, + { + "epoch": 0.06348223474211159, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 7279 + }, + { + "epoch": 0.06349095602728018, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 7280 + }, + { + "epoch": 0.06349967731244877, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 7281 + }, + { + "epoch": 0.06350839859761734, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 7282 + }, + { + "epoch": 0.06351711988278592, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 7283 + }, + { + "epoch": 0.06352584116795451, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7284 + }, + { + "epoch": 0.0635345624531231, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7285 + }, + { + "epoch": 0.06354328373829167, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7286 + }, + { + "epoch": 0.06355200502346026, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 7287 + }, + { + "epoch": 0.06356072630862884, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7288 + }, + { + "epoch": 0.06356944759379742, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7289 + }, + { + "epoch": 0.063578168878966, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 7290 + }, + { + "epoch": 0.06358689016413459, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7291 + }, + { + "epoch": 0.06359561144930317, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 7292 + }, + { + "epoch": 0.06360433273447175, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 7293 + }, + { + "epoch": 0.06361305401964033, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 7294 + }, + { + "epoch": 0.06362177530480892, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7295 + }, + { + "epoch": 0.0636304965899775, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7296 + }, + { + "epoch": 0.06363921787514608, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 7297 + }, + { + "epoch": 0.06364793916031466, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 7298 + }, + { + "epoch": 0.06365666044548325, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 7299 + }, + { + "epoch": 0.06366538173065182, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7300 + }, + { + "epoch": 0.06367410301582041, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 7301 + }, + { + "epoch": 0.063682824300989, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 7302 + }, + { + "epoch": 0.06369154558615758, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 7303 + }, + { + "epoch": 0.06370026687132616, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7304 + }, + { + "epoch": 0.06370898815649474, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7305 + }, + { + "epoch": 0.06371770944166333, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7306 + }, + { + "epoch": 0.0637264307268319, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7307 + }, + { + "epoch": 0.06373515201200049, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7308 + }, + { + "epoch": 0.06374387329716907, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 7309 + }, + { + "epoch": 0.06375259458233766, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7310 + }, + { + "epoch": 0.06376131586750623, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7311 + }, + { + "epoch": 0.06377003715267482, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 7312 + }, + { + "epoch": 0.0637787584378434, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7313 + }, + { + "epoch": 0.06378747972301198, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 7314 + }, + { + "epoch": 0.06379620100818056, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 7315 + }, + { + "epoch": 0.06380492229334915, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 7316 + }, + { + "epoch": 0.06381364357851774, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 7317 + }, + { + "epoch": 0.06382236486368631, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7318 + }, + { + "epoch": 0.0638310861488549, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7319 + }, + { + "epoch": 0.06383980743402348, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 7320 + }, + { + "epoch": 0.06384852871919205, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 7321 + }, + { + "epoch": 0.06385725000436064, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 7322 + }, + { + "epoch": 0.06386597128952923, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7323 + }, + { + "epoch": 0.06387469257469781, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 7324 + }, + { + "epoch": 0.06388341385986639, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 7325 + }, + { + "epoch": 0.06389213514503497, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 7326 + }, + { + "epoch": 0.06390085643020356, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 7327 + }, + { + "epoch": 0.06390957771537213, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 7328 + }, + { + "epoch": 0.06391829900054072, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 7329 + }, + { + "epoch": 0.0639270202857093, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7330 + }, + { + "epoch": 0.06393574157087789, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7331 + }, + { + "epoch": 0.06394446285604646, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7332 + }, + { + "epoch": 0.06395318414121505, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7333 + }, + { + "epoch": 0.06396190542638364, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 7334 + }, + { + "epoch": 0.06397062671155221, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7335 + }, + { + "epoch": 0.0639793479967208, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 7336 + }, + { + "epoch": 0.06398806928188938, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7337 + }, + { + "epoch": 0.06399679056705797, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7338 + }, + { + "epoch": 0.06400551185222654, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7339 + }, + { + "epoch": 0.06401423313739513, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0875, + "step": 7340 + }, + { + "epoch": 0.06402295442256371, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7341 + }, + { + "epoch": 0.06403167570773229, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 7342 + }, + { + "epoch": 0.06404039699290087, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 7343 + }, + { + "epoch": 0.06404911827806946, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 7344 + }, + { + "epoch": 0.06405783956323804, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7345 + }, + { + "epoch": 0.06406656084840662, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7346 + }, + { + "epoch": 0.0640752821335752, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 7347 + }, + { + "epoch": 0.06408400341874379, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 7348 + }, + { + "epoch": 0.06409272470391236, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 7349 + }, + { + "epoch": 0.06410144598908095, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 7350 + }, + { + "epoch": 0.06411016727424954, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7351 + }, + { + "epoch": 0.06411888855941812, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7352 + }, + { + "epoch": 0.0641276098445867, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7353 + }, + { + "epoch": 0.06413633112975528, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 7354 + }, + { + "epoch": 0.06414505241492387, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 7355 + }, + { + "epoch": 0.06415377370009244, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7356 + }, + { + "epoch": 0.06416249498526103, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 7357 + }, + { + "epoch": 0.06417121627042961, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7358 + }, + { + "epoch": 0.0641799375555982, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7359 + }, + { + "epoch": 0.06418865884076677, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7360 + }, + { + "epoch": 0.06419738012593536, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7361 + }, + { + "epoch": 0.06420610141110394, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 7362 + }, + { + "epoch": 0.06421482269627252, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 7363 + }, + { + "epoch": 0.0642235439814411, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 7364 + }, + { + "epoch": 0.06423226526660969, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 7365 + }, + { + "epoch": 0.06424098655177828, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 7366 + }, + { + "epoch": 0.06424970783694685, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 7367 + }, + { + "epoch": 0.06425842912211543, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 7368 + }, + { + "epoch": 0.06426715040728402, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7369 + }, + { + "epoch": 0.0642758716924526, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 7370 + }, + { + "epoch": 0.06428459297762118, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 7371 + }, + { + "epoch": 0.06429331426278977, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 7372 + }, + { + "epoch": 0.06430203554795835, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 7373 + }, + { + "epoch": 0.06431075683312693, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7374 + }, + { + "epoch": 0.06431947811829551, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7375 + }, + { + "epoch": 0.0643281994034641, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7376 + }, + { + "epoch": 0.06433692068863267, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7377 + }, + { + "epoch": 0.06434564197380126, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7378 + }, + { + "epoch": 0.06435436325896984, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 7379 + }, + { + "epoch": 0.06436308454413843, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 7380 + }, + { + "epoch": 0.064371805829307, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7381 + }, + { + "epoch": 0.06438052711447559, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7382 + }, + { + "epoch": 0.06438924839964418, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7383 + }, + { + "epoch": 0.06439796968481275, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 7384 + }, + { + "epoch": 0.06440669096998133, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7385 + }, + { + "epoch": 0.06441541225514992, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 7386 + }, + { + "epoch": 0.0644241335403185, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 7387 + }, + { + "epoch": 0.06443285482548708, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7388 + }, + { + "epoch": 0.06444157611065567, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 7389 + }, + { + "epoch": 0.06445029739582425, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7390 + }, + { + "epoch": 0.06445901868099282, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 7391 + }, + { + "epoch": 0.06446773996616141, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 7392 + }, + { + "epoch": 0.06447646125133, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7393 + }, + { + "epoch": 0.06448518253649858, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7394 + }, + { + "epoch": 0.06449390382166716, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 7395 + }, + { + "epoch": 0.06450262510683574, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 7396 + }, + { + "epoch": 0.06451134639200433, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7397 + }, + { + "epoch": 0.0645200676771729, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 7398 + }, + { + "epoch": 0.06452878896234149, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7399 + }, + { + "epoch": 0.06453751024751007, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 7400 + }, + { + "epoch": 0.06454623153267866, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 7401 + }, + { + "epoch": 0.06455495281784723, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 7402 + }, + { + "epoch": 0.06456367410301582, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 7403 + }, + { + "epoch": 0.0645723953881844, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 7404 + }, + { + "epoch": 0.06458111667335298, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 7405 + }, + { + "epoch": 0.06458983795852156, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7406 + }, + { + "epoch": 0.06459855924369015, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7407 + }, + { + "epoch": 0.06460728052885874, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 7408 + }, + { + "epoch": 0.06461600181402731, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7409 + }, + { + "epoch": 0.0646247230991959, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7410 + }, + { + "epoch": 0.06463344438436448, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 7411 + }, + { + "epoch": 0.06464216566953307, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 7412 + }, + { + "epoch": 0.06465088695470164, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 7413 + }, + { + "epoch": 0.06465960823987023, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7414 + }, + { + "epoch": 0.06466832952503881, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 7415 + }, + { + "epoch": 0.06467705081020739, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 7416 + }, + { + "epoch": 0.06468577209537597, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7417 + }, + { + "epoch": 0.06469449338054456, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7418 + }, + { + "epoch": 0.06470321466571315, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 7419 + }, + { + "epoch": 0.06471193595088172, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 7420 + }, + { + "epoch": 0.0647206572360503, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7421 + }, + { + "epoch": 0.06472937852121889, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 7422 + }, + { + "epoch": 0.06473809980638746, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7423 + }, + { + "epoch": 0.06474682109155605, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 7424 + }, + { + "epoch": 0.06475554237672464, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 7425 + }, + { + "epoch": 0.06476426366189322, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 7426 + }, + { + "epoch": 0.0647729849470618, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7427 + }, + { + "epoch": 0.06478170623223038, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 7428 + }, + { + "epoch": 0.06479042751739897, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7429 + }, + { + "epoch": 0.06479914880256754, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 7430 + }, + { + "epoch": 0.06480787008773613, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 7431 + }, + { + "epoch": 0.06481659137290471, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 7432 + }, + { + "epoch": 0.0648253126580733, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 7433 + }, + { + "epoch": 0.06483403394324187, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7434 + }, + { + "epoch": 0.06484275522841046, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 7435 + }, + { + "epoch": 0.06485147651357905, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7436 + }, + { + "epoch": 0.06486019779874762, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7437 + }, + { + "epoch": 0.0648689190839162, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 7438 + }, + { + "epoch": 0.06487764036908479, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7439 + }, + { + "epoch": 0.06488636165425338, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7440 + }, + { + "epoch": 0.06489508293942195, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 7441 + }, + { + "epoch": 0.06490380422459054, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 7442 + }, + { + "epoch": 0.06491252550975912, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 7443 + }, + { + "epoch": 0.0649212467949277, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 7444 + }, + { + "epoch": 0.06492996808009628, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 7445 + }, + { + "epoch": 0.06493868936526487, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7446 + }, + { + "epoch": 0.06494741065043345, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 7447 + }, + { + "epoch": 0.06495613193560203, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7448 + }, + { + "epoch": 0.06496485322077061, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7449 + }, + { + "epoch": 0.0649735745059392, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 7450 + }, + { + "epoch": 0.06498229579110777, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 7451 + }, + { + "epoch": 0.06499101707627636, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 7452 + }, + { + "epoch": 0.06499973836144494, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 7453 + }, + { + "epoch": 0.06500845964661353, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 7454 + }, + { + "epoch": 0.0650171809317821, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 7455 + }, + { + "epoch": 0.06502590221695069, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 7456 + }, + { + "epoch": 0.06503462350211928, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7457 + }, + { + "epoch": 0.06504334478728785, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 7458 + }, + { + "epoch": 0.06505206607245644, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 7459 + }, + { + "epoch": 0.06506078735762502, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 7460 + }, + { + "epoch": 0.06506950864279361, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7461 + }, + { + "epoch": 0.06507822992796218, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7462 + }, + { + "epoch": 0.06508695121313077, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 7463 + }, + { + "epoch": 0.06509567249829935, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 7464 + }, + { + "epoch": 0.06510439378346793, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 7465 + }, + { + "epoch": 0.06511311506863651, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 7466 + }, + { + "epoch": 0.0651218363538051, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7467 + }, + { + "epoch": 0.06513055763897369, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 7468 + }, + { + "epoch": 0.06513927892414226, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 7469 + }, + { + "epoch": 0.06514800020931084, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 7470 + }, + { + "epoch": 0.06515672149447943, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 7471 + }, + { + "epoch": 0.065165442779648, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7472 + }, + { + "epoch": 0.06517416406481659, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 7473 + }, + { + "epoch": 0.06518288534998518, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7474 + }, + { + "epoch": 0.06519160663515376, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7475 + }, + { + "epoch": 0.06520032792032233, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7476 + }, + { + "epoch": 0.06520904920549092, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7477 + }, + { + "epoch": 0.06521777049065951, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 7478 + }, + { + "epoch": 0.06522649177582808, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 7479 + }, + { + "epoch": 0.06523521306099667, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 7480 + }, + { + "epoch": 0.06524393434616525, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 7481 + }, + { + "epoch": 0.06525265563133384, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7482 + }, + { + "epoch": 0.06526137691650241, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 7483 + }, + { + "epoch": 0.065270098201671, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 7484 + }, + { + "epoch": 0.06527881948683958, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 7485 + }, + { + "epoch": 0.06528754077200816, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 7486 + }, + { + "epoch": 0.06529626205717674, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7487 + }, + { + "epoch": 0.06530498334234533, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 7488 + }, + { + "epoch": 0.06531370462751392, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 7489 + }, + { + "epoch": 0.06532242591268249, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 7490 + }, + { + "epoch": 0.06533114719785107, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 7491 + }, + { + "epoch": 0.06533986848301966, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7492 + }, + { + "epoch": 0.06534858976818823, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 7493 + }, + { + "epoch": 0.06535731105335682, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7494 + }, + { + "epoch": 0.0653660323385254, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 7495 + }, + { + "epoch": 0.06537475362369399, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 7496 + }, + { + "epoch": 0.06538347490886257, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7497 + }, + { + "epoch": 0.06539219619403115, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 7498 + }, + { + "epoch": 0.06540091747919974, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7499 + }, + { + "epoch": 0.06540963876436831, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 7500 + }, + { + "epoch": 0.0654183600495369, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7501 + }, + { + "epoch": 0.06542708133470548, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 7502 + }, + { + "epoch": 0.06543580261987407, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 7503 + }, + { + "epoch": 0.06544452390504264, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 7504 + }, + { + "epoch": 0.06545324519021123, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 7505 + }, + { + "epoch": 0.06546196647537982, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 7506 + }, + { + "epoch": 0.06547068776054839, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7507 + }, + { + "epoch": 0.06547940904571697, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7508 + }, + { + "epoch": 0.06548813033088556, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7509 + }, + { + "epoch": 0.06549685161605415, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7510 + }, + { + "epoch": 0.06550557290122272, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 7511 + }, + { + "epoch": 0.0655142941863913, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 7512 + }, + { + "epoch": 0.06552301547155989, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7513 + }, + { + "epoch": 0.06553173675672846, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 7514 + }, + { + "epoch": 0.06554045804189705, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 7515 + }, + { + "epoch": 0.06554917932706564, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 7516 + }, + { + "epoch": 0.06555790061223422, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7517 + }, + { + "epoch": 0.0655666218974028, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 7518 + }, + { + "epoch": 0.06557534318257138, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7519 + }, + { + "epoch": 0.06558406446773997, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 7520 + }, + { + "epoch": 0.06559278575290854, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 7521 + }, + { + "epoch": 0.06560150703807713, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7522 + }, + { + "epoch": 0.06561022832324571, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7523 + }, + { + "epoch": 0.0656189496084143, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7524 + }, + { + "epoch": 0.06562767089358287, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 7525 + }, + { + "epoch": 0.06563639217875146, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7526 + }, + { + "epoch": 0.06564511346392005, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7527 + }, + { + "epoch": 0.06565383474908863, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 7528 + }, + { + "epoch": 0.0656625560342572, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7529 + }, + { + "epoch": 0.06567127731942579, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7530 + }, + { + "epoch": 0.06567999860459438, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 7531 + }, + { + "epoch": 0.06568871988976295, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 7532 + }, + { + "epoch": 0.06569744117493154, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 7533 + }, + { + "epoch": 0.06570616246010012, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7534 + }, + { + "epoch": 0.06571488374526871, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 7535 + }, + { + "epoch": 0.06572360503043728, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 7536 + }, + { + "epoch": 0.06573232631560587, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 7537 + }, + { + "epoch": 0.06574104760077445, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7538 + }, + { + "epoch": 0.06574976888594303, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 7539 + }, + { + "epoch": 0.06575849017111161, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 7540 + }, + { + "epoch": 0.0657672114562802, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 7541 + }, + { + "epoch": 0.06577593274144879, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 7542 + }, + { + "epoch": 0.06578465402661736, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7543 + }, + { + "epoch": 0.06579337531178595, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 7544 + }, + { + "epoch": 0.06580209659695453, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7545 + }, + { + "epoch": 0.0658108178821231, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7546 + }, + { + "epoch": 0.06581953916729169, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 7547 + }, + { + "epoch": 0.06582826045246028, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 7548 + }, + { + "epoch": 0.06583698173762886, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7549 + }, + { + "epoch": 0.06584570302279744, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 7550 + }, + { + "epoch": 0.06585442430796602, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7551 + }, + { + "epoch": 0.06586314559313461, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7552 + }, + { + "epoch": 0.06587186687830318, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7553 + }, + { + "epoch": 0.06588058816347177, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 7554 + }, + { + "epoch": 0.06588930944864035, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7555 + }, + { + "epoch": 0.06589803073380894, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 7556 + }, + { + "epoch": 0.06590675201897751, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7557 + }, + { + "epoch": 0.0659154733041461, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 7558 + }, + { + "epoch": 0.06592419458931469, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 7559 + }, + { + "epoch": 0.06593291587448326, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7560 + }, + { + "epoch": 0.06594163715965184, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 7561 + }, + { + "epoch": 0.06595035844482043, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 7562 + }, + { + "epoch": 0.06595907972998902, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 7563 + }, + { + "epoch": 0.06596780101515759, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7564 + }, + { + "epoch": 0.06597652230032618, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7565 + }, + { + "epoch": 0.06598524358549476, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 7566 + }, + { + "epoch": 0.06599396487066334, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7567 + }, + { + "epoch": 0.06600268615583192, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 7568 + }, + { + "epoch": 0.06601140744100051, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 7569 + }, + { + "epoch": 0.0660201287261691, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7570 + }, + { + "epoch": 0.06602885001133767, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 7571 + }, + { + "epoch": 0.06603757129650625, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 7572 + }, + { + "epoch": 0.06604629258167484, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 7573 + }, + { + "epoch": 0.06605501386684341, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 7574 + }, + { + "epoch": 0.066063735152012, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 7575 + }, + { + "epoch": 0.06607245643718059, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 7576 + }, + { + "epoch": 0.06608117772234917, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 7577 + }, + { + "epoch": 0.06608989900751774, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7578 + }, + { + "epoch": 0.06609862029268633, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7579 + }, + { + "epoch": 0.06610734157785492, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7580 + }, + { + "epoch": 0.06611606286302349, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7581 + }, + { + "epoch": 0.06612478414819208, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 7582 + }, + { + "epoch": 0.06613350543336066, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 7583 + }, + { + "epoch": 0.06614222671852925, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 7584 + }, + { + "epoch": 0.06615094800369782, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 7585 + }, + { + "epoch": 0.06615966928886641, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 7586 + }, + { + "epoch": 0.066168390574035, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 7587 + }, + { + "epoch": 0.06617711185920357, + "grad_norm": 0.515625, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7588 + }, + { + "epoch": 0.06618583314437215, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 7589 + }, + { + "epoch": 0.06619455442954074, + "grad_norm": 0.388671875, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7590 + }, + { + "epoch": 0.06620327571470933, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 7591 + }, + { + "epoch": 0.0662119969998779, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7592 + }, + { + "epoch": 0.06622071828504648, + "grad_norm": 0.400390625, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7593 + }, + { + "epoch": 0.06622943957021507, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 7594 + }, + { + "epoch": 0.06623816085538364, + "grad_norm": 0.384765625, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 7595 + }, + { + "epoch": 0.06624688214055223, + "grad_norm": 0.32421875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 7596 + }, + { + "epoch": 0.06625560342572082, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 7597 + }, + { + "epoch": 0.0662643247108894, + "grad_norm": 0.40234375, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7598 + }, + { + "epoch": 0.06627304599605797, + "grad_norm": 0.431640625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 7599 + }, + { + "epoch": 0.06628176728122656, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7600 + }, + { + "epoch": 0.06629048856639515, + "grad_norm": 0.64453125, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 7601 + }, + { + "epoch": 0.06629920985156372, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 7602 + }, + { + "epoch": 0.0663079311367323, + "grad_norm": 0.75, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7603 + }, + { + "epoch": 0.06631665242190089, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 7604 + }, + { + "epoch": 0.06632537370706948, + "grad_norm": 0.78515625, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 7605 + }, + { + "epoch": 0.06633409499223805, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7606 + }, + { + "epoch": 0.06634281627740664, + "grad_norm": 0.71875, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 7607 + }, + { + "epoch": 0.06635153756257522, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7608 + }, + { + "epoch": 0.0663602588477438, + "grad_norm": 0.8515625, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7609 + }, + { + "epoch": 0.06636898013291238, + "grad_norm": 0.388671875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 7610 + }, + { + "epoch": 0.06637770141808097, + "grad_norm": 0.828125, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7611 + }, + { + "epoch": 0.06638642270324956, + "grad_norm": 0.58984375, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 7612 + }, + { + "epoch": 0.06639514398841813, + "grad_norm": 0.71875, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7613 + }, + { + "epoch": 0.06640386527358672, + "grad_norm": 0.83203125, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7614 + }, + { + "epoch": 0.0664125865587553, + "grad_norm": 0.7109375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 7615 + }, + { + "epoch": 0.06642130784392387, + "grad_norm": 0.6640625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 7616 + }, + { + "epoch": 0.06643002912909246, + "grad_norm": 0.373046875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 7617 + }, + { + "epoch": 0.06643875041426105, + "grad_norm": 0.5, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7618 + }, + { + "epoch": 0.06644747169942963, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7619 + }, + { + "epoch": 0.0664561929845982, + "grad_norm": 0.4453125, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7620 + }, + { + "epoch": 0.06646491426976679, + "grad_norm": 0.466796875, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7621 + }, + { + "epoch": 0.06647363555493538, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7622 + }, + { + "epoch": 0.06648235684010395, + "grad_norm": 0.466796875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 7623 + }, + { + "epoch": 0.06649107812527254, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 7624 + }, + { + "epoch": 0.06649979941044112, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 7625 + }, + { + "epoch": 0.06650852069560971, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7626 + }, + { + "epoch": 0.06651724198077828, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7627 + }, + { + "epoch": 0.06652596326594687, + "grad_norm": 0.37890625, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 7628 + }, + { + "epoch": 0.06653468455111546, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 7629 + }, + { + "epoch": 0.06654340583628403, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 7630 + }, + { + "epoch": 0.06655212712145261, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7631 + }, + { + "epoch": 0.0665608484066212, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7632 + }, + { + "epoch": 0.06656956969178979, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 7633 + }, + { + "epoch": 0.06657829097695836, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7634 + }, + { + "epoch": 0.06658701226212695, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7635 + }, + { + "epoch": 0.06659573354729553, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 7636 + }, + { + "epoch": 0.0666044548324641, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7637 + }, + { + "epoch": 0.06661317611763269, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7638 + }, + { + "epoch": 0.06662189740280128, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 7639 + }, + { + "epoch": 0.06663061868796986, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7640 + }, + { + "epoch": 0.06663933997313844, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 7641 + }, + { + "epoch": 0.06664806125830702, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 7642 + }, + { + "epoch": 0.06665678254347561, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7643 + }, + { + "epoch": 0.0666655038286442, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 7644 + }, + { + "epoch": 0.06667422511381277, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7645 + }, + { + "epoch": 0.06668294639898135, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7646 + }, + { + "epoch": 0.06669166768414994, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 7647 + }, + { + "epoch": 0.06670038896931851, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 7648 + }, + { + "epoch": 0.0667091102544871, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 7649 + }, + { + "epoch": 0.06671783153965569, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7650 + }, + { + "epoch": 0.06672655282482427, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7651 + }, + { + "epoch": 0.06673527410999285, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 7652 + }, + { + "epoch": 0.06674399539516143, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 7653 + }, + { + "epoch": 0.06675271668033002, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 7654 + }, + { + "epoch": 0.06676143796549859, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7655 + }, + { + "epoch": 0.06677015925066718, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7656 + }, + { + "epoch": 0.06677888053583576, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7657 + }, + { + "epoch": 0.06678760182100435, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 7658 + }, + { + "epoch": 0.06679632310617292, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7659 + }, + { + "epoch": 0.06680504439134151, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 7660 + }, + { + "epoch": 0.0668137656765101, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 7661 + }, + { + "epoch": 0.06682248696167867, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 7662 + }, + { + "epoch": 0.06683120824684725, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7663 + }, + { + "epoch": 0.06683992953201584, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7664 + }, + { + "epoch": 0.06684865081718443, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 7665 + }, + { + "epoch": 0.066857372102353, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7666 + }, + { + "epoch": 0.06686609338752159, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 7667 + }, + { + "epoch": 0.06687481467269017, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7668 + }, + { + "epoch": 0.06688353595785874, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 7669 + }, + { + "epoch": 0.06689225724302733, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 7670 + }, + { + "epoch": 0.06690097852819592, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 7671 + }, + { + "epoch": 0.0669096998133645, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7672 + }, + { + "epoch": 0.06691842109853308, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7673 + }, + { + "epoch": 0.06692714238370166, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 7674 + }, + { + "epoch": 0.06693586366887025, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 7675 + }, + { + "epoch": 0.06694458495403882, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7676 + }, + { + "epoch": 0.06695330623920741, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 7677 + }, + { + "epoch": 0.066962027524376, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7678 + }, + { + "epoch": 0.06697074880954458, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7679 + }, + { + "epoch": 0.06697947009471315, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 7680 + }, + { + "epoch": 0.06698819137988174, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 7681 + }, + { + "epoch": 0.06699691266505033, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 7682 + }, + { + "epoch": 0.0670056339502189, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 7683 + }, + { + "epoch": 0.06701435523538748, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 7684 + }, + { + "epoch": 0.06702307652055607, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 7685 + }, + { + "epoch": 0.06703179780572466, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7686 + }, + { + "epoch": 0.06704051909089323, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 7687 + }, + { + "epoch": 0.06704924037606182, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 7688 + }, + { + "epoch": 0.0670579616612304, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 7689 + }, + { + "epoch": 0.06706668294639898, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 7690 + }, + { + "epoch": 0.06707540423156756, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 7691 + }, + { + "epoch": 0.06708412551673615, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7692 + }, + { + "epoch": 0.06709284680190473, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7693 + }, + { + "epoch": 0.06710156808707331, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 7694 + }, + { + "epoch": 0.0671102893722419, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 7695 + }, + { + "epoch": 0.06711901065741048, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 7696 + }, + { + "epoch": 0.06712773194257905, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7697 + }, + { + "epoch": 0.06713645322774764, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7698 + }, + { + "epoch": 0.06714517451291623, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7699 + }, + { + "epoch": 0.06715389579808481, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7700 + }, + { + "epoch": 0.06716261708325338, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 7701 + }, + { + "epoch": 0.06717133836842197, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 7702 + }, + { + "epoch": 0.06718005965359056, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 7703 + }, + { + "epoch": 0.06718878093875913, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 7704 + }, + { + "epoch": 0.06719750222392772, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 7705 + }, + { + "epoch": 0.0672062235090963, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 7706 + }, + { + "epoch": 0.06721494479426489, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 7707 + }, + { + "epoch": 0.06722366607943346, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7708 + }, + { + "epoch": 0.06723238736460205, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 7709 + }, + { + "epoch": 0.06724110864977063, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 7710 + }, + { + "epoch": 0.0672498299349392, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7711 + }, + { + "epoch": 0.06725855122010779, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 7712 + }, + { + "epoch": 0.06726727250527638, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7713 + }, + { + "epoch": 0.06727599379044497, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 7714 + }, + { + "epoch": 0.06728471507561354, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7715 + }, + { + "epoch": 0.06729343636078212, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7716 + }, + { + "epoch": 0.06730215764595071, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 7717 + }, + { + "epoch": 0.06731087893111928, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 7718 + }, + { + "epoch": 0.06731960021628787, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 7719 + }, + { + "epoch": 0.06732832150145646, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 7720 + }, + { + "epoch": 0.06733704278662504, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7721 + }, + { + "epoch": 0.06734576407179362, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7722 + }, + { + "epoch": 0.0673544853569622, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7723 + }, + { + "epoch": 0.06736320664213079, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 7724 + }, + { + "epoch": 0.06737192792729936, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7725 + }, + { + "epoch": 0.06738064921246795, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7726 + }, + { + "epoch": 0.06738937049763653, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7727 + }, + { + "epoch": 0.06739809178280512, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 7728 + }, + { + "epoch": 0.06740681306797369, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 7729 + }, + { + "epoch": 0.06741553435314228, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 7730 + }, + { + "epoch": 0.06742425563831086, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7731 + }, + { + "epoch": 0.06743297692347944, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 7732 + }, + { + "epoch": 0.06744169820864802, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 7733 + }, + { + "epoch": 0.06745041949381661, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7734 + }, + { + "epoch": 0.0674591407789852, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 7735 + }, + { + "epoch": 0.06746786206415377, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 7736 + }, + { + "epoch": 0.06747658334932236, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 7737 + }, + { + "epoch": 0.06748530463449094, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7738 + }, + { + "epoch": 0.06749402591965951, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 7739 + }, + { + "epoch": 0.0675027472048281, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 7740 + }, + { + "epoch": 0.06751146848999669, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 7741 + }, + { + "epoch": 0.06752018977516527, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 7742 + }, + { + "epoch": 0.06752891106033385, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7743 + }, + { + "epoch": 0.06753763234550243, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7744 + }, + { + "epoch": 0.06754635363067102, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 7745 + }, + { + "epoch": 0.06755507491583959, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7746 + }, + { + "epoch": 0.06756379620100818, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 7747 + }, + { + "epoch": 0.06757251748617676, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 7748 + }, + { + "epoch": 0.06758123877134535, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7749 + }, + { + "epoch": 0.06758996005651392, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7750 + }, + { + "epoch": 0.06759868134168251, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 7751 + }, + { + "epoch": 0.0676074026268511, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7752 + }, + { + "epoch": 0.06761612391201967, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 7753 + }, + { + "epoch": 0.06762484519718825, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 7754 + }, + { + "epoch": 0.06763356648235684, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7755 + }, + { + "epoch": 0.06764228776752543, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 7756 + }, + { + "epoch": 0.067651009052694, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 7757 + }, + { + "epoch": 0.06765973033786259, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 7758 + }, + { + "epoch": 0.06766845162303117, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7759 + }, + { + "epoch": 0.06767717290819976, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7760 + }, + { + "epoch": 0.06768589419336833, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 7761 + }, + { + "epoch": 0.06769461547853692, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7762 + }, + { + "epoch": 0.0677033367637055, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7763 + }, + { + "epoch": 0.06771205804887408, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 7764 + }, + { + "epoch": 0.06772077933404266, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 7765 + }, + { + "epoch": 0.06772950061921125, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 7766 + }, + { + "epoch": 0.06773822190437984, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 7767 + }, + { + "epoch": 0.06774694318954841, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 7768 + }, + { + "epoch": 0.067755664474717, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7769 + }, + { + "epoch": 0.06776438575988558, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 7770 + }, + { + "epoch": 0.06777310704505415, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 7771 + }, + { + "epoch": 0.06778182833022274, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 7772 + }, + { + "epoch": 0.06779054961539133, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 7773 + }, + { + "epoch": 0.06779927090055991, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 7774 + }, + { + "epoch": 0.06780799218572849, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 7775 + }, + { + "epoch": 0.06781671347089707, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 7776 + }, + { + "epoch": 0.06782543475606566, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 7777 + }, + { + "epoch": 0.06783415604123423, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7778 + }, + { + "epoch": 0.06784287732640282, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 7779 + }, + { + "epoch": 0.0678515986115714, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7780 + }, + { + "epoch": 0.06786031989673999, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 7781 + }, + { + "epoch": 0.06786904118190856, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 7782 + }, + { + "epoch": 0.06787776246707715, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 7783 + }, + { + "epoch": 0.06788648375224574, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 7784 + }, + { + "epoch": 0.06789520503741431, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7785 + }, + { + "epoch": 0.0679039263225829, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 7786 + }, + { + "epoch": 0.06791264760775148, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 7787 + }, + { + "epoch": 0.06792136889292007, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7788 + }, + { + "epoch": 0.06793009017808864, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 7789 + }, + { + "epoch": 0.06793881146325723, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 7790 + }, + { + "epoch": 0.06794753274842581, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7791 + }, + { + "epoch": 0.06795625403359438, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7792 + }, + { + "epoch": 0.06796497531876297, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7793 + }, + { + "epoch": 0.06797369660393156, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 7794 + }, + { + "epoch": 0.06798241788910014, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7795 + }, + { + "epoch": 0.06799113917426872, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 7796 + }, + { + "epoch": 0.0679998604594373, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7797 + }, + { + "epoch": 0.06800858174460589, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7798 + }, + { + "epoch": 0.06801730302977446, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 7799 + }, + { + "epoch": 0.06802602431494305, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 7800 + }, + { + "epoch": 0.06803474560011163, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7801 + }, + { + "epoch": 0.06804346688528022, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7802 + }, + { + "epoch": 0.0680521881704488, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7803 + }, + { + "epoch": 0.06806090945561738, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 7804 + }, + { + "epoch": 0.06806963074078597, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 7805 + }, + { + "epoch": 0.06807835202595454, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7806 + }, + { + "epoch": 0.06808707331112313, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7807 + }, + { + "epoch": 0.06809579459629171, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7808 + }, + { + "epoch": 0.0681045158814603, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7809 + }, + { + "epoch": 0.06811323716662887, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 7810 + }, + { + "epoch": 0.06812195845179746, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 7811 + }, + { + "epoch": 0.06813067973696604, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7812 + }, + { + "epoch": 0.06813940102213462, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 7813 + }, + { + "epoch": 0.0681481223073032, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7814 + }, + { + "epoch": 0.06815684359247179, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 7815 + }, + { + "epoch": 0.06816556487764037, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 7816 + }, + { + "epoch": 0.06817428616280895, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7817 + }, + { + "epoch": 0.06818300744797753, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 7818 + }, + { + "epoch": 0.06819172873314612, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7819 + }, + { + "epoch": 0.06820045001831469, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 7820 + }, + { + "epoch": 0.06820917130348328, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7821 + }, + { + "epoch": 0.06821789258865187, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 7822 + }, + { + "epoch": 0.06822661387382045, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7823 + }, + { + "epoch": 0.06823533515898902, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 7824 + }, + { + "epoch": 0.06824405644415761, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 7825 + }, + { + "epoch": 0.0682527777293262, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7826 + }, + { + "epoch": 0.06826149901449477, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 7827 + }, + { + "epoch": 0.06827022029966336, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7828 + }, + { + "epoch": 0.06827894158483194, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 7829 + }, + { + "epoch": 0.06828766287000053, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7830 + }, + { + "epoch": 0.0682963841551691, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 7831 + }, + { + "epoch": 0.06830510544033769, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7832 + }, + { + "epoch": 0.06831382672550627, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 7833 + }, + { + "epoch": 0.06832254801067485, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 7834 + }, + { + "epoch": 0.06833126929584343, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7835 + }, + { + "epoch": 0.06833999058101202, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 7836 + }, + { + "epoch": 0.0683487118661806, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 7837 + }, + { + "epoch": 0.06835743315134918, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 7838 + }, + { + "epoch": 0.06836615443651776, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 7839 + }, + { + "epoch": 0.06837487572168635, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 7840 + }, + { + "epoch": 0.06838359700685492, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 7841 + }, + { + "epoch": 0.06839231829202351, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 7842 + }, + { + "epoch": 0.0684010395771921, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7843 + }, + { + "epoch": 0.06840976086236068, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7844 + }, + { + "epoch": 0.06841848214752926, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 7845 + }, + { + "epoch": 0.06842720343269784, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 7846 + }, + { + "epoch": 0.06843592471786643, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7847 + }, + { + "epoch": 0.068444646003035, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 7848 + }, + { + "epoch": 0.06845336728820359, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 7849 + }, + { + "epoch": 0.06846208857337217, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 7850 + }, + { + "epoch": 0.06847080985854076, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 7851 + }, + { + "epoch": 0.06847953114370933, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 7852 + }, + { + "epoch": 0.06848825242887792, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7853 + }, + { + "epoch": 0.0684969737140465, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 7854 + }, + { + "epoch": 0.06850569499921508, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 7855 + }, + { + "epoch": 0.06851441628438366, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7856 + }, + { + "epoch": 0.06852313756955225, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 7857 + }, + { + "epoch": 0.06853185885472084, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 7858 + }, + { + "epoch": 0.06854058013988941, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 7859 + }, + { + "epoch": 0.068549301425058, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7860 + }, + { + "epoch": 0.06855802271022658, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7861 + }, + { + "epoch": 0.06856674399539515, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 7862 + }, + { + "epoch": 0.06857546528056374, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 7863 + }, + { + "epoch": 0.06858418656573233, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 7864 + }, + { + "epoch": 0.06859290785090091, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 7865 + }, + { + "epoch": 0.06860162913606949, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 7866 + }, + { + "epoch": 0.06861035042123807, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 7867 + }, + { + "epoch": 0.06861907170640666, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7868 + }, + { + "epoch": 0.06862779299157523, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 7869 + }, + { + "epoch": 0.06863651427674382, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 7870 + }, + { + "epoch": 0.0686452355619124, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 7871 + }, + { + "epoch": 0.06865395684708099, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 7872 + }, + { + "epoch": 0.06866267813224956, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 7873 + }, + { + "epoch": 0.06867139941741815, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7874 + }, + { + "epoch": 0.06868012070258674, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 7875 + }, + { + "epoch": 0.06868884198775532, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 7876 + }, + { + "epoch": 0.0686975632729239, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 7877 + }, + { + "epoch": 0.06870628455809248, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 7878 + }, + { + "epoch": 0.06871500584326107, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 7879 + }, + { + "epoch": 0.06872372712842964, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 7880 + }, + { + "epoch": 0.06873244841359823, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 7881 + }, + { + "epoch": 0.06874116969876681, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 7882 + }, + { + "epoch": 0.0687498909839354, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 7883 + }, + { + "epoch": 0.06875861226910397, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 7884 + }, + { + "epoch": 0.06876733355427256, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7885 + }, + { + "epoch": 0.06877605483944114, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 7886 + }, + { + "epoch": 0.06878477612460972, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 7887 + }, + { + "epoch": 0.0687934974097783, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 7888 + }, + { + "epoch": 0.06880221869494689, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 7889 + }, + { + "epoch": 0.06881093998011548, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 7890 + }, + { + "epoch": 0.06881966126528405, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 7891 + }, + { + "epoch": 0.06882838255045264, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 7892 + }, + { + "epoch": 0.06883710383562122, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 7893 + }, + { + "epoch": 0.0688458251207898, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7894 + }, + { + "epoch": 0.06885454640595838, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 7895 + }, + { + "epoch": 0.06886326769112697, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 7896 + }, + { + "epoch": 0.06887198897629555, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 7897 + }, + { + "epoch": 0.06888071026146413, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 7898 + }, + { + "epoch": 0.06888943154663271, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 7899 + }, + { + "epoch": 0.0688981528318013, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7900 + }, + { + "epoch": 0.06890687411696987, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 7901 + }, + { + "epoch": 0.06891559540213846, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 7902 + }, + { + "epoch": 0.06892431668730704, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 7903 + }, + { + "epoch": 0.06893303797247563, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 7904 + }, + { + "epoch": 0.0689417592576442, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 7905 + }, + { + "epoch": 0.06895048054281279, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7906 + }, + { + "epoch": 0.06895920182798138, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 7907 + }, + { + "epoch": 0.06896792311314995, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7908 + }, + { + "epoch": 0.06897664439831853, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 7909 + }, + { + "epoch": 0.06898536568348712, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7910 + }, + { + "epoch": 0.06899408696865571, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 7911 + }, + { + "epoch": 0.06900280825382428, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 7912 + }, + { + "epoch": 0.06901152953899287, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 7913 + }, + { + "epoch": 0.06902025082416145, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7914 + }, + { + "epoch": 0.06902897210933003, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7915 + }, + { + "epoch": 0.06903769339449861, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 7916 + }, + { + "epoch": 0.0690464146796672, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7917 + }, + { + "epoch": 0.06905513596483578, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7918 + }, + { + "epoch": 0.06906385725000436, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 7919 + }, + { + "epoch": 0.06907257853517294, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7920 + }, + { + "epoch": 0.06908129982034153, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 7921 + }, + { + "epoch": 0.0690900211055101, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 7922 + }, + { + "epoch": 0.06909874239067869, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 7923 + }, + { + "epoch": 0.06910746367584727, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 7924 + }, + { + "epoch": 0.06911618496101586, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 7925 + }, + { + "epoch": 0.06912490624618443, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7926 + }, + { + "epoch": 0.06913362753135302, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 7927 + }, + { + "epoch": 0.0691423488165216, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 7928 + }, + { + "epoch": 0.06915107010169018, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 7929 + }, + { + "epoch": 0.06915979138685877, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 7930 + }, + { + "epoch": 0.06916851267202735, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 7931 + }, + { + "epoch": 0.06917723395719594, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 7932 + }, + { + "epoch": 0.06918595524236451, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 7933 + }, + { + "epoch": 0.0691946765275331, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 7934 + }, + { + "epoch": 0.06920339781270168, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7935 + }, + { + "epoch": 0.06921211909787026, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7936 + }, + { + "epoch": 0.06922084038303884, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 7937 + }, + { + "epoch": 0.06922956166820743, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 7938 + }, + { + "epoch": 0.06923828295337602, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 7939 + }, + { + "epoch": 0.06924700423854459, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 7940 + }, + { + "epoch": 0.06925572552371317, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 7941 + }, + { + "epoch": 0.06926444680888176, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7942 + }, + { + "epoch": 0.06927316809405033, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 7943 + }, + { + "epoch": 0.06928188937921892, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7944 + }, + { + "epoch": 0.0692906106643875, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7945 + }, + { + "epoch": 0.06929933194955609, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7946 + }, + { + "epoch": 0.06930805323472466, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7947 + }, + { + "epoch": 0.06931677451989325, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 7948 + }, + { + "epoch": 0.06932549580506184, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 7949 + }, + { + "epoch": 0.06933421709023041, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 7950 + }, + { + "epoch": 0.069342938375399, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 7951 + }, + { + "epoch": 0.06935165966056758, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 7952 + }, + { + "epoch": 0.06936038094573617, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 7953 + }, + { + "epoch": 0.06936910223090474, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 7954 + }, + { + "epoch": 0.06937782351607333, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7955 + }, + { + "epoch": 0.06938654480124191, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 7956 + }, + { + "epoch": 0.06939526608641049, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7957 + }, + { + "epoch": 0.06940398737157907, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 7958 + }, + { + "epoch": 0.06941270865674766, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7959 + }, + { + "epoch": 0.06942142994191625, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 7960 + }, + { + "epoch": 0.06943015122708482, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 7961 + }, + { + "epoch": 0.0694388725122534, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 7962 + }, + { + "epoch": 0.06944759379742199, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 7963 + }, + { + "epoch": 0.06945631508259056, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 7964 + }, + { + "epoch": 0.06946503636775915, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 7965 + }, + { + "epoch": 0.06947375765292774, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 7966 + }, + { + "epoch": 0.06948247893809632, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7967 + }, + { + "epoch": 0.0694912002232649, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7968 + }, + { + "epoch": 0.06949992150843348, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7969 + }, + { + "epoch": 0.06950864279360207, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7970 + }, + { + "epoch": 0.06951736407877064, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 7971 + }, + { + "epoch": 0.06952608536393923, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7972 + }, + { + "epoch": 0.06953480664910781, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 7973 + }, + { + "epoch": 0.0695435279342764, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7974 + }, + { + "epoch": 0.06955224921944497, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7975 + }, + { + "epoch": 0.06956097050461356, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 7976 + }, + { + "epoch": 0.06956969178978215, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 7977 + }, + { + "epoch": 0.06957841307495072, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7978 + }, + { + "epoch": 0.0695871343601193, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7979 + }, + { + "epoch": 0.06959585564528789, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 7980 + }, + { + "epoch": 0.06960457693045648, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7981 + }, + { + "epoch": 0.06961329821562505, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 7982 + }, + { + "epoch": 0.06962201950079364, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 7983 + }, + { + "epoch": 0.06963074078596222, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7984 + }, + { + "epoch": 0.06963946207113081, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 7985 + }, + { + "epoch": 0.06964818335629938, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 7986 + }, + { + "epoch": 0.06965690464146797, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7987 + }, + { + "epoch": 0.06966562592663655, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 7988 + }, + { + "epoch": 0.06967434721180513, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 7989 + }, + { + "epoch": 0.06968306849697371, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 7990 + }, + { + "epoch": 0.0696917897821423, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 7991 + }, + { + "epoch": 0.06970051106731089, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 7992 + }, + { + "epoch": 0.06970923235247946, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7993 + }, + { + "epoch": 0.06971795363764804, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 7994 + }, + { + "epoch": 0.06972667492281663, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7995 + }, + { + "epoch": 0.0697353962079852, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 7996 + }, + { + "epoch": 0.06974411749315379, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7997 + }, + { + "epoch": 0.06975283877832238, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 7998 + }, + { + "epoch": 0.06976156006349096, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7999 + }, + { + "epoch": 0.06977028134865954, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8000 + }, + { + "epoch": 0.06977900263382812, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 8001 + }, + { + "epoch": 0.06978772391899671, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 8002 + }, + { + "epoch": 0.06979644520416528, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 8003 + }, + { + "epoch": 0.06980516648933387, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 8004 + }, + { + "epoch": 0.06981388777450245, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 8005 + }, + { + "epoch": 0.06982260905967104, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 8006 + }, + { + "epoch": 0.06983133034483961, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 8007 + }, + { + "epoch": 0.0698400516300082, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 8008 + }, + { + "epoch": 0.06984877291517678, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 8009 + }, + { + "epoch": 0.06985749420034536, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8010 + }, + { + "epoch": 0.06986621548551394, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 8011 + }, + { + "epoch": 0.06987493677068253, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8012 + }, + { + "epoch": 0.06988365805585112, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 8013 + }, + { + "epoch": 0.06989237934101969, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 8014 + }, + { + "epoch": 0.06990110062618828, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 8015 + }, + { + "epoch": 0.06990982191135686, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8016 + }, + { + "epoch": 0.06991854319652543, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 8017 + }, + { + "epoch": 0.06992726448169402, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 8018 + }, + { + "epoch": 0.06993598576686261, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 8019 + }, + { + "epoch": 0.0699447070520312, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 8020 + }, + { + "epoch": 0.06995342833719977, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 8021 + }, + { + "epoch": 0.06996214962236835, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8022 + }, + { + "epoch": 0.06997087090753694, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8023 + }, + { + "epoch": 0.06997959219270551, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 8024 + }, + { + "epoch": 0.0699883134778741, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 8025 + }, + { + "epoch": 0.06999703476304268, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8026 + }, + { + "epoch": 0.07000575604821127, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8027 + }, + { + "epoch": 0.07001447733337984, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 8028 + }, + { + "epoch": 0.07002319861854843, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 8029 + }, + { + "epoch": 0.07003191990371702, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 8030 + }, + { + "epoch": 0.07004064118888559, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 8031 + }, + { + "epoch": 0.07004936247405417, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 8032 + }, + { + "epoch": 0.07005808375922276, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 8033 + }, + { + "epoch": 0.07006680504439135, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 8034 + }, + { + "epoch": 0.07007552632955992, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 8035 + }, + { + "epoch": 0.0700842476147285, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 8036 + }, + { + "epoch": 0.07009296889989709, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8037 + }, + { + "epoch": 0.07010169018506567, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8038 + }, + { + "epoch": 0.07011041147023425, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 8039 + }, + { + "epoch": 0.07011913275540284, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8040 + }, + { + "epoch": 0.07012785404057142, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 8041 + }, + { + "epoch": 0.07013657532574, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8042 + }, + { + "epoch": 0.07014529661090858, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8043 + }, + { + "epoch": 0.07015401789607717, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 8044 + }, + { + "epoch": 0.07016273918124574, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8045 + }, + { + "epoch": 0.07017146046641433, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 8046 + }, + { + "epoch": 0.07018018175158292, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 8047 + }, + { + "epoch": 0.0701889030367515, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 8048 + }, + { + "epoch": 0.07019762432192007, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8049 + }, + { + "epoch": 0.07020634560708866, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 8050 + }, + { + "epoch": 0.07021506689225725, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 8051 + }, + { + "epoch": 0.07022378817742582, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8052 + }, + { + "epoch": 0.0702325094625944, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 8053 + }, + { + "epoch": 0.07024123074776299, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 8054 + }, + { + "epoch": 0.07024995203293158, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 8055 + }, + { + "epoch": 0.07025867331810015, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 8056 + }, + { + "epoch": 0.07026739460326874, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 8057 + }, + { + "epoch": 0.07027611588843732, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 8058 + }, + { + "epoch": 0.0702848371736059, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 8059 + }, + { + "epoch": 0.07029355845877448, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8060 + }, + { + "epoch": 0.07030227974394307, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 8061 + }, + { + "epoch": 0.07031100102911166, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 8062 + }, + { + "epoch": 0.07031972231428023, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 8063 + }, + { + "epoch": 0.07032844359944881, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8064 + }, + { + "epoch": 0.0703371648846174, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 8065 + }, + { + "epoch": 0.07034588616978597, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8066 + }, + { + "epoch": 0.07035460745495456, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 8067 + }, + { + "epoch": 0.07036332874012315, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 8068 + }, + { + "epoch": 0.07037205002529173, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 8069 + }, + { + "epoch": 0.0703807713104603, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 8070 + }, + { + "epoch": 0.07038949259562889, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8071 + }, + { + "epoch": 0.07039821388079748, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 8072 + }, + { + "epoch": 0.07040693516596605, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8073 + }, + { + "epoch": 0.07041565645113464, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8074 + }, + { + "epoch": 0.07042437773630322, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 8075 + }, + { + "epoch": 0.07043309902147181, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 8076 + }, + { + "epoch": 0.07044182030664038, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 8077 + }, + { + "epoch": 0.07045054159180897, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8078 + }, + { + "epoch": 0.07045926287697755, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 8079 + }, + { + "epoch": 0.07046798416214613, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 8080 + }, + { + "epoch": 0.07047670544731471, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8081 + }, + { + "epoch": 0.0704854267324833, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 8082 + }, + { + "epoch": 0.07049414801765189, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 8083 + }, + { + "epoch": 0.07050286930282046, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8084 + }, + { + "epoch": 0.07051159058798905, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8085 + }, + { + "epoch": 0.07052031187315763, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 8086 + }, + { + "epoch": 0.0705290331583262, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 8087 + }, + { + "epoch": 0.07053775444349479, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 8088 + }, + { + "epoch": 0.07054647572866338, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 8089 + }, + { + "epoch": 0.07055519701383196, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8090 + }, + { + "epoch": 0.07056391829900054, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8091 + }, + { + "epoch": 0.07057263958416912, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8092 + }, + { + "epoch": 0.07058136086933771, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 8093 + }, + { + "epoch": 0.07059008215450628, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8094 + }, + { + "epoch": 0.07059880343967487, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 8095 + }, + { + "epoch": 0.07060752472484345, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8096 + }, + { + "epoch": 0.07061624601001204, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 8097 + }, + { + "epoch": 0.07062496729518061, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8098 + }, + { + "epoch": 0.0706336885803492, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8099 + }, + { + "epoch": 0.07064240986551779, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8100 + }, + { + "epoch": 0.07065113115068637, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 8101 + }, + { + "epoch": 0.07065985243585494, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8102 + }, + { + "epoch": 0.07066857372102353, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8103 + }, + { + "epoch": 0.07067729500619212, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 8104 + }, + { + "epoch": 0.07068601629136069, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 8105 + }, + { + "epoch": 0.07069473757652928, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 8106 + }, + { + "epoch": 0.07070345886169786, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 8107 + }, + { + "epoch": 0.07071218014686645, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8108 + }, + { + "epoch": 0.07072090143203502, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 8109 + }, + { + "epoch": 0.07072962271720361, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 8110 + }, + { + "epoch": 0.0707383440023722, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8111 + }, + { + "epoch": 0.07074706528754077, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8112 + }, + { + "epoch": 0.07075578657270935, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 8113 + }, + { + "epoch": 0.07076450785787794, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 8114 + }, + { + "epoch": 0.07077322914304653, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8115 + }, + { + "epoch": 0.0707819504282151, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8116 + }, + { + "epoch": 0.07079067171338368, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8117 + }, + { + "epoch": 0.07079939299855227, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 8118 + }, + { + "epoch": 0.07080811428372084, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 8119 + }, + { + "epoch": 0.07081683556888943, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8120 + }, + { + "epoch": 0.07082555685405802, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 8121 + }, + { + "epoch": 0.0708342781392266, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 8122 + }, + { + "epoch": 0.07084299942439518, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 8123 + }, + { + "epoch": 0.07085172070956376, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 8124 + }, + { + "epoch": 0.07086044199473235, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 8125 + }, + { + "epoch": 0.07086916327990092, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8126 + }, + { + "epoch": 0.07087788456506951, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 8127 + }, + { + "epoch": 0.0708866058502381, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 8128 + }, + { + "epoch": 0.07089532713540668, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8129 + }, + { + "epoch": 0.07090404842057525, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8130 + }, + { + "epoch": 0.07091276970574384, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 8131 + }, + { + "epoch": 0.07092149099091243, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8132 + }, + { + "epoch": 0.070930212276081, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8133 + }, + { + "epoch": 0.07093893356124958, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8134 + }, + { + "epoch": 0.07094765484641817, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8135 + }, + { + "epoch": 0.07095637613158676, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 8136 + }, + { + "epoch": 0.07096509741675533, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 8137 + }, + { + "epoch": 0.07097381870192392, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 8138 + }, + { + "epoch": 0.0709825399870925, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8139 + }, + { + "epoch": 0.07099126127226107, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 8140 + }, + { + "epoch": 0.07099998255742966, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 8141 + }, + { + "epoch": 0.07100870384259825, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8142 + }, + { + "epoch": 0.07101742512776683, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 8143 + }, + { + "epoch": 0.0710261464129354, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8144 + }, + { + "epoch": 0.07103486769810399, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 8145 + }, + { + "epoch": 0.07104358898327258, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8146 + }, + { + "epoch": 0.07105231026844115, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 8147 + }, + { + "epoch": 0.07106103155360974, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8148 + }, + { + "epoch": 0.07106975283877832, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 8149 + }, + { + "epoch": 0.07107847412394691, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8150 + }, + { + "epoch": 0.07108719540911548, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8151 + }, + { + "epoch": 0.07109591669428407, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8152 + }, + { + "epoch": 0.07110463797945266, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 8153 + }, + { + "epoch": 0.07111335926462123, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 8154 + }, + { + "epoch": 0.07112208054978982, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 8155 + }, + { + "epoch": 0.0711308018349584, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8156 + }, + { + "epoch": 0.07113952312012699, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 8157 + }, + { + "epoch": 0.07114824440529556, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 8158 + }, + { + "epoch": 0.07115696569046415, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 8159 + }, + { + "epoch": 0.07116568697563273, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 8160 + }, + { + "epoch": 0.0711744082608013, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 8161 + }, + { + "epoch": 0.07118312954596989, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 8162 + }, + { + "epoch": 0.07119185083113848, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 8163 + }, + { + "epoch": 0.07120057211630706, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 8164 + }, + { + "epoch": 0.07120929340147564, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8165 + }, + { + "epoch": 0.07121801468664422, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8166 + }, + { + "epoch": 0.07122673597181281, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 8167 + }, + { + "epoch": 0.07123545725698138, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8168 + }, + { + "epoch": 0.07124417854214997, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8169 + }, + { + "epoch": 0.07125289982731856, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8170 + }, + { + "epoch": 0.07126162111248714, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 8171 + }, + { + "epoch": 0.07127034239765571, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 8172 + }, + { + "epoch": 0.0712790636828243, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8173 + }, + { + "epoch": 0.07128778496799289, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8174 + }, + { + "epoch": 0.07129650625316146, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 8175 + }, + { + "epoch": 0.07130522753833005, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 8176 + }, + { + "epoch": 0.07131394882349863, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 8177 + }, + { + "epoch": 0.07132267010866722, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 8178 + }, + { + "epoch": 0.07133139139383579, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 8179 + }, + { + "epoch": 0.07134011267900438, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 8180 + }, + { + "epoch": 0.07134883396417296, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 8181 + }, + { + "epoch": 0.07135755524934154, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8182 + }, + { + "epoch": 0.07136627653451012, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 8183 + }, + { + "epoch": 0.07137499781967871, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 8184 + }, + { + "epoch": 0.0713837191048473, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8185 + }, + { + "epoch": 0.07139244039001587, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8186 + }, + { + "epoch": 0.07140116167518445, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 8187 + }, + { + "epoch": 0.07140988296035304, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 8188 + }, + { + "epoch": 0.07141860424552161, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 8189 + }, + { + "epoch": 0.0714273255306902, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8190 + }, + { + "epoch": 0.07143604681585879, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 8191 + }, + { + "epoch": 0.07144476810102737, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 8192 + }, + { + "epoch": 0.07145348938619595, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 8193 + }, + { + "epoch": 0.07146221067136453, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 8194 + }, + { + "epoch": 0.07147093195653312, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 8195 + }, + { + "epoch": 0.07147965324170169, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 8196 + }, + { + "epoch": 0.07148837452687028, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 8197 + }, + { + "epoch": 0.07149709581203886, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 8198 + }, + { + "epoch": 0.07150581709720745, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8199 + }, + { + "epoch": 0.07151453838237602, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8200 + }, + { + "epoch": 0.07152325966754461, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8201 + }, + { + "epoch": 0.0715319809527132, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 8202 + }, + { + "epoch": 0.07154070223788177, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8203 + }, + { + "epoch": 0.07154942352305035, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8204 + }, + { + "epoch": 0.07155814480821894, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 8205 + }, + { + "epoch": 0.07156686609338753, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 8206 + }, + { + "epoch": 0.0715755873785561, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 8207 + }, + { + "epoch": 0.07158430866372469, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 8208 + }, + { + "epoch": 0.07159302994889327, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 8209 + }, + { + "epoch": 0.07160175123406184, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 8210 + }, + { + "epoch": 0.07161047251923043, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 8211 + }, + { + "epoch": 0.07161919380439902, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8212 + }, + { + "epoch": 0.0716279150895676, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 8213 + }, + { + "epoch": 0.07163663637473618, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 8214 + }, + { + "epoch": 0.07164535765990476, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 8215 + }, + { + "epoch": 0.07165407894507335, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 8216 + }, + { + "epoch": 0.07166280023024194, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 8217 + }, + { + "epoch": 0.07167152151541051, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 8218 + }, + { + "epoch": 0.0716802428005791, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 8219 + }, + { + "epoch": 0.07168896408574768, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8220 + }, + { + "epoch": 0.07169768537091625, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 8221 + }, + { + "epoch": 0.07170640665608484, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 8222 + }, + { + "epoch": 0.07171512794125343, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8223 + }, + { + "epoch": 0.07172384922642201, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 8224 + }, + { + "epoch": 0.07173257051159058, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 8225 + }, + { + "epoch": 0.07174129179675917, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 8226 + }, + { + "epoch": 0.07175001308192776, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 8227 + }, + { + "epoch": 0.07175873436709633, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 8228 + }, + { + "epoch": 0.07176745565226492, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 8229 + }, + { + "epoch": 0.0717761769374335, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8230 + }, + { + "epoch": 0.07178489822260209, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8231 + }, + { + "epoch": 0.07179361950777066, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 8232 + }, + { + "epoch": 0.07180234079293925, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 8233 + }, + { + "epoch": 0.07181106207810783, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 8234 + }, + { + "epoch": 0.07181978336327641, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 8235 + }, + { + "epoch": 0.071828504648445, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 8236 + }, + { + "epoch": 0.07183722593361358, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 8237 + }, + { + "epoch": 0.07184594721878217, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 8238 + }, + { + "epoch": 0.07185466850395074, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8239 + }, + { + "epoch": 0.07186338978911933, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 8240 + }, + { + "epoch": 0.07187211107428791, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 8241 + }, + { + "epoch": 0.07188083235945648, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 8242 + }, + { + "epoch": 0.07188955364462507, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 8243 + }, + { + "epoch": 0.07189827492979366, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8244 + }, + { + "epoch": 0.07190699621496224, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 8245 + }, + { + "epoch": 0.07191571750013082, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 8246 + }, + { + "epoch": 0.0719244387852994, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 8247 + }, + { + "epoch": 0.07193316007046799, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 8248 + }, + { + "epoch": 0.07194188135563656, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 8249 + }, + { + "epoch": 0.07195060264080515, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8250 + }, + { + "epoch": 0.07195932392597373, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 8251 + }, + { + "epoch": 0.07196804521114232, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 8252 + }, + { + "epoch": 0.07197676649631089, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8253 + }, + { + "epoch": 0.07198548778147948, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 8254 + }, + { + "epoch": 0.07199420906664807, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 8255 + }, + { + "epoch": 0.07200293035181664, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8256 + }, + { + "epoch": 0.07201165163698522, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8257 + }, + { + "epoch": 0.07202037292215381, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 8258 + }, + { + "epoch": 0.0720290942073224, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8259 + }, + { + "epoch": 0.07203781549249097, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 8260 + }, + { + "epoch": 0.07204653677765956, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 8261 + }, + { + "epoch": 0.07205525806282814, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8262 + }, + { + "epoch": 0.07206397934799672, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8263 + }, + { + "epoch": 0.0720727006331653, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 8264 + }, + { + "epoch": 0.07208142191833389, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 8265 + }, + { + "epoch": 0.07209014320350247, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8266 + }, + { + "epoch": 0.07209886448867105, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8267 + }, + { + "epoch": 0.07210758577383963, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 8268 + }, + { + "epoch": 0.07211630705900822, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 8269 + }, + { + "epoch": 0.07212502834417679, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 8270 + }, + { + "epoch": 0.07213374962934538, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8271 + }, + { + "epoch": 0.07214247091451396, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8272 + }, + { + "epoch": 0.07215119219968255, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8273 + }, + { + "epoch": 0.07215991348485112, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 8274 + }, + { + "epoch": 0.07216863477001971, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 8275 + }, + { + "epoch": 0.0721773560551883, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 8276 + }, + { + "epoch": 0.07218607734035687, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 8277 + }, + { + "epoch": 0.07219479862552546, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 8278 + }, + { + "epoch": 0.07220351991069404, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 8279 + }, + { + "epoch": 0.07221224119586263, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 8280 + }, + { + "epoch": 0.0722209624810312, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8281 + }, + { + "epoch": 0.07222968376619979, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 8282 + }, + { + "epoch": 0.07223840505136837, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8283 + }, + { + "epoch": 0.07224712633653695, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 8284 + }, + { + "epoch": 0.07225584762170553, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 8285 + }, + { + "epoch": 0.07226456890687412, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8286 + }, + { + "epoch": 0.0722732901920427, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8287 + }, + { + "epoch": 0.07228201147721128, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 8288 + }, + { + "epoch": 0.07229073276237986, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 8289 + }, + { + "epoch": 0.07229945404754845, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 8290 + }, + { + "epoch": 0.07230817533271702, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 8291 + }, + { + "epoch": 0.07231689661788561, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 8292 + }, + { + "epoch": 0.0723256179030542, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 8293 + }, + { + "epoch": 0.07233433918822278, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 8294 + }, + { + "epoch": 0.07234306047339135, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 8295 + }, + { + "epoch": 0.07235178175855994, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8296 + }, + { + "epoch": 0.07236050304372853, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8297 + }, + { + "epoch": 0.0723692243288971, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 8298 + }, + { + "epoch": 0.07237794561406569, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 8299 + }, + { + "epoch": 0.07238666689923427, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 8300 + }, + { + "epoch": 0.07239538818440286, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8301 + }, + { + "epoch": 0.07240410946957143, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8302 + }, + { + "epoch": 0.07241283075474002, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 8303 + }, + { + "epoch": 0.0724215520399086, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8304 + }, + { + "epoch": 0.07243027332507718, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 8305 + }, + { + "epoch": 0.07243899461024576, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8306 + }, + { + "epoch": 0.07244771589541435, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 8307 + }, + { + "epoch": 0.07245643718058294, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 8308 + }, + { + "epoch": 0.07246515846575151, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 8309 + }, + { + "epoch": 0.0724738797509201, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 8310 + }, + { + "epoch": 0.07248260103608868, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 8311 + }, + { + "epoch": 0.07249132232125725, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 8312 + }, + { + "epoch": 0.07250004360642584, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 8313 + }, + { + "epoch": 0.07250876489159443, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 8314 + }, + { + "epoch": 0.07251748617676301, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8315 + }, + { + "epoch": 0.07252620746193159, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 8316 + }, + { + "epoch": 0.07253492874710017, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 8317 + }, + { + "epoch": 0.07254365003226876, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 8318 + }, + { + "epoch": 0.07255237131743733, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 8319 + }, + { + "epoch": 0.07256109260260592, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 8320 + }, + { + "epoch": 0.0725698138877745, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 8321 + }, + { + "epoch": 0.07257853517294309, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 8322 + }, + { + "epoch": 0.07258725645811166, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8323 + }, + { + "epoch": 0.07259597774328025, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 8324 + }, + { + "epoch": 0.07260469902844884, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 8325 + }, + { + "epoch": 0.07261342031361741, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 8326 + }, + { + "epoch": 0.072622141598786, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 8327 + }, + { + "epoch": 0.07263086288395458, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 8328 + }, + { + "epoch": 0.07263958416912317, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 8329 + }, + { + "epoch": 0.07264830545429174, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 8330 + }, + { + "epoch": 0.07265702673946033, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8331 + }, + { + "epoch": 0.07266574802462891, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 8332 + }, + { + "epoch": 0.0726744693097975, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8333 + }, + { + "epoch": 0.07268319059496607, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 8334 + }, + { + "epoch": 0.07269191188013466, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8335 + }, + { + "epoch": 0.07270063316530324, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8336 + }, + { + "epoch": 0.07270935445047182, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 8337 + }, + { + "epoch": 0.0727180757356404, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 8338 + }, + { + "epoch": 0.07272679702080899, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 8339 + }, + { + "epoch": 0.07273551830597758, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 8340 + }, + { + "epoch": 0.07274423959114615, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 8341 + }, + { + "epoch": 0.07275296087631473, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 8342 + }, + { + "epoch": 0.07276168216148332, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 8343 + }, + { + "epoch": 0.0727704034466519, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 8344 + }, + { + "epoch": 0.07277912473182048, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8345 + }, + { + "epoch": 0.07278784601698907, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 8346 + }, + { + "epoch": 0.07279656730215765, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 8347 + }, + { + "epoch": 0.07280528858732623, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 8348 + }, + { + "epoch": 0.07281400987249481, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 8349 + }, + { + "epoch": 0.0728227311576634, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8350 + }, + { + "epoch": 0.07283145244283197, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 8351 + }, + { + "epoch": 0.07284017372800056, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 8352 + }, + { + "epoch": 0.07284889501316914, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 8353 + }, + { + "epoch": 0.07285761629833773, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 8354 + }, + { + "epoch": 0.0728663375835063, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 8355 + }, + { + "epoch": 0.07287505886867489, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 8356 + }, + { + "epoch": 0.07288378015384347, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 8357 + }, + { + "epoch": 0.07289250143901205, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8358 + }, + { + "epoch": 0.07290122272418063, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 8359 + }, + { + "epoch": 0.07290994400934922, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 8360 + }, + { + "epoch": 0.0729186652945178, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 8361 + }, + { + "epoch": 0.07292738657968638, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 8362 + }, + { + "epoch": 0.07293610786485497, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8363 + }, + { + "epoch": 0.07294482915002355, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8364 + }, + { + "epoch": 0.07295355043519212, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 8365 + }, + { + "epoch": 0.07296227172036071, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 8366 + }, + { + "epoch": 0.0729709930055293, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 8367 + }, + { + "epoch": 0.07297971429069788, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8368 + }, + { + "epoch": 0.07298843557586646, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 8369 + }, + { + "epoch": 0.07299715686103504, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8370 + }, + { + "epoch": 0.07300587814620363, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8371 + }, + { + "epoch": 0.0730145994313722, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8372 + }, + { + "epoch": 0.07302332071654079, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 8373 + }, + { + "epoch": 0.07303204200170937, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8374 + }, + { + "epoch": 0.07304076328687796, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 8375 + }, + { + "epoch": 0.07304948457204653, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 8376 + }, + { + "epoch": 0.07305820585721512, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 8377 + }, + { + "epoch": 0.0730669271423837, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 8378 + }, + { + "epoch": 0.07307564842755228, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8379 + }, + { + "epoch": 0.07308436971272086, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8380 + }, + { + "epoch": 0.07309309099788945, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 8381 + }, + { + "epoch": 0.07310181228305804, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 8382 + }, + { + "epoch": 0.07311053356822661, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8383 + }, + { + "epoch": 0.0731192548533952, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 8384 + }, + { + "epoch": 0.07312797613856378, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 8385 + }, + { + "epoch": 0.07313669742373236, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 8386 + }, + { + "epoch": 0.07314541870890094, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8387 + }, + { + "epoch": 0.07315413999406953, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 8388 + }, + { + "epoch": 0.07316286127923811, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 8389 + }, + { + "epoch": 0.07317158256440669, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 8390 + }, + { + "epoch": 0.07318030384957527, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 8391 + }, + { + "epoch": 0.07318902513474386, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8392 + }, + { + "epoch": 0.07319774641991243, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 8393 + }, + { + "epoch": 0.07320646770508102, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 8394 + }, + { + "epoch": 0.0732151889902496, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8395 + }, + { + "epoch": 0.07322391027541819, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 8396 + }, + { + "epoch": 0.07323263156058676, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 8397 + }, + { + "epoch": 0.07324135284575535, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 8398 + }, + { + "epoch": 0.07325007413092394, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 8399 + }, + { + "epoch": 0.07325879541609251, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8400 + }, + { + "epoch": 0.0732675167012611, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 8401 + }, + { + "epoch": 0.07327623798642968, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8402 + }, + { + "epoch": 0.07328495927159827, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8403 + }, + { + "epoch": 0.07329368055676684, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 8404 + }, + { + "epoch": 0.07330240184193543, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 8405 + }, + { + "epoch": 0.07331112312710401, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8406 + }, + { + "epoch": 0.07331984441227259, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8407 + }, + { + "epoch": 0.07332856569744117, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 8408 + }, + { + "epoch": 0.07333728698260976, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 8409 + }, + { + "epoch": 0.07334600826777835, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 8410 + }, + { + "epoch": 0.07335472955294692, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 8411 + }, + { + "epoch": 0.0733634508381155, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 8412 + }, + { + "epoch": 0.07337217212328409, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8413 + }, + { + "epoch": 0.07338089340845266, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 8414 + }, + { + "epoch": 0.07338961469362125, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 8415 + }, + { + "epoch": 0.07339833597878984, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 8416 + }, + { + "epoch": 0.07340705726395842, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8417 + }, + { + "epoch": 0.073415778549127, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8418 + }, + { + "epoch": 0.07342449983429558, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 8419 + }, + { + "epoch": 0.07343322111946417, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 8420 + }, + { + "epoch": 0.07344194240463274, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 8421 + }, + { + "epoch": 0.07345066368980133, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 8422 + }, + { + "epoch": 0.07345938497496991, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 8423 + }, + { + "epoch": 0.0734681062601385, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 8424 + }, + { + "epoch": 0.07347682754530707, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8425 + }, + { + "epoch": 0.07348554883047566, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8426 + }, + { + "epoch": 0.07349427011564424, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8427 + }, + { + "epoch": 0.07350299140081282, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 8428 + }, + { + "epoch": 0.0735117126859814, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 8429 + }, + { + "epoch": 0.07352043397114999, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 8430 + }, + { + "epoch": 0.07352915525631858, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 8431 + }, + { + "epoch": 0.07353787654148715, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 8432 + }, + { + "epoch": 0.07354659782665574, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8433 + }, + { + "epoch": 0.07355531911182432, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 8434 + }, + { + "epoch": 0.0735640403969929, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8435 + }, + { + "epoch": 0.07357276168216148, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 8436 + }, + { + "epoch": 0.07358148296733007, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 8437 + }, + { + "epoch": 0.07359020425249865, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 8438 + }, + { + "epoch": 0.07359892553766723, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 8439 + }, + { + "epoch": 0.07360764682283581, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 8440 + }, + { + "epoch": 0.0736163681080044, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8441 + }, + { + "epoch": 0.07362508939317297, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 8442 + }, + { + "epoch": 0.07363381067834156, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8443 + }, + { + "epoch": 0.07364253196351014, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 8444 + }, + { + "epoch": 0.07365125324867873, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 8445 + }, + { + "epoch": 0.0736599745338473, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 8446 + }, + { + "epoch": 0.07366869581901589, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 8447 + }, + { + "epoch": 0.07367741710418448, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 8448 + }, + { + "epoch": 0.07368613838935306, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8449 + }, + { + "epoch": 0.07369485967452163, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 8450 + }, + { + "epoch": 0.07370358095969022, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 8451 + }, + { + "epoch": 0.07371230224485881, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 8452 + }, + { + "epoch": 0.07372102353002738, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8453 + }, + { + "epoch": 0.07372974481519597, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 8454 + }, + { + "epoch": 0.07373846610036455, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 8455 + }, + { + "epoch": 0.07374718738553314, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8456 + }, + { + "epoch": 0.07375590867070171, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 8457 + }, + { + "epoch": 0.0737646299558703, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 8458 + }, + { + "epoch": 0.07377335124103888, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 8459 + }, + { + "epoch": 0.07378207252620746, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8460 + }, + { + "epoch": 0.07379079381137604, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 8461 + }, + { + "epoch": 0.07379951509654463, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 8462 + }, + { + "epoch": 0.07380823638171322, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 8463 + }, + { + "epoch": 0.07381695766688179, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 8464 + }, + { + "epoch": 0.07382567895205037, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 8465 + }, + { + "epoch": 0.07383440023721896, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 8466 + }, + { + "epoch": 0.07384312152238753, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 8467 + }, + { + "epoch": 0.07385184280755612, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 8468 + }, + { + "epoch": 0.0738605640927247, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 8469 + }, + { + "epoch": 0.07386928537789329, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 8470 + }, + { + "epoch": 0.07387800666306187, + "grad_norm": 0.341796875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8471 + }, + { + "epoch": 0.07388672794823045, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 8472 + }, + { + "epoch": 0.07389544923339904, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 8473 + }, + { + "epoch": 0.07390417051856761, + "grad_norm": 0.345703125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 8474 + }, + { + "epoch": 0.0739128918037362, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 8475 + }, + { + "epoch": 0.07392161308890478, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 8476 + }, + { + "epoch": 0.07393033437407337, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 8477 + }, + { + "epoch": 0.07393905565924194, + "grad_norm": 0.337890625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8478 + }, + { + "epoch": 0.07394777694441053, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 8479 + }, + { + "epoch": 0.07395649822957912, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 8480 + }, + { + "epoch": 0.07396521951474769, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 8481 + }, + { + "epoch": 0.07397394079991627, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8482 + }, + { + "epoch": 0.07398266208508486, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8483 + }, + { + "epoch": 0.07399138337025345, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8484 + }, + { + "epoch": 0.07400010465542202, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8485 + }, + { + "epoch": 0.0740088259405906, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 8486 + }, + { + "epoch": 0.07401754722575919, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 8487 + }, + { + "epoch": 0.07402626851092776, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 8488 + }, + { + "epoch": 0.07403498979609635, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8489 + }, + { + "epoch": 0.07404371108126494, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 8490 + }, + { + "epoch": 0.07405243236643352, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 8491 + }, + { + "epoch": 0.0740611536516021, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8492 + }, + { + "epoch": 0.07406987493677068, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 8493 + }, + { + "epoch": 0.07407859622193927, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 8494 + }, + { + "epoch": 0.07408731750710784, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 8495 + }, + { + "epoch": 0.07409603879227643, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 8496 + }, + { + "epoch": 0.07410476007744501, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 8497 + }, + { + "epoch": 0.0741134813626136, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 8498 + }, + { + "epoch": 0.07412220264778217, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8499 + }, + { + "epoch": 0.07413092393295076, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 8500 + }, + { + "epoch": 0.07413964521811935, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 8501 + }, + { + "epoch": 0.07414836650328792, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 8502 + }, + { + "epoch": 0.0741570877884565, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8503 + }, + { + "epoch": 0.07416580907362509, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8504 + }, + { + "epoch": 0.07417453035879368, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8505 + }, + { + "epoch": 0.07418325164396225, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 8506 + }, + { + "epoch": 0.07419197292913084, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 8507 + }, + { + "epoch": 0.07420069421429942, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 8508 + }, + { + "epoch": 0.074209415499468, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 8509 + }, + { + "epoch": 0.07421813678463658, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 8510 + }, + { + "epoch": 0.07422685806980517, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 8511 + }, + { + "epoch": 0.07423557935497375, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8512 + }, + { + "epoch": 0.07424430064014233, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 8513 + }, + { + "epoch": 0.07425302192531091, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 8514 + }, + { + "epoch": 0.0742617432104795, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 8515 + }, + { + "epoch": 0.07427046449564807, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8516 + }, + { + "epoch": 0.07427918578081666, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 8517 + }, + { + "epoch": 0.07428790706598525, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 8518 + }, + { + "epoch": 0.07429662835115383, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8519 + }, + { + "epoch": 0.0743053496363224, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 8520 + }, + { + "epoch": 0.07431407092149099, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 8521 + }, + { + "epoch": 0.07432279220665958, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 8522 + }, + { + "epoch": 0.07433151349182815, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 8523 + }, + { + "epoch": 0.07434023477699674, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8524 + }, + { + "epoch": 0.07434895606216532, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 8525 + }, + { + "epoch": 0.07435767734733391, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 8526 + }, + { + "epoch": 0.07436639863250248, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 8527 + }, + { + "epoch": 0.07437511991767107, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 8528 + }, + { + "epoch": 0.07438384120283965, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8529 + }, + { + "epoch": 0.07439256248800823, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 8530 + }, + { + "epoch": 0.07440128377317681, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 8531 + }, + { + "epoch": 0.0744100050583454, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8532 + }, + { + "epoch": 0.07441872634351399, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 8533 + }, + { + "epoch": 0.07442744762868256, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8534 + }, + { + "epoch": 0.07443616891385114, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8535 + }, + { + "epoch": 0.07444489019901973, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 8536 + }, + { + "epoch": 0.0744536114841883, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8537 + }, + { + "epoch": 0.07446233276935689, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 8538 + }, + { + "epoch": 0.07447105405452548, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 8539 + }, + { + "epoch": 0.07447977533969406, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8540 + }, + { + "epoch": 0.07448849662486264, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8541 + }, + { + "epoch": 0.07449721791003122, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 8542 + }, + { + "epoch": 0.07450593919519981, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8543 + }, + { + "epoch": 0.07451466048036838, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 8544 + }, + { + "epoch": 0.07452338176553697, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 8545 + }, + { + "epoch": 0.07453210305070555, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 8546 + }, + { + "epoch": 0.07454082433587414, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 8547 + }, + { + "epoch": 0.07454954562104271, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 8548 + }, + { + "epoch": 0.0745582669062113, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 8549 + }, + { + "epoch": 0.07456698819137988, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8550 + }, + { + "epoch": 0.07457570947654846, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 8551 + }, + { + "epoch": 0.07458443076171704, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 8552 + }, + { + "epoch": 0.07459315204688563, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 8553 + }, + { + "epoch": 0.07460187333205422, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8554 + }, + { + "epoch": 0.07461059461722279, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 8555 + }, + { + "epoch": 0.07461931590239138, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 8556 + }, + { + "epoch": 0.07462803718755996, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8557 + }, + { + "epoch": 0.07463675847272853, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8558 + }, + { + "epoch": 0.07464547975789712, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8559 + }, + { + "epoch": 0.07465420104306571, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 8560 + }, + { + "epoch": 0.0746629223282343, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 8561 + }, + { + "epoch": 0.07467164361340287, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 8562 + }, + { + "epoch": 0.07468036489857145, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 8563 + }, + { + "epoch": 0.07468908618374004, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 8564 + }, + { + "epoch": 0.07469780746890863, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 8565 + }, + { + "epoch": 0.0747065287540772, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 8566 + }, + { + "epoch": 0.07471525003924578, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 8567 + }, + { + "epoch": 0.07472397132441437, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8568 + }, + { + "epoch": 0.07473269260958294, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 8569 + }, + { + "epoch": 0.07474141389475153, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 8570 + }, + { + "epoch": 0.07475013517992012, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 8571 + }, + { + "epoch": 0.0747588564650887, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 8572 + }, + { + "epoch": 0.07476757775025727, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8573 + }, + { + "epoch": 0.07477629903542586, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 8574 + }, + { + "epoch": 0.07478502032059445, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8575 + }, + { + "epoch": 0.07479374160576302, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 8576 + }, + { + "epoch": 0.0748024628909316, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 8577 + }, + { + "epoch": 0.07481118417610019, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8578 + }, + { + "epoch": 0.07481990546126878, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 8579 + }, + { + "epoch": 0.07482862674643735, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 8580 + }, + { + "epoch": 0.07483734803160594, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 8581 + }, + { + "epoch": 0.07484606931677452, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 8582 + }, + { + "epoch": 0.0748547906019431, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 8583 + }, + { + "epoch": 0.07486351188711168, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 8584 + }, + { + "epoch": 0.07487223317228027, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 8585 + }, + { + "epoch": 0.07488095445744886, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 8586 + }, + { + "epoch": 0.07488967574261743, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 8587 + }, + { + "epoch": 0.07489839702778602, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 8588 + }, + { + "epoch": 0.0749071183129546, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8589 + }, + { + "epoch": 0.07491583959812317, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8590 + }, + { + "epoch": 0.07492456088329176, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8591 + }, + { + "epoch": 0.07493328216846035, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8592 + }, + { + "epoch": 0.07494200345362893, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 8593 + }, + { + "epoch": 0.0749507247387975, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 8594 + }, + { + "epoch": 0.07495944602396609, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 8595 + }, + { + "epoch": 0.07496816730913468, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 8596 + }, + { + "epoch": 0.07497688859430325, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8597 + }, + { + "epoch": 0.07498560987947184, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 8598 + }, + { + "epoch": 0.07499433116464042, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 8599 + }, + { + "epoch": 0.07500305244980901, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 8600 + }, + { + "epoch": 0.07501177373497758, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 8601 + }, + { + "epoch": 0.07502049502014617, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 8602 + }, + { + "epoch": 0.07502921630531476, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 8603 + }, + { + "epoch": 0.07503793759048333, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 8604 + }, + { + "epoch": 0.07504665887565191, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 8605 + }, + { + "epoch": 0.0750553801608205, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 8606 + }, + { + "epoch": 0.07506410144598909, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 8607 + }, + { + "epoch": 0.07507282273115766, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 8608 + }, + { + "epoch": 0.07508154401632625, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 8609 + }, + { + "epoch": 0.07509026530149483, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 8610 + }, + { + "epoch": 0.0750989865866634, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 8611 + }, + { + "epoch": 0.07510770787183199, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 8612 + }, + { + "epoch": 0.07511642915700058, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 8613 + }, + { + "epoch": 0.07512515044216916, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 8614 + }, + { + "epoch": 0.07513387172733774, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8615 + }, + { + "epoch": 0.07514259301250632, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 8616 + }, + { + "epoch": 0.07515131429767491, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8617 + }, + { + "epoch": 0.07516003558284348, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 8618 + }, + { + "epoch": 0.07516875686801207, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 8619 + }, + { + "epoch": 0.07517747815318065, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 8620 + }, + { + "epoch": 0.07518619943834924, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 8621 + }, + { + "epoch": 0.07519492072351781, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 8622 + }, + { + "epoch": 0.0752036420086864, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 8623 + }, + { + "epoch": 0.07521236329385499, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8624 + }, + { + "epoch": 0.07522108457902356, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 8625 + }, + { + "epoch": 0.07522980586419215, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 8626 + }, + { + "epoch": 0.07523852714936073, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 8627 + }, + { + "epoch": 0.07524724843452932, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 8628 + }, + { + "epoch": 0.07525596971969789, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 8629 + }, + { + "epoch": 0.07526469100486648, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8630 + }, + { + "epoch": 0.07527341229003506, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8631 + }, + { + "epoch": 0.07528213357520364, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 8632 + }, + { + "epoch": 0.07529085486037222, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 8633 + }, + { + "epoch": 0.07529957614554081, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 8634 + }, + { + "epoch": 0.0753082974307094, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 8635 + }, + { + "epoch": 0.07531701871587797, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 8636 + }, + { + "epoch": 0.07532574000104655, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 8637 + }, + { + "epoch": 0.07533446128621514, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 8638 + }, + { + "epoch": 0.07534318257138371, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8639 + }, + { + "epoch": 0.0753519038565523, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8640 + }, + { + "epoch": 0.07536062514172089, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 8641 + }, + { + "epoch": 0.07536934642688947, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 8642 + }, + { + "epoch": 0.07537806771205804, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 8643 + }, + { + "epoch": 0.07538678899722663, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 8644 + }, + { + "epoch": 0.07539551028239522, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8645 + }, + { + "epoch": 0.07540423156756379, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 8646 + }, + { + "epoch": 0.07541295285273238, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8647 + }, + { + "epoch": 0.07542167413790096, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 8648 + }, + { + "epoch": 0.07543039542306955, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 8649 + }, + { + "epoch": 0.07543911670823812, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8650 + }, + { + "epoch": 0.07544783799340671, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 8651 + }, + { + "epoch": 0.0754565592785753, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 8652 + }, + { + "epoch": 0.07546528056374387, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 8653 + }, + { + "epoch": 0.07547400184891245, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8654 + }, + { + "epoch": 0.07548272313408104, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8655 + }, + { + "epoch": 0.07549144441924963, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8656 + }, + { + "epoch": 0.0755001657044182, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 8657 + }, + { + "epoch": 0.07550888698958678, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8658 + }, + { + "epoch": 0.07551760827475537, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 8659 + }, + { + "epoch": 0.07552632955992394, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 8660 + }, + { + "epoch": 0.07553505084509253, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 8661 + }, + { + "epoch": 0.07554377213026112, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 8662 + }, + { + "epoch": 0.0755524934154297, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8663 + }, + { + "epoch": 0.07556121470059828, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8664 + }, + { + "epoch": 0.07556993598576686, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 8665 + }, + { + "epoch": 0.07557865727093545, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 8666 + }, + { + "epoch": 0.07558737855610402, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 8667 + }, + { + "epoch": 0.07559609984127261, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 8668 + }, + { + "epoch": 0.0756048211264412, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 8669 + }, + { + "epoch": 0.07561354241160978, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8670 + }, + { + "epoch": 0.07562226369677835, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 8671 + }, + { + "epoch": 0.07563098498194694, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 8672 + }, + { + "epoch": 0.07563970626711553, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8673 + }, + { + "epoch": 0.0756484275522841, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 8674 + }, + { + "epoch": 0.07565714883745268, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 8675 + }, + { + "epoch": 0.07566587012262127, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 8676 + }, + { + "epoch": 0.07567459140778986, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 8677 + }, + { + "epoch": 0.07568331269295843, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 8678 + }, + { + "epoch": 0.07569203397812702, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8679 + }, + { + "epoch": 0.0757007552632956, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8680 + }, + { + "epoch": 0.07570947654846419, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 8681 + }, + { + "epoch": 0.07571819783363276, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 8682 + }, + { + "epoch": 0.07572691911880135, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8683 + }, + { + "epoch": 0.07573564040396993, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 8684 + }, + { + "epoch": 0.0757443616891385, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 8685 + }, + { + "epoch": 0.07575308297430709, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 8686 + }, + { + "epoch": 0.07576180425947568, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8687 + }, + { + "epoch": 0.07577052554464427, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 8688 + }, + { + "epoch": 0.07577924682981284, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8689 + }, + { + "epoch": 0.07578796811498142, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 8690 + }, + { + "epoch": 0.07579668940015001, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 8691 + }, + { + "epoch": 0.07580541068531858, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 8692 + }, + { + "epoch": 0.07581413197048717, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 8693 + }, + { + "epoch": 0.07582285325565576, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 8694 + }, + { + "epoch": 0.07583157454082434, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 8695 + }, + { + "epoch": 0.07584029582599291, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 8696 + }, + { + "epoch": 0.0758490171111615, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 8697 + }, + { + "epoch": 0.07585773839633009, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 8698 + }, + { + "epoch": 0.07586645968149866, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 8699 + }, + { + "epoch": 0.07587518096666725, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8700 + }, + { + "epoch": 0.07588390225183583, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 8701 + }, + { + "epoch": 0.07589262353700442, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 8702 + }, + { + "epoch": 0.07590134482217299, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 8703 + }, + { + "epoch": 0.07591006610734158, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 8704 + }, + { + "epoch": 0.07591878739251016, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 8705 + }, + { + "epoch": 0.07592750867767874, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 8706 + }, + { + "epoch": 0.07593622996284732, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 8707 + }, + { + "epoch": 0.07594495124801591, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8708 + }, + { + "epoch": 0.0759536725331845, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 8709 + }, + { + "epoch": 0.07596239381835307, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 8710 + }, + { + "epoch": 0.07597111510352166, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 8711 + }, + { + "epoch": 0.07597983638869024, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8712 + }, + { + "epoch": 0.07598855767385881, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 8713 + }, + { + "epoch": 0.0759972789590274, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 8714 + }, + { + "epoch": 0.07600600024419599, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 8715 + }, + { + "epoch": 0.07601472152936457, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8716 + }, + { + "epoch": 0.07602344281453315, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 8717 + }, + { + "epoch": 0.07603216409970173, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 8718 + }, + { + "epoch": 0.07604088538487032, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8719 + }, + { + "epoch": 0.07604960667003889, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 8720 + }, + { + "epoch": 0.07605832795520748, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8721 + }, + { + "epoch": 0.07606704924037606, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 8722 + }, + { + "epoch": 0.07607577052554465, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8723 + }, + { + "epoch": 0.07608449181071322, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8724 + }, + { + "epoch": 0.07609321309588181, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 8725 + }, + { + "epoch": 0.0761019343810504, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 8726 + }, + { + "epoch": 0.07611065566621897, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 8727 + }, + { + "epoch": 0.07611937695138755, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 8728 + }, + { + "epoch": 0.07612809823655614, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8729 + }, + { + "epoch": 0.07613681952172473, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 8730 + }, + { + "epoch": 0.0761455408068933, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 8731 + }, + { + "epoch": 0.07615426209206189, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 8732 + }, + { + "epoch": 0.07616298337723047, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8733 + }, + { + "epoch": 0.07617170466239905, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 8734 + }, + { + "epoch": 0.07618042594756763, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 8735 + }, + { + "epoch": 0.07618914723273622, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 8736 + }, + { + "epoch": 0.0761978685179048, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 8737 + }, + { + "epoch": 0.07620658980307338, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 8738 + }, + { + "epoch": 0.07621531108824196, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 8739 + }, + { + "epoch": 0.07622403237341055, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 8740 + }, + { + "epoch": 0.07623275365857912, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 8741 + }, + { + "epoch": 0.07624147494374771, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8742 + }, + { + "epoch": 0.0762501962289163, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 8743 + }, + { + "epoch": 0.07625891751408488, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 8744 + }, + { + "epoch": 0.07626763879925345, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 8745 + }, + { + "epoch": 0.07627636008442204, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 8746 + }, + { + "epoch": 0.07628508136959063, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 8747 + }, + { + "epoch": 0.0762938026547592, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 8748 + }, + { + "epoch": 0.07630252393992779, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8749 + }, + { + "epoch": 0.07631124522509637, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 8750 + }, + { + "epoch": 0.07631996651026496, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 8751 + }, + { + "epoch": 0.07632868779543353, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 8752 + }, + { + "epoch": 0.07633740908060212, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 8753 + }, + { + "epoch": 0.0763461303657707, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 8754 + }, + { + "epoch": 0.07635485165093928, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 8755 + }, + { + "epoch": 0.07636357293610786, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 8756 + }, + { + "epoch": 0.07637229422127645, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8757 + }, + { + "epoch": 0.07638101550644504, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 8758 + }, + { + "epoch": 0.07638973679161361, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8759 + }, + { + "epoch": 0.0763984580767822, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 8760 + }, + { + "epoch": 0.07640717936195078, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 8761 + }, + { + "epoch": 0.07641590064711935, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 8762 + }, + { + "epoch": 0.07642462193228794, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 8763 + }, + { + "epoch": 0.07643334321745653, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8764 + }, + { + "epoch": 0.07644206450262511, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 8765 + }, + { + "epoch": 0.07645078578779368, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8766 + }, + { + "epoch": 0.07645950707296227, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 8767 + }, + { + "epoch": 0.07646822835813086, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 8768 + }, + { + "epoch": 0.07647694964329943, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 8769 + }, + { + "epoch": 0.07648567092846802, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 8770 + }, + { + "epoch": 0.0764943922136366, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 8771 + }, + { + "epoch": 0.07650311349880519, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8772 + }, + { + "epoch": 0.07651183478397376, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 8773 + }, + { + "epoch": 0.07652055606914235, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 8774 + }, + { + "epoch": 0.07652927735431093, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8775 + }, + { + "epoch": 0.07653799863947951, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 8776 + }, + { + "epoch": 0.0765467199246481, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 8777 + }, + { + "epoch": 0.07655544120981668, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 8778 + }, + { + "epoch": 0.07656416249498527, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 8779 + }, + { + "epoch": 0.07657288378015384, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 8780 + }, + { + "epoch": 0.07658160506532243, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 8781 + }, + { + "epoch": 0.07659032635049101, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8782 + }, + { + "epoch": 0.07659904763565958, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 8783 + }, + { + "epoch": 0.07660776892082817, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 8784 + }, + { + "epoch": 0.07661649020599676, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8785 + }, + { + "epoch": 0.07662521149116534, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 8786 + }, + { + "epoch": 0.07663393277633392, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 8787 + }, + { + "epoch": 0.0766426540615025, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 8788 + }, + { + "epoch": 0.07665137534667109, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 8789 + }, + { + "epoch": 0.07666009663183967, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 8790 + }, + { + "epoch": 0.07666881791700825, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 8791 + }, + { + "epoch": 0.07667753920217683, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 8792 + }, + { + "epoch": 0.07668626048734542, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 8793 + }, + { + "epoch": 0.07669498177251399, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8794 + }, + { + "epoch": 0.07670370305768258, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 8795 + }, + { + "epoch": 0.07671242434285117, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 8796 + }, + { + "epoch": 0.07672114562801975, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 8797 + }, + { + "epoch": 0.07672986691318832, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 8798 + }, + { + "epoch": 0.07673858819835691, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 8799 + }, + { + "epoch": 0.0767473094835255, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 8800 + }, + { + "epoch": 0.07675603076869407, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 8801 + }, + { + "epoch": 0.07676475205386266, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 8802 + }, + { + "epoch": 0.07677347333903124, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 8803 + }, + { + "epoch": 0.07678219462419983, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8804 + }, + { + "epoch": 0.0767909159093684, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8805 + }, + { + "epoch": 0.07679963719453699, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8806 + }, + { + "epoch": 0.07680835847970557, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 8807 + }, + { + "epoch": 0.07681707976487415, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8808 + }, + { + "epoch": 0.07682580105004273, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 8809 + }, + { + "epoch": 0.07683452233521132, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 8810 + }, + { + "epoch": 0.0768432436203799, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 8811 + }, + { + "epoch": 0.07685196490554848, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 8812 + }, + { + "epoch": 0.07686068619071706, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 8813 + }, + { + "epoch": 0.07686940747588565, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8814 + }, + { + "epoch": 0.07687812876105422, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 8815 + }, + { + "epoch": 0.07688685004622281, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8816 + }, + { + "epoch": 0.0768955713313914, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8817 + }, + { + "epoch": 0.07690429261655998, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8818 + }, + { + "epoch": 0.07691301390172856, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 8819 + }, + { + "epoch": 0.07692173518689714, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 8820 + }, + { + "epoch": 0.07693045647206573, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 8821 + }, + { + "epoch": 0.0769391777572343, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 8822 + }, + { + "epoch": 0.07694789904240289, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 8823 + }, + { + "epoch": 0.07695662032757147, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 8824 + }, + { + "epoch": 0.07696534161274006, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8825 + }, + { + "epoch": 0.07697406289790863, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 8826 + }, + { + "epoch": 0.07698278418307722, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 8827 + }, + { + "epoch": 0.0769915054682458, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 8828 + }, + { + "epoch": 0.07700022675341438, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 8829 + }, + { + "epoch": 0.07700894803858296, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 8830 + }, + { + "epoch": 0.07701766932375155, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8831 + }, + { + "epoch": 0.07702639060892014, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8832 + }, + { + "epoch": 0.07703511189408871, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8833 + }, + { + "epoch": 0.0770438331792573, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8834 + }, + { + "epoch": 0.07705255446442588, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 8835 + }, + { + "epoch": 0.07706127574959445, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 8836 + }, + { + "epoch": 0.07706999703476304, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 8837 + }, + { + "epoch": 0.07707871831993163, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8838 + }, + { + "epoch": 0.07708743960510021, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 8839 + }, + { + "epoch": 0.07709616089026879, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 8840 + }, + { + "epoch": 0.07710488217543737, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 8841 + }, + { + "epoch": 0.07711360346060596, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8842 + }, + { + "epoch": 0.07712232474577453, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 8843 + }, + { + "epoch": 0.07713104603094312, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 8844 + }, + { + "epoch": 0.0771397673161117, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 8845 + }, + { + "epoch": 0.07714848860128029, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 8846 + }, + { + "epoch": 0.07715720988644886, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 8847 + }, + { + "epoch": 0.07716593117161745, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 8848 + }, + { + "epoch": 0.07717465245678604, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 8849 + }, + { + "epoch": 0.07718337374195461, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 8850 + }, + { + "epoch": 0.0771920950271232, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8851 + }, + { + "epoch": 0.07720081631229178, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 8852 + }, + { + "epoch": 0.07720953759746037, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 8853 + }, + { + "epoch": 0.07721825888262894, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 8854 + }, + { + "epoch": 0.07722698016779753, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 8855 + }, + { + "epoch": 0.07723570145296611, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 8856 + }, + { + "epoch": 0.07724442273813469, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 8857 + }, + { + "epoch": 0.07725314402330327, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8858 + }, + { + "epoch": 0.07726186530847186, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8859 + }, + { + "epoch": 0.07727058659364044, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8860 + }, + { + "epoch": 0.07727930787880902, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 8861 + }, + { + "epoch": 0.0772880291639776, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 8862 + }, + { + "epoch": 0.07729675044914619, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8863 + }, + { + "epoch": 0.07730547173431476, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 8864 + }, + { + "epoch": 0.07731419301948335, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 8865 + }, + { + "epoch": 0.07732291430465194, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 8866 + }, + { + "epoch": 0.07733163558982052, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 8867 + }, + { + "epoch": 0.0773403568749891, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 8868 + }, + { + "epoch": 0.07734907816015768, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 8869 + }, + { + "epoch": 0.07735779944532627, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 8870 + }, + { + "epoch": 0.07736652073049484, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 8871 + }, + { + "epoch": 0.07737524201566343, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 8872 + }, + { + "epoch": 0.07738396330083201, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 8873 + }, + { + "epoch": 0.0773926845860006, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 8874 + }, + { + "epoch": 0.07740140587116917, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 8875 + }, + { + "epoch": 0.07741012715633776, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 8876 + }, + { + "epoch": 0.07741884844150634, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 8877 + }, + { + "epoch": 0.07742756972667492, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8878 + }, + { + "epoch": 0.0774362910118435, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 8879 + }, + { + "epoch": 0.07744501229701209, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 8880 + }, + { + "epoch": 0.07745373358218068, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 8881 + }, + { + "epoch": 0.07746245486734925, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 8882 + }, + { + "epoch": 0.07747117615251783, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8883 + }, + { + "epoch": 0.07747989743768642, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 8884 + }, + { + "epoch": 0.077488618722855, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 8885 + }, + { + "epoch": 0.07749734000802358, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8886 + }, + { + "epoch": 0.07750606129319217, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 8887 + }, + { + "epoch": 0.07751478257836075, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 8888 + }, + { + "epoch": 0.07752350386352932, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 8889 + }, + { + "epoch": 0.07753222514869791, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 8890 + }, + { + "epoch": 0.0775409464338665, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 8891 + }, + { + "epoch": 0.07754966771903507, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8892 + }, + { + "epoch": 0.07755838900420366, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8893 + }, + { + "epoch": 0.07756711028937224, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 8894 + }, + { + "epoch": 0.07757583157454083, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8895 + }, + { + "epoch": 0.0775845528597094, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 8896 + }, + { + "epoch": 0.07759327414487799, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 8897 + }, + { + "epoch": 0.07760199543004657, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 8898 + }, + { + "epoch": 0.07761071671521515, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 8899 + }, + { + "epoch": 0.07761943800038373, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 8900 + }, + { + "epoch": 0.07762815928555232, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8901 + }, + { + "epoch": 0.0776368805707209, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 8902 + }, + { + "epoch": 0.07764560185588948, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 8903 + }, + { + "epoch": 0.07765432314105807, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8904 + }, + { + "epoch": 0.07766304442622665, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 8905 + }, + { + "epoch": 0.07767176571139524, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 8906 + }, + { + "epoch": 0.07768048699656381, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 8907 + }, + { + "epoch": 0.0776892082817324, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8908 + }, + { + "epoch": 0.07769792956690098, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 8909 + }, + { + "epoch": 0.07770665085206956, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8910 + }, + { + "epoch": 0.07771537213723814, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 8911 + }, + { + "epoch": 0.07772409342240673, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8912 + }, + { + "epoch": 0.07773281470757532, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 8913 + }, + { + "epoch": 0.07774153599274389, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 8914 + }, + { + "epoch": 0.07775025727791247, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 8915 + }, + { + "epoch": 0.07775897856308106, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 8916 + }, + { + "epoch": 0.07776769984824963, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 8917 + }, + { + "epoch": 0.07777642113341822, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 8918 + }, + { + "epoch": 0.0777851424185868, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 8919 + }, + { + "epoch": 0.07779386370375539, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8920 + }, + { + "epoch": 0.07780258498892396, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 8921 + }, + { + "epoch": 0.07781130627409255, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 8922 + }, + { + "epoch": 0.07782002755926114, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 8923 + }, + { + "epoch": 0.07782874884442971, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 8924 + }, + { + "epoch": 0.0778374701295983, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8925 + }, + { + "epoch": 0.07784619141476688, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 8926 + }, + { + "epoch": 0.07785491269993547, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8927 + }, + { + "epoch": 0.07786363398510404, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 8928 + }, + { + "epoch": 0.07787235527027263, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8929 + }, + { + "epoch": 0.07788107655544121, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8930 + }, + { + "epoch": 0.07788979784060979, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 8931 + }, + { + "epoch": 0.07789851912577837, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 8932 + }, + { + "epoch": 0.07790724041094696, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8933 + }, + { + "epoch": 0.07791596169611555, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8934 + }, + { + "epoch": 0.07792468298128412, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8935 + }, + { + "epoch": 0.0779334042664527, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 8936 + }, + { + "epoch": 0.07794212555162129, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8937 + }, + { + "epoch": 0.07795084683678986, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 8938 + }, + { + "epoch": 0.07795956812195845, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 8939 + }, + { + "epoch": 0.07796828940712704, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 8940 + }, + { + "epoch": 0.07797701069229562, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 8941 + }, + { + "epoch": 0.0779857319774642, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8942 + }, + { + "epoch": 0.07799445326263278, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 8943 + }, + { + "epoch": 0.07800317454780137, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 8944 + }, + { + "epoch": 0.07801189583296994, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 8945 + }, + { + "epoch": 0.07802061711813853, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8946 + }, + { + "epoch": 0.07802933840330711, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 8947 + }, + { + "epoch": 0.0780380596884757, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 8948 + }, + { + "epoch": 0.07804678097364427, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 8949 + }, + { + "epoch": 0.07805550225881286, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 8950 + }, + { + "epoch": 0.07806422354398145, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 8951 + }, + { + "epoch": 0.07807294482915002, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 8952 + }, + { + "epoch": 0.0780816661143186, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 8953 + }, + { + "epoch": 0.07809038739948719, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 8954 + }, + { + "epoch": 0.07809910868465578, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8955 + }, + { + "epoch": 0.07810782996982435, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 8956 + }, + { + "epoch": 0.07811655125499294, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 8957 + }, + { + "epoch": 0.07812527254016152, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 8958 + }, + { + "epoch": 0.0781339938253301, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 8959 + }, + { + "epoch": 0.07814271511049868, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 8960 + }, + { + "epoch": 0.07815143639566727, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 8961 + }, + { + "epoch": 0.07816015768083585, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 8962 + }, + { + "epoch": 0.07816887896600443, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 8963 + }, + { + "epoch": 0.07817760025117301, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8964 + }, + { + "epoch": 0.0781863215363416, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 8965 + }, + { + "epoch": 0.07819504282151017, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 8966 + }, + { + "epoch": 0.07820376410667876, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 8967 + }, + { + "epoch": 0.07821248539184734, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 8968 + }, + { + "epoch": 0.07822120667701593, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8969 + }, + { + "epoch": 0.0782299279621845, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8970 + }, + { + "epoch": 0.07823864924735309, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 8971 + }, + { + "epoch": 0.07824737053252168, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8972 + }, + { + "epoch": 0.07825609181769025, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 8973 + }, + { + "epoch": 0.07826481310285884, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 8974 + }, + { + "epoch": 0.07827353438802742, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 8975 + }, + { + "epoch": 0.07828225567319601, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 8976 + }, + { + "epoch": 0.07829097695836458, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 8977 + }, + { + "epoch": 0.07829969824353317, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 8978 + }, + { + "epoch": 0.07830841952870175, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8979 + }, + { + "epoch": 0.07831714081387033, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8980 + }, + { + "epoch": 0.07832586209903891, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 8981 + }, + { + "epoch": 0.0783345833842075, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 8982 + }, + { + "epoch": 0.07834330466937608, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 8983 + }, + { + "epoch": 0.07835202595454466, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8984 + }, + { + "epoch": 0.07836074723971324, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 8985 + }, + { + "epoch": 0.07836946852488183, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8986 + }, + { + "epoch": 0.0783781898100504, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 8987 + }, + { + "epoch": 0.07838691109521899, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 8988 + }, + { + "epoch": 0.07839563238038758, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 8989 + }, + { + "epoch": 0.07840435366555616, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8990 + }, + { + "epoch": 0.07841307495072473, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 8991 + }, + { + "epoch": 0.07842179623589332, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 8992 + }, + { + "epoch": 0.07843051752106191, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 8993 + }, + { + "epoch": 0.07843923880623048, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8994 + }, + { + "epoch": 0.07844796009139907, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 8995 + }, + { + "epoch": 0.07845668137656765, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 8996 + }, + { + "epoch": 0.07846540266173624, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 8997 + }, + { + "epoch": 0.07847412394690481, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8998 + }, + { + "epoch": 0.0784828452320734, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 8999 + }, + { + "epoch": 0.07849156651724198, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 9000 + }, + { + "epoch": 0.07850028780241056, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 9001 + }, + { + "epoch": 0.07850900908757914, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 9002 + }, + { + "epoch": 0.07851773037274773, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 9003 + }, + { + "epoch": 0.07852645165791632, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 9004 + }, + { + "epoch": 0.07853517294308489, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 9005 + }, + { + "epoch": 0.07854389422825347, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 9006 + }, + { + "epoch": 0.07855261551342206, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 9007 + }, + { + "epoch": 0.07856133679859063, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 9008 + }, + { + "epoch": 0.07857005808375922, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 9009 + }, + { + "epoch": 0.0785787793689278, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 9010 + }, + { + "epoch": 0.07858750065409639, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 9011 + }, + { + "epoch": 0.07859622193926497, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 9012 + }, + { + "epoch": 0.07860494322443355, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 9013 + }, + { + "epoch": 0.07861366450960214, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 9014 + }, + { + "epoch": 0.07862238579477071, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 9015 + }, + { + "epoch": 0.0786311070799393, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 9016 + }, + { + "epoch": 0.07863982836510788, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 9017 + }, + { + "epoch": 0.07864854965027647, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 9018 + }, + { + "epoch": 0.07865727093544504, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 9019 + }, + { + "epoch": 0.07866599222061363, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 9020 + }, + { + "epoch": 0.07867471350578222, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 9021 + }, + { + "epoch": 0.0786834347909508, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 9022 + }, + { + "epoch": 0.07869215607611937, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 9023 + }, + { + "epoch": 0.07870087736128796, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 9024 + }, + { + "epoch": 0.07870959864645655, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9025 + }, + { + "epoch": 0.07871831993162512, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 9026 + }, + { + "epoch": 0.0787270412167937, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 9027 + }, + { + "epoch": 0.07873576250196229, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 9028 + }, + { + "epoch": 0.07874448378713088, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 9029 + }, + { + "epoch": 0.07875320507229945, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 9030 + }, + { + "epoch": 0.07876192635746804, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 9031 + }, + { + "epoch": 0.07877064764263662, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 9032 + }, + { + "epoch": 0.0787793689278052, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 9033 + }, + { + "epoch": 0.07878809021297378, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 9034 + }, + { + "epoch": 0.07879681149814237, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 9035 + }, + { + "epoch": 0.07880553278331096, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9036 + }, + { + "epoch": 0.07881425406847953, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 9037 + }, + { + "epoch": 0.07882297535364811, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 9038 + }, + { + "epoch": 0.0788316966388167, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 9039 + }, + { + "epoch": 0.07884041792398527, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 9040 + }, + { + "epoch": 0.07884913920915386, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 9041 + }, + { + "epoch": 0.07885786049432245, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 9042 + }, + { + "epoch": 0.07886658177949103, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 9043 + }, + { + "epoch": 0.0788753030646596, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 9044 + }, + { + "epoch": 0.07888402434982819, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 9045 + }, + { + "epoch": 0.07889274563499678, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 9046 + }, + { + "epoch": 0.07890146692016535, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9047 + }, + { + "epoch": 0.07891018820533394, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 9048 + }, + { + "epoch": 0.07891890949050252, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9049 + }, + { + "epoch": 0.07892763077567111, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 9050 + }, + { + "epoch": 0.07893635206083968, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 9051 + }, + { + "epoch": 0.07894507334600827, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 9052 + }, + { + "epoch": 0.07895379463117685, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 9053 + }, + { + "epoch": 0.07896251591634543, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 9054 + }, + { + "epoch": 0.07897123720151401, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 9055 + }, + { + "epoch": 0.0789799584866826, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 9056 + }, + { + "epoch": 0.07898867977185119, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9057 + }, + { + "epoch": 0.07899740105701976, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 9058 + }, + { + "epoch": 0.07900612234218835, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 9059 + }, + { + "epoch": 0.07901484362735693, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9060 + }, + { + "epoch": 0.0790235649125255, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 9061 + }, + { + "epoch": 0.07903228619769409, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 9062 + }, + { + "epoch": 0.07904100748286268, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 9063 + }, + { + "epoch": 0.07904972876803126, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9064 + }, + { + "epoch": 0.07905845005319984, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 9065 + }, + { + "epoch": 0.07906717133836842, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 9066 + }, + { + "epoch": 0.07907589262353701, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 9067 + }, + { + "epoch": 0.07908461390870558, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 9068 + }, + { + "epoch": 0.07909333519387417, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 9069 + }, + { + "epoch": 0.07910205647904275, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 9070 + }, + { + "epoch": 0.07911077776421134, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 9071 + }, + { + "epoch": 0.07911949904937991, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 9072 + }, + { + "epoch": 0.0791282203345485, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 9073 + }, + { + "epoch": 0.07913694161971709, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 9074 + }, + { + "epoch": 0.07914566290488566, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9075 + }, + { + "epoch": 0.07915438419005424, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 9076 + }, + { + "epoch": 0.07916310547522283, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 9077 + }, + { + "epoch": 0.07917182676039142, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9078 + }, + { + "epoch": 0.07918054804555999, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9079 + }, + { + "epoch": 0.07918926933072858, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9080 + }, + { + "epoch": 0.07919799061589716, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 9081 + }, + { + "epoch": 0.07920671190106574, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 9082 + }, + { + "epoch": 0.07921543318623432, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 9083 + }, + { + "epoch": 0.07922415447140291, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9084 + }, + { + "epoch": 0.0792328757565715, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9085 + }, + { + "epoch": 0.07924159704174007, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 9086 + }, + { + "epoch": 0.07925031832690865, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 9087 + }, + { + "epoch": 0.07925903961207724, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 9088 + }, + { + "epoch": 0.07926776089724581, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 9089 + }, + { + "epoch": 0.0792764821824144, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 9090 + }, + { + "epoch": 0.07928520346758298, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9091 + }, + { + "epoch": 0.07929392475275157, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 9092 + }, + { + "epoch": 0.07930264603792014, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 9093 + }, + { + "epoch": 0.07931136732308873, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 9094 + }, + { + "epoch": 0.07932008860825732, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 9095 + }, + { + "epoch": 0.07932880989342589, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 9096 + }, + { + "epoch": 0.07933753117859448, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 9097 + }, + { + "epoch": 0.07934625246376306, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 9098 + }, + { + "epoch": 0.07935497374893165, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 9099 + }, + { + "epoch": 0.07936369503410022, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 9100 + }, + { + "epoch": 0.07937241631926881, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 9101 + }, + { + "epoch": 0.0793811376044374, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9102 + }, + { + "epoch": 0.07938985888960597, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 9103 + }, + { + "epoch": 0.07939858017477455, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 9104 + }, + { + "epoch": 0.07940730145994314, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 9105 + }, + { + "epoch": 0.07941602274511173, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 9106 + }, + { + "epoch": 0.0794247440302803, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9107 + }, + { + "epoch": 0.07943346531544888, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 9108 + }, + { + "epoch": 0.07944218660061747, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 9109 + }, + { + "epoch": 0.07945090788578604, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 9110 + }, + { + "epoch": 0.07945962917095463, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 9111 + }, + { + "epoch": 0.07946835045612322, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 9112 + }, + { + "epoch": 0.0794770717412918, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9113 + }, + { + "epoch": 0.07948579302646037, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 9114 + }, + { + "epoch": 0.07949451431162896, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 9115 + }, + { + "epoch": 0.07950323559679755, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 9116 + }, + { + "epoch": 0.07951195688196612, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 9117 + }, + { + "epoch": 0.0795206781671347, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 9118 + }, + { + "epoch": 0.07952939945230329, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9119 + }, + { + "epoch": 0.07953812073747188, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 9120 + }, + { + "epoch": 0.07954684202264045, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 9121 + }, + { + "epoch": 0.07955556330780904, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 9122 + }, + { + "epoch": 0.07956428459297762, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9123 + }, + { + "epoch": 0.0795730058781462, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 9124 + }, + { + "epoch": 0.07958172716331478, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 9125 + }, + { + "epoch": 0.07959044844848337, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 9126 + }, + { + "epoch": 0.07959916973365196, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 9127 + }, + { + "epoch": 0.07960789101882053, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 9128 + }, + { + "epoch": 0.07961661230398911, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 9129 + }, + { + "epoch": 0.0796253335891577, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 9130 + }, + { + "epoch": 0.07963405487432627, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9131 + }, + { + "epoch": 0.07964277615949486, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 9132 + }, + { + "epoch": 0.07965149744466345, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9133 + }, + { + "epoch": 0.07966021872983203, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 9134 + }, + { + "epoch": 0.0796689400150006, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 9135 + }, + { + "epoch": 0.07967766130016919, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 9136 + }, + { + "epoch": 0.07968638258533778, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9137 + }, + { + "epoch": 0.07969510387050636, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 9138 + }, + { + "epoch": 0.07970382515567494, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 9139 + }, + { + "epoch": 0.07971254644084352, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 9140 + }, + { + "epoch": 0.07972126772601211, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 9141 + }, + { + "epoch": 0.07972998901118068, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 9142 + }, + { + "epoch": 0.07973871029634927, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 9143 + }, + { + "epoch": 0.07974743158151786, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9144 + }, + { + "epoch": 0.07975615286668644, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 9145 + }, + { + "epoch": 0.07976487415185501, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9146 + }, + { + "epoch": 0.0797735954370236, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 9147 + }, + { + "epoch": 0.07978231672219219, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9148 + }, + { + "epoch": 0.07979103800736076, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 9149 + }, + { + "epoch": 0.07979975929252935, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 9150 + }, + { + "epoch": 0.07980848057769793, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 9151 + }, + { + "epoch": 0.07981720186286652, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 9152 + }, + { + "epoch": 0.07982592314803509, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 9153 + }, + { + "epoch": 0.07983464443320368, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 9154 + }, + { + "epoch": 0.07984336571837226, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9155 + }, + { + "epoch": 0.07985208700354084, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9156 + }, + { + "epoch": 0.07986080828870942, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 9157 + }, + { + "epoch": 0.07986952957387801, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 9158 + }, + { + "epoch": 0.0798782508590466, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 9159 + }, + { + "epoch": 0.07988697214421517, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 9160 + }, + { + "epoch": 0.07989569342938375, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 9161 + }, + { + "epoch": 0.07990441471455234, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 9162 + }, + { + "epoch": 0.07991313599972091, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 9163 + }, + { + "epoch": 0.0799218572848895, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 9164 + }, + { + "epoch": 0.07993057857005809, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 9165 + }, + { + "epoch": 0.07993929985522667, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 9166 + }, + { + "epoch": 0.07994802114039525, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 9167 + }, + { + "epoch": 0.07995674242556383, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 9168 + }, + { + "epoch": 0.07996546371073242, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 9169 + }, + { + "epoch": 0.07997418499590099, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9170 + }, + { + "epoch": 0.07998290628106958, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 9171 + }, + { + "epoch": 0.07999162756623816, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 9172 + }, + { + "epoch": 0.08000034885140675, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 9173 + }, + { + "epoch": 0.08000907013657532, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 9174 + }, + { + "epoch": 0.08001779142174391, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 9175 + }, + { + "epoch": 0.0800265127069125, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 9176 + }, + { + "epoch": 0.08003523399208107, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 9177 + }, + { + "epoch": 0.08004395527724965, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 9178 + }, + { + "epoch": 0.08005267656241824, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 9179 + }, + { + "epoch": 0.08006139784758683, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 9180 + }, + { + "epoch": 0.0800701191327554, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 9181 + }, + { + "epoch": 0.08007884041792399, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 9182 + }, + { + "epoch": 0.08008756170309257, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 9183 + }, + { + "epoch": 0.08009628298826114, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 9184 + }, + { + "epoch": 0.08010500427342973, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9185 + }, + { + "epoch": 0.08011372555859832, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 9186 + }, + { + "epoch": 0.0801224468437669, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 9187 + }, + { + "epoch": 0.08013116812893548, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 9188 + }, + { + "epoch": 0.08013988941410406, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 9189 + }, + { + "epoch": 0.08014861069927265, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9190 + }, + { + "epoch": 0.08015733198444122, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 9191 + }, + { + "epoch": 0.08016605326960981, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 9192 + }, + { + "epoch": 0.0801747745547784, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 9193 + }, + { + "epoch": 0.08018349583994698, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9194 + }, + { + "epoch": 0.08019221712511555, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 9195 + }, + { + "epoch": 0.08020093841028414, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 9196 + }, + { + "epoch": 0.08020965969545273, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9197 + }, + { + "epoch": 0.0802183809806213, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 9198 + }, + { + "epoch": 0.08022710226578988, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 9199 + }, + { + "epoch": 0.08023582355095847, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 9200 + }, + { + "epoch": 0.08024454483612706, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 9201 + }, + { + "epoch": 0.08025326612129563, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9202 + }, + { + "epoch": 0.08026198740646422, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 9203 + }, + { + "epoch": 0.0802707086916328, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 9204 + }, + { + "epoch": 0.08027942997680138, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 9205 + }, + { + "epoch": 0.08028815126196996, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 9206 + }, + { + "epoch": 0.08029687254713855, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 9207 + }, + { + "epoch": 0.08030559383230713, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 9208 + }, + { + "epoch": 0.08031431511747571, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9209 + }, + { + "epoch": 0.0803230364026443, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9210 + }, + { + "epoch": 0.08033175768781288, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 9211 + }, + { + "epoch": 0.08034047897298145, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 9212 + }, + { + "epoch": 0.08034920025815004, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 9213 + }, + { + "epoch": 0.08035792154331863, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 9214 + }, + { + "epoch": 0.08036664282848721, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 9215 + }, + { + "epoch": 0.08037536411365578, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 9216 + }, + { + "epoch": 0.08038408539882437, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 9217 + }, + { + "epoch": 0.08039280668399296, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 9218 + }, + { + "epoch": 0.08040152796916153, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 9219 + }, + { + "epoch": 0.08041024925433012, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 9220 + }, + { + "epoch": 0.0804189705394987, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 9221 + }, + { + "epoch": 0.08042769182466729, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 9222 + }, + { + "epoch": 0.08043641310983586, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 9223 + }, + { + "epoch": 0.08044513439500445, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 9224 + }, + { + "epoch": 0.08045385568017303, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 9225 + }, + { + "epoch": 0.0804625769653416, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 9226 + }, + { + "epoch": 0.08047129825051019, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 9227 + }, + { + "epoch": 0.08048001953567878, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 9228 + }, + { + "epoch": 0.08048874082084737, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9229 + }, + { + "epoch": 0.08049746210601594, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 9230 + }, + { + "epoch": 0.08050618339118452, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 9231 + }, + { + "epoch": 0.08051490467635311, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 9232 + }, + { + "epoch": 0.08052362596152168, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 9233 + }, + { + "epoch": 0.08053234724669027, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 9234 + }, + { + "epoch": 0.08054106853185886, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 9235 + }, + { + "epoch": 0.08054978981702744, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 9236 + }, + { + "epoch": 0.08055851110219601, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 9237 + }, + { + "epoch": 0.0805672323873646, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 9238 + }, + { + "epoch": 0.08057595367253319, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 9239 + }, + { + "epoch": 0.08058467495770176, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 9240 + }, + { + "epoch": 0.08059339624287035, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 9241 + }, + { + "epoch": 0.08060211752803893, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 9242 + }, + { + "epoch": 0.08061083881320752, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 9243 + }, + { + "epoch": 0.08061956009837609, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 9244 + }, + { + "epoch": 0.08062828138354468, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 9245 + }, + { + "epoch": 0.08063700266871326, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 9246 + }, + { + "epoch": 0.08064572395388184, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 9247 + }, + { + "epoch": 0.08065444523905042, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 9248 + }, + { + "epoch": 0.08066316652421901, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9249 + }, + { + "epoch": 0.0806718878093876, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 9250 + }, + { + "epoch": 0.08068060909455617, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9251 + }, + { + "epoch": 0.08068933037972476, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 9252 + }, + { + "epoch": 0.08069805166489334, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9253 + }, + { + "epoch": 0.08070677295006193, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 9254 + }, + { + "epoch": 0.0807154942352305, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 9255 + }, + { + "epoch": 0.08072421552039909, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 9256 + }, + { + "epoch": 0.08073293680556767, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 9257 + }, + { + "epoch": 0.08074165809073625, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9258 + }, + { + "epoch": 0.08075037937590483, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 9259 + }, + { + "epoch": 0.08075910066107342, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 9260 + }, + { + "epoch": 0.080767821946242, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9261 + }, + { + "epoch": 0.08077654323141058, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 9262 + }, + { + "epoch": 0.08078526451657916, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 9263 + }, + { + "epoch": 0.08079398580174775, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 9264 + }, + { + "epoch": 0.08080270708691632, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 9265 + }, + { + "epoch": 0.08081142837208491, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9266 + }, + { + "epoch": 0.0808201496572535, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 9267 + }, + { + "epoch": 0.08082887094242208, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 9268 + }, + { + "epoch": 0.08083759222759065, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 9269 + }, + { + "epoch": 0.08084631351275924, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 9270 + }, + { + "epoch": 0.08085503479792783, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9271 + }, + { + "epoch": 0.0808637560830964, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9272 + }, + { + "epoch": 0.08087247736826499, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9273 + }, + { + "epoch": 0.08088119865343357, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 9274 + }, + { + "epoch": 0.08088991993860216, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 9275 + }, + { + "epoch": 0.08089864122377073, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 9276 + }, + { + "epoch": 0.08090736250893932, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 9277 + }, + { + "epoch": 0.0809160837941079, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 9278 + }, + { + "epoch": 0.08092480507927648, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 9279 + }, + { + "epoch": 0.08093352636444506, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 9280 + }, + { + "epoch": 0.08094224764961365, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 9281 + }, + { + "epoch": 0.08095096893478224, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 9282 + }, + { + "epoch": 0.08095969021995081, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 9283 + }, + { + "epoch": 0.0809684115051194, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 9284 + }, + { + "epoch": 0.08097713279028798, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 9285 + }, + { + "epoch": 0.08098585407545655, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 9286 + }, + { + "epoch": 0.08099457536062514, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9287 + }, + { + "epoch": 0.08100329664579373, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 9288 + }, + { + "epoch": 0.08101201793096231, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 9289 + }, + { + "epoch": 0.08102073921613089, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 9290 + }, + { + "epoch": 0.08102946050129947, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 9291 + }, + { + "epoch": 0.08103818178646806, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 9292 + }, + { + "epoch": 0.08104690307163663, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 9293 + }, + { + "epoch": 0.08105562435680522, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 9294 + }, + { + "epoch": 0.0810643456419738, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 9295 + }, + { + "epoch": 0.08107306692714239, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 9296 + }, + { + "epoch": 0.08108178821231096, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 9297 + }, + { + "epoch": 0.08109050949747955, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 9298 + }, + { + "epoch": 0.08109923078264814, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 9299 + }, + { + "epoch": 0.08110795206781671, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 9300 + }, + { + "epoch": 0.0811166733529853, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 9301 + }, + { + "epoch": 0.08112539463815388, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 9302 + }, + { + "epoch": 0.08113411592332247, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9303 + }, + { + "epoch": 0.08114283720849104, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 9304 + }, + { + "epoch": 0.08115155849365963, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 9305 + }, + { + "epoch": 0.08116027977882821, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 9306 + }, + { + "epoch": 0.08116900106399678, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 9307 + }, + { + "epoch": 0.08117772234916537, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 9308 + }, + { + "epoch": 0.08118644363433396, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 9309 + }, + { + "epoch": 0.08119516491950254, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 9310 + }, + { + "epoch": 0.08120388620467112, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 9311 + }, + { + "epoch": 0.0812126074898397, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 9312 + }, + { + "epoch": 0.08122132877500829, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 9313 + }, + { + "epoch": 0.08123005006017686, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 9314 + }, + { + "epoch": 0.08123877134534545, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 9315 + }, + { + "epoch": 0.08124749263051403, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 9316 + }, + { + "epoch": 0.08125621391568262, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 9317 + }, + { + "epoch": 0.0812649352008512, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 9318 + }, + { + "epoch": 0.08127365648601978, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 9319 + }, + { + "epoch": 0.08128237777118837, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 9320 + }, + { + "epoch": 0.08129109905635694, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 9321 + }, + { + "epoch": 0.08129982034152552, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 9322 + }, + { + "epoch": 0.08130854162669411, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 9323 + }, + { + "epoch": 0.0813172629118627, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 9324 + }, + { + "epoch": 0.08132598419703127, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 9325 + }, + { + "epoch": 0.08133470548219986, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 9326 + }, + { + "epoch": 0.08134342676736844, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 9327 + }, + { + "epoch": 0.08135214805253702, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9328 + }, + { + "epoch": 0.0813608693377056, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 9329 + }, + { + "epoch": 0.08136959062287419, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 9330 + }, + { + "epoch": 0.08137831190804277, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 9331 + }, + { + "epoch": 0.08138703319321135, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 9332 + }, + { + "epoch": 0.08139575447837993, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 9333 + }, + { + "epoch": 0.08140447576354852, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 9334 + }, + { + "epoch": 0.08141319704871709, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 9335 + }, + { + "epoch": 0.08142191833388568, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 9336 + }, + { + "epoch": 0.08143063961905427, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 9337 + }, + { + "epoch": 0.08143936090422285, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9338 + }, + { + "epoch": 0.08144808218939142, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 9339 + }, + { + "epoch": 0.08145680347456001, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 9340 + }, + { + "epoch": 0.0814655247597286, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 9341 + }, + { + "epoch": 0.08147424604489717, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9342 + }, + { + "epoch": 0.08148296733006576, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 9343 + }, + { + "epoch": 0.08149168861523434, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 9344 + }, + { + "epoch": 0.08150040990040293, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 9345 + }, + { + "epoch": 0.0815091311855715, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 9346 + }, + { + "epoch": 0.08151785247074009, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 9347 + }, + { + "epoch": 0.08152657375590867, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 9348 + }, + { + "epoch": 0.08153529504107725, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9349 + }, + { + "epoch": 0.08154401632624583, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 9350 + }, + { + "epoch": 0.08155273761141442, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 9351 + }, + { + "epoch": 0.081561458896583, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 9352 + }, + { + "epoch": 0.08157018018175158, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 9353 + }, + { + "epoch": 0.08157890146692016, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 9354 + }, + { + "epoch": 0.08158762275208875, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 9355 + }, + { + "epoch": 0.08159634403725732, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 9356 + }, + { + "epoch": 0.08160506532242591, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9357 + }, + { + "epoch": 0.0816137866075945, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 9358 + }, + { + "epoch": 0.08162250789276308, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 9359 + }, + { + "epoch": 0.08163122917793166, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9360 + }, + { + "epoch": 0.08163995046310024, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9361 + }, + { + "epoch": 0.08164867174826883, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 9362 + }, + { + "epoch": 0.0816573930334374, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 9363 + }, + { + "epoch": 0.08166611431860599, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 9364 + }, + { + "epoch": 0.08167483560377457, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 9365 + }, + { + "epoch": 0.08168355688894316, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 9366 + }, + { + "epoch": 0.08169227817411173, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 9367 + }, + { + "epoch": 0.08170099945928032, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 9368 + }, + { + "epoch": 0.0817097207444489, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 9369 + }, + { + "epoch": 0.08171844202961749, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9370 + }, + { + "epoch": 0.08172716331478606, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 9371 + }, + { + "epoch": 0.08173588459995465, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9372 + }, + { + "epoch": 0.08174460588512324, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 9373 + }, + { + "epoch": 0.08175332717029181, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 9374 + }, + { + "epoch": 0.0817620484554604, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 9375 + }, + { + "epoch": 0.08177076974062898, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 9376 + }, + { + "epoch": 0.08177949102579757, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 9377 + }, + { + "epoch": 0.08178821231096614, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 9378 + }, + { + "epoch": 0.08179693359613473, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 9379 + }, + { + "epoch": 0.08180565488130331, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 9380 + }, + { + "epoch": 0.08181437616647189, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 9381 + }, + { + "epoch": 0.08182309745164047, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 9382 + }, + { + "epoch": 0.08183181873680906, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 9383 + }, + { + "epoch": 0.08184054002197765, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 9384 + }, + { + "epoch": 0.08184926130714622, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 9385 + }, + { + "epoch": 0.0818579825923148, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 9386 + }, + { + "epoch": 0.08186670387748339, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 9387 + }, + { + "epoch": 0.08187542516265196, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 9388 + }, + { + "epoch": 0.08188414644782055, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 9389 + }, + { + "epoch": 0.08189286773298914, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 9390 + }, + { + "epoch": 0.08190158901815772, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 9391 + }, + { + "epoch": 0.0819103103033263, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 9392 + }, + { + "epoch": 0.08191903158849488, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9393 + }, + { + "epoch": 0.08192775287366347, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 9394 + }, + { + "epoch": 0.08193647415883204, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 9395 + }, + { + "epoch": 0.08194519544400063, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 9396 + }, + { + "epoch": 0.08195391672916921, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 9397 + }, + { + "epoch": 0.0819626380143378, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 9398 + }, + { + "epoch": 0.08197135929950637, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 9399 + }, + { + "epoch": 0.08198008058467496, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 9400 + }, + { + "epoch": 0.08198880186984354, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 9401 + }, + { + "epoch": 0.08199752315501212, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 9402 + }, + { + "epoch": 0.0820062444401807, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 9403 + }, + { + "epoch": 0.08201496572534929, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 9404 + }, + { + "epoch": 0.08202368701051788, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 9405 + }, + { + "epoch": 0.08203240829568645, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 9406 + }, + { + "epoch": 0.08204112958085504, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 9407 + }, + { + "epoch": 0.08204985086602362, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 9408 + }, + { + "epoch": 0.0820585721511922, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 9409 + }, + { + "epoch": 0.08206729343636078, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 9410 + }, + { + "epoch": 0.08207601472152937, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 9411 + }, + { + "epoch": 0.08208473600669795, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 9412 + }, + { + "epoch": 0.08209345729186653, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 9413 + }, + { + "epoch": 0.08210217857703511, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9414 + }, + { + "epoch": 0.0821108998622037, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 9415 + }, + { + "epoch": 0.08211962114737227, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 9416 + }, + { + "epoch": 0.08212834243254086, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 9417 + }, + { + "epoch": 0.08213706371770944, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 9418 + }, + { + "epoch": 0.08214578500287803, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 9419 + }, + { + "epoch": 0.0821545062880466, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 9420 + }, + { + "epoch": 0.08216322757321519, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 9421 + }, + { + "epoch": 0.08217194885838378, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 9422 + }, + { + "epoch": 0.08218067014355235, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 9423 + }, + { + "epoch": 0.08218939142872093, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9424 + }, + { + "epoch": 0.08219811271388952, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 9425 + }, + { + "epoch": 0.08220683399905811, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 9426 + }, + { + "epoch": 0.08221555528422668, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 9427 + }, + { + "epoch": 0.08222427656939527, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 9428 + }, + { + "epoch": 0.08223299785456385, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 9429 + }, + { + "epoch": 0.08224171913973242, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 9430 + }, + { + "epoch": 0.08225044042490101, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 9431 + }, + { + "epoch": 0.0822591617100696, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 9432 + }, + { + "epoch": 0.08226788299523818, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 9433 + }, + { + "epoch": 0.08227660428040676, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 9434 + }, + { + "epoch": 0.08228532556557534, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9435 + }, + { + "epoch": 0.08229404685074393, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 9436 + }, + { + "epoch": 0.0823027681359125, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9437 + }, + { + "epoch": 0.08231148942108109, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9438 + }, + { + "epoch": 0.08232021070624967, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 9439 + }, + { + "epoch": 0.08232893199141826, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 9440 + }, + { + "epoch": 0.08233765327658683, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 9441 + }, + { + "epoch": 0.08234637456175542, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 9442 + }, + { + "epoch": 0.082355095846924, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 9443 + }, + { + "epoch": 0.08236381713209258, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 9444 + }, + { + "epoch": 0.08237253841726117, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 9445 + }, + { + "epoch": 0.08238125970242975, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 9446 + }, + { + "epoch": 0.08238998098759834, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9447 + }, + { + "epoch": 0.08239870227276691, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 9448 + }, + { + "epoch": 0.0824074235579355, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 9449 + }, + { + "epoch": 0.08241614484310408, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 9450 + }, + { + "epoch": 0.08242486612827266, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 9451 + }, + { + "epoch": 0.08243358741344124, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 9452 + }, + { + "epoch": 0.08244230869860983, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 9453 + }, + { + "epoch": 0.08245102998377841, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 9454 + }, + { + "epoch": 0.08245975126894699, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 9455 + }, + { + "epoch": 0.08246847255411557, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 9456 + }, + { + "epoch": 0.08247719383928416, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 9457 + }, + { + "epoch": 0.08248591512445273, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 9458 + }, + { + "epoch": 0.08249463640962132, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9459 + }, + { + "epoch": 0.0825033576947899, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 9460 + }, + { + "epoch": 0.08251207897995849, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9461 + }, + { + "epoch": 0.08252080026512706, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 9462 + }, + { + "epoch": 0.08252952155029565, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 9463 + }, + { + "epoch": 0.08253824283546424, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 9464 + }, + { + "epoch": 0.08254696412063281, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 9465 + }, + { + "epoch": 0.0825556854058014, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 9466 + }, + { + "epoch": 0.08256440669096998, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9467 + }, + { + "epoch": 0.08257312797613857, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 9468 + }, + { + "epoch": 0.08258184926130714, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 9469 + }, + { + "epoch": 0.08259057054647573, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 9470 + }, + { + "epoch": 0.08259929183164431, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 9471 + }, + { + "epoch": 0.08260801311681289, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 9472 + }, + { + "epoch": 0.08261673440198147, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9473 + }, + { + "epoch": 0.08262545568715006, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 9474 + }, + { + "epoch": 0.08263417697231865, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 9475 + }, + { + "epoch": 0.08264289825748722, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 9476 + }, + { + "epoch": 0.0826516195426558, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 9477 + }, + { + "epoch": 0.08266034082782439, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 9478 + }, + { + "epoch": 0.08266906211299298, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9479 + }, + { + "epoch": 0.08267778339816155, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 9480 + }, + { + "epoch": 0.08268650468333014, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 9481 + }, + { + "epoch": 0.08269522596849872, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 9482 + }, + { + "epoch": 0.0827039472536673, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 9483 + }, + { + "epoch": 0.08271266853883588, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9484 + }, + { + "epoch": 0.08272138982400447, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 9485 + }, + { + "epoch": 0.08273011110917305, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 9486 + }, + { + "epoch": 0.08273883239434163, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 9487 + }, + { + "epoch": 0.08274755367951021, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 9488 + }, + { + "epoch": 0.0827562749646788, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 9489 + }, + { + "epoch": 0.08276499624984737, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 9490 + }, + { + "epoch": 0.08277371753501596, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 9491 + }, + { + "epoch": 0.08278243882018455, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 9492 + }, + { + "epoch": 0.08279116010535313, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 9493 + }, + { + "epoch": 0.0827998813905217, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 9494 + }, + { + "epoch": 0.08280860267569029, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 9495 + }, + { + "epoch": 0.08281732396085888, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 9496 + }, + { + "epoch": 0.08282604524602745, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9497 + }, + { + "epoch": 0.08283476653119604, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 9498 + }, + { + "epoch": 0.08284348781636462, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9499 + }, + { + "epoch": 0.08285220910153321, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 9500 + }, + { + "epoch": 0.08286093038670178, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 9501 + }, + { + "epoch": 0.08286965167187037, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 9502 + }, + { + "epoch": 0.08287837295703895, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 9503 + }, + { + "epoch": 0.08288709424220753, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9504 + }, + { + "epoch": 0.08289581552737611, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 9505 + }, + { + "epoch": 0.0829045368125447, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9506 + }, + { + "epoch": 0.08291325809771329, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 9507 + }, + { + "epoch": 0.08292197938288186, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 9508 + }, + { + "epoch": 0.08293070066805044, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 9509 + }, + { + "epoch": 0.08293942195321903, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 9510 + }, + { + "epoch": 0.0829481432383876, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 9511 + }, + { + "epoch": 0.08295686452355619, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 9512 + }, + { + "epoch": 0.08296558580872478, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 9513 + }, + { + "epoch": 0.08297430709389336, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 9514 + }, + { + "epoch": 0.08298302837906193, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 9515 + }, + { + "epoch": 0.08299174966423052, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 9516 + }, + { + "epoch": 0.08300047094939911, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9517 + }, + { + "epoch": 0.08300919223456768, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 9518 + }, + { + "epoch": 0.08301791351973627, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 9519 + }, + { + "epoch": 0.08302663480490485, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 9520 + }, + { + "epoch": 0.08303535609007344, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 9521 + }, + { + "epoch": 0.08304407737524201, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 9522 + }, + { + "epoch": 0.0830527986604106, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 9523 + }, + { + "epoch": 0.08306151994557918, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 9524 + }, + { + "epoch": 0.08307024123074776, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9525 + }, + { + "epoch": 0.08307896251591634, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 9526 + }, + { + "epoch": 0.08308768380108493, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 9527 + }, + { + "epoch": 0.08309640508625352, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 9528 + }, + { + "epoch": 0.08310512637142209, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9529 + }, + { + "epoch": 0.08311384765659068, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 9530 + }, + { + "epoch": 0.08312256894175926, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 9531 + }, + { + "epoch": 0.08313129022692783, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 9532 + }, + { + "epoch": 0.08314001151209642, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 9533 + }, + { + "epoch": 0.08314873279726501, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 9534 + }, + { + "epoch": 0.0831574540824336, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 9535 + }, + { + "epoch": 0.08316617536760217, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 9536 + }, + { + "epoch": 0.08317489665277075, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 9537 + }, + { + "epoch": 0.08318361793793934, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 9538 + }, + { + "epoch": 0.08319233922310791, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 9539 + }, + { + "epoch": 0.0832010605082765, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 9540 + }, + { + "epoch": 0.08320978179344508, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9541 + }, + { + "epoch": 0.08321850307861367, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 9542 + }, + { + "epoch": 0.08322722436378224, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 9543 + }, + { + "epoch": 0.08323594564895083, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 9544 + }, + { + "epoch": 0.08324466693411942, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 9545 + }, + { + "epoch": 0.08325338821928799, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9546 + }, + { + "epoch": 0.08326210950445657, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 9547 + }, + { + "epoch": 0.08327083078962516, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 9548 + }, + { + "epoch": 0.08327955207479375, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 9549 + }, + { + "epoch": 0.08328827335996232, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 9550 + }, + { + "epoch": 0.0832969946451309, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 9551 + }, + { + "epoch": 0.08330571593029949, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 9552 + }, + { + "epoch": 0.08331443721546807, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 9553 + }, + { + "epoch": 0.08332315850063665, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 9554 + }, + { + "epoch": 0.08333187978580524, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9555 + }, + { + "epoch": 0.08334060107097382, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 9556 + }, + { + "epoch": 0.0833493223561424, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 9557 + }, + { + "epoch": 0.08335804364131098, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 9558 + }, + { + "epoch": 0.08336676492647957, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 9559 + }, + { + "epoch": 0.08337548621164814, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 9560 + }, + { + "epoch": 0.08338420749681673, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 9561 + }, + { + "epoch": 0.08339292878198531, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 9562 + }, + { + "epoch": 0.0834016500671539, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 9563 + }, + { + "epoch": 0.08341037135232247, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 9564 + }, + { + "epoch": 0.08341909263749106, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 9565 + }, + { + "epoch": 0.08342781392265965, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9566 + }, + { + "epoch": 0.08343653520782822, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9567 + }, + { + "epoch": 0.0834452564929968, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 9568 + }, + { + "epoch": 0.08345397777816539, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 9569 + }, + { + "epoch": 0.08346269906333398, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 9570 + }, + { + "epoch": 0.08347142034850255, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 9571 + }, + { + "epoch": 0.08348014163367114, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 9572 + }, + { + "epoch": 0.08348886291883972, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 9573 + }, + { + "epoch": 0.0834975842040083, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 9574 + }, + { + "epoch": 0.08350630548917688, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9575 + }, + { + "epoch": 0.08351502677434547, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 9576 + }, + { + "epoch": 0.08352374805951406, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 9577 + }, + { + "epoch": 0.08353246934468263, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 9578 + }, + { + "epoch": 0.08354119062985121, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 9579 + }, + { + "epoch": 0.0835499119150198, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 9580 + }, + { + "epoch": 0.08355863320018837, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 9581 + }, + { + "epoch": 0.08356735448535696, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9582 + }, + { + "epoch": 0.08357607577052555, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 9583 + }, + { + "epoch": 0.08358479705569413, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 9584 + }, + { + "epoch": 0.0835935183408627, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 9585 + }, + { + "epoch": 0.08360223962603129, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9586 + }, + { + "epoch": 0.08361096091119988, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 9587 + }, + { + "epoch": 0.08361968219636845, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 9588 + }, + { + "epoch": 0.08362840348153704, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 9589 + }, + { + "epoch": 0.08363712476670562, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 9590 + }, + { + "epoch": 0.08364584605187421, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 9591 + }, + { + "epoch": 0.08365456733704278, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 9592 + }, + { + "epoch": 0.08366328862221137, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9593 + }, + { + "epoch": 0.08367200990737995, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 9594 + }, + { + "epoch": 0.08368073119254854, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 9595 + }, + { + "epoch": 0.08368945247771711, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 9596 + }, + { + "epoch": 0.0836981737628857, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 9597 + }, + { + "epoch": 0.08370689504805429, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 9598 + }, + { + "epoch": 0.08371561633322286, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 9599 + }, + { + "epoch": 0.08372433761839145, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 9600 + }, + { + "epoch": 0.08373305890356003, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 9601 + }, + { + "epoch": 0.08374178018872862, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 9602 + }, + { + "epoch": 0.08375050147389719, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 9603 + }, + { + "epoch": 0.08375922275906578, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 9604 + }, + { + "epoch": 0.08376794404423436, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 9605 + }, + { + "epoch": 0.08377666532940294, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 9606 + }, + { + "epoch": 0.08378538661457152, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 9607 + }, + { + "epoch": 0.08379410789974011, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 9608 + }, + { + "epoch": 0.0838028291849087, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 9609 + }, + { + "epoch": 0.08381155047007727, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 9610 + }, + { + "epoch": 0.08382027175524585, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 9611 + }, + { + "epoch": 0.08382899304041444, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 9612 + }, + { + "epoch": 0.08383771432558301, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 9613 + }, + { + "epoch": 0.0838464356107516, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 9614 + }, + { + "epoch": 0.08385515689592019, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 9615 + }, + { + "epoch": 0.08386387818108877, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 9616 + }, + { + "epoch": 0.08387259946625734, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9617 + }, + { + "epoch": 0.08388132075142593, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 9618 + }, + { + "epoch": 0.08389004203659452, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 9619 + }, + { + "epoch": 0.08389876332176309, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 9620 + }, + { + "epoch": 0.08390748460693168, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9621 + }, + { + "epoch": 0.08391620589210026, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 9622 + }, + { + "epoch": 0.08392492717726885, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 9623 + }, + { + "epoch": 0.08393364846243742, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 9624 + }, + { + "epoch": 0.08394236974760601, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 9625 + }, + { + "epoch": 0.0839510910327746, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 9626 + }, + { + "epoch": 0.08395981231794317, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 9627 + }, + { + "epoch": 0.08396853360311175, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 9628 + }, + { + "epoch": 0.08397725488828034, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 9629 + }, + { + "epoch": 0.08398597617344893, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9630 + }, + { + "epoch": 0.0839946974586175, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 9631 + }, + { + "epoch": 0.08400341874378608, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 9632 + }, + { + "epoch": 0.08401214002895467, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9633 + }, + { + "epoch": 0.08402086131412324, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9634 + }, + { + "epoch": 0.08402958259929183, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 9635 + }, + { + "epoch": 0.08403830388446042, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 9636 + }, + { + "epoch": 0.084047025169629, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 9637 + }, + { + "epoch": 0.08405574645479758, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 9638 + }, + { + "epoch": 0.08406446773996616, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9639 + }, + { + "epoch": 0.08407318902513475, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 9640 + }, + { + "epoch": 0.08408191031030332, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 9641 + }, + { + "epoch": 0.0840906315954719, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 9642 + }, + { + "epoch": 0.0840993528806405, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 9643 + }, + { + "epoch": 0.08410807416580908, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9644 + }, + { + "epoch": 0.08411679545097765, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 9645 + }, + { + "epoch": 0.08412551673614624, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 9646 + }, + { + "epoch": 0.08413423802131482, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 9647 + }, + { + "epoch": 0.0841429593064834, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 9648 + }, + { + "epoch": 0.08415168059165198, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9649 + }, + { + "epoch": 0.08416040187682057, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9650 + }, + { + "epoch": 0.08416912316198916, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 9651 + }, + { + "epoch": 0.08417784444715773, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 9652 + }, + { + "epoch": 0.08418656573232632, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 9653 + }, + { + "epoch": 0.0841952870174949, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 9654 + }, + { + "epoch": 0.08420400830266347, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 9655 + }, + { + "epoch": 0.08421272958783206, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 9656 + }, + { + "epoch": 0.08422145087300065, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 9657 + }, + { + "epoch": 0.08423017215816923, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 9658 + }, + { + "epoch": 0.0842388934433378, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 9659 + }, + { + "epoch": 0.08424761472850639, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 9660 + }, + { + "epoch": 0.08425633601367498, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 9661 + }, + { + "epoch": 0.08426505729884355, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 9662 + }, + { + "epoch": 0.08427377858401214, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 9663 + }, + { + "epoch": 0.08428249986918072, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 9664 + }, + { + "epoch": 0.08429122115434931, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 9665 + }, + { + "epoch": 0.08429994243951788, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 9666 + }, + { + "epoch": 0.08430866372468647, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 9667 + }, + { + "epoch": 0.08431738500985506, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 9668 + }, + { + "epoch": 0.08432610629502363, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 9669 + }, + { + "epoch": 0.08433482758019221, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 9670 + }, + { + "epoch": 0.0843435488653608, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 9671 + }, + { + "epoch": 0.08435227015052939, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9672 + }, + { + "epoch": 0.08436099143569796, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 9673 + }, + { + "epoch": 0.08436971272086655, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 9674 + }, + { + "epoch": 0.08437843400603513, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 9675 + }, + { + "epoch": 0.0843871552912037, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 9676 + }, + { + "epoch": 0.08439587657637229, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 9677 + }, + { + "epoch": 0.08440459786154088, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9678 + }, + { + "epoch": 0.08441331914670946, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9679 + }, + { + "epoch": 0.08442204043187804, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 9680 + }, + { + "epoch": 0.08443076171704662, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9681 + }, + { + "epoch": 0.08443948300221521, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 9682 + }, + { + "epoch": 0.08444820428738378, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 9683 + }, + { + "epoch": 0.08445692557255237, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 9684 + }, + { + "epoch": 0.08446564685772096, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9685 + }, + { + "epoch": 0.08447436814288954, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9686 + }, + { + "epoch": 0.08448308942805811, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 9687 + }, + { + "epoch": 0.0844918107132267, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 9688 + }, + { + "epoch": 0.08450053199839529, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 9689 + }, + { + "epoch": 0.08450925328356386, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 9690 + }, + { + "epoch": 0.08451797456873245, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 9691 + }, + { + "epoch": 0.08452669585390103, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 9692 + }, + { + "epoch": 0.08453541713906962, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 9693 + }, + { + "epoch": 0.08454413842423819, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 9694 + }, + { + "epoch": 0.08455285970940678, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 9695 + }, + { + "epoch": 0.08456158099457536, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 9696 + }, + { + "epoch": 0.08457030227974394, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 9697 + }, + { + "epoch": 0.08457902356491252, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 9698 + }, + { + "epoch": 0.08458774485008111, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9699 + }, + { + "epoch": 0.0845964661352497, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9700 + }, + { + "epoch": 0.08460518742041827, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 9701 + }, + { + "epoch": 0.08461390870558685, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 9702 + }, + { + "epoch": 0.08462262999075544, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 9703 + }, + { + "epoch": 0.08463135127592401, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 9704 + }, + { + "epoch": 0.0846400725610926, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 9705 + }, + { + "epoch": 0.08464879384626119, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 9706 + }, + { + "epoch": 0.08465751513142977, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 9707 + }, + { + "epoch": 0.08466623641659834, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9708 + }, + { + "epoch": 0.08467495770176693, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 9709 + }, + { + "epoch": 0.08468367898693552, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 9710 + }, + { + "epoch": 0.0846924002721041, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 9711 + }, + { + "epoch": 0.08470112155727268, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 9712 + }, + { + "epoch": 0.08470984284244126, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 9713 + }, + { + "epoch": 0.08471856412760985, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 9714 + }, + { + "epoch": 0.08472728541277842, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9715 + }, + { + "epoch": 0.08473600669794701, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 9716 + }, + { + "epoch": 0.0847447279831156, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 9717 + }, + { + "epoch": 0.08475344926828418, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 9718 + }, + { + "epoch": 0.08476217055345275, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 9719 + }, + { + "epoch": 0.08477089183862134, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 9720 + }, + { + "epoch": 0.08477961312378993, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 9721 + }, + { + "epoch": 0.0847883344089585, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 9722 + }, + { + "epoch": 0.08479705569412709, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 9723 + }, + { + "epoch": 0.08480577697929567, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9724 + }, + { + "epoch": 0.08481449826446426, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 9725 + }, + { + "epoch": 0.08482321954963283, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 9726 + }, + { + "epoch": 0.08483194083480142, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 9727 + }, + { + "epoch": 0.08484066211997, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 9728 + }, + { + "epoch": 0.08484938340513858, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 9729 + }, + { + "epoch": 0.08485810469030716, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9730 + }, + { + "epoch": 0.08486682597547575, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 9731 + }, + { + "epoch": 0.08487554726064434, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 9732 + }, + { + "epoch": 0.08488426854581291, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 9733 + }, + { + "epoch": 0.0848929898309815, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 9734 + }, + { + "epoch": 0.08490171111615008, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 9735 + }, + { + "epoch": 0.08491043240131865, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 9736 + }, + { + "epoch": 0.08491915368648724, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9737 + }, + { + "epoch": 0.08492787497165583, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 9738 + }, + { + "epoch": 0.08493659625682441, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 9739 + }, + { + "epoch": 0.08494531754199298, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 9740 + }, + { + "epoch": 0.08495403882716157, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 9741 + }, + { + "epoch": 0.08496276011233016, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 9742 + }, + { + "epoch": 0.08497148139749873, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 9743 + }, + { + "epoch": 0.08498020268266732, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 9744 + }, + { + "epoch": 0.0849889239678359, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 9745 + }, + { + "epoch": 0.08499764525300449, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 9746 + }, + { + "epoch": 0.08500636653817306, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 9747 + }, + { + "epoch": 0.08501508782334165, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 9748 + }, + { + "epoch": 0.08502380910851023, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 9749 + }, + { + "epoch": 0.0850325303936788, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 9750 + }, + { + "epoch": 0.0850412516788474, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 9751 + }, + { + "epoch": 0.08504997296401598, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 9752 + }, + { + "epoch": 0.08505869424918457, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 9753 + }, + { + "epoch": 0.08506741553435314, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9754 + }, + { + "epoch": 0.08507613681952172, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 9755 + }, + { + "epoch": 0.08508485810469031, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 9756 + }, + { + "epoch": 0.08509357938985888, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 9757 + }, + { + "epoch": 0.08510230067502747, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 9758 + }, + { + "epoch": 0.08511102196019606, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 9759 + }, + { + "epoch": 0.08511974324536464, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 9760 + }, + { + "epoch": 0.08512846453053322, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 9761 + }, + { + "epoch": 0.0851371858157018, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 9762 + }, + { + "epoch": 0.08514590710087039, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9763 + }, + { + "epoch": 0.08515462838603896, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9764 + }, + { + "epoch": 0.08516334967120755, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 9765 + }, + { + "epoch": 0.08517207095637613, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 9766 + }, + { + "epoch": 0.08518079224154472, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 9767 + }, + { + "epoch": 0.08518951352671329, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 9768 + }, + { + "epoch": 0.08519823481188188, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 9769 + }, + { + "epoch": 0.08520695609705047, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 9770 + }, + { + "epoch": 0.08521567738221904, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 9771 + }, + { + "epoch": 0.08522439866738762, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 9772 + }, + { + "epoch": 0.08523311995255621, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 9773 + }, + { + "epoch": 0.0852418412377248, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 9774 + }, + { + "epoch": 0.08525056252289337, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 9775 + }, + { + "epoch": 0.08525928380806196, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9776 + }, + { + "epoch": 0.08526800509323054, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 9777 + }, + { + "epoch": 0.08527672637839911, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 9778 + }, + { + "epoch": 0.0852854476635677, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 9779 + }, + { + "epoch": 0.08529416894873629, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 9780 + }, + { + "epoch": 0.08530289023390487, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 9781 + }, + { + "epoch": 0.08531161151907345, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 9782 + }, + { + "epoch": 0.08532033280424203, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 9783 + }, + { + "epoch": 0.08532905408941062, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 9784 + }, + { + "epoch": 0.08533777537457919, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 9785 + }, + { + "epoch": 0.08534649665974778, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 9786 + }, + { + "epoch": 0.08535521794491636, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 9787 + }, + { + "epoch": 0.08536393923008495, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 9788 + }, + { + "epoch": 0.08537266051525352, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 9789 + }, + { + "epoch": 0.08538138180042211, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 9790 + }, + { + "epoch": 0.0853901030855907, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9791 + }, + { + "epoch": 0.08539882437075927, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9792 + }, + { + "epoch": 0.08540754565592786, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 9793 + }, + { + "epoch": 0.08541626694109644, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 9794 + }, + { + "epoch": 0.08542498822626503, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 9795 + }, + { + "epoch": 0.0854337095114336, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 9796 + }, + { + "epoch": 0.08544243079660219, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 9797 + }, + { + "epoch": 0.08545115208177077, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 9798 + }, + { + "epoch": 0.08545987336693935, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 9799 + }, + { + "epoch": 0.08546859465210793, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 9800 + }, + { + "epoch": 0.08547731593727652, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 9801 + }, + { + "epoch": 0.0854860372224451, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 9802 + }, + { + "epoch": 0.08549475850761368, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9803 + }, + { + "epoch": 0.08550347979278226, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 9804 + }, + { + "epoch": 0.08551220107795085, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 9805 + }, + { + "epoch": 0.08552092236311942, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 9806 + }, + { + "epoch": 0.08552964364828801, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 9807 + }, + { + "epoch": 0.0855383649334566, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 9808 + }, + { + "epoch": 0.08554708621862518, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 9809 + }, + { + "epoch": 0.08555580750379375, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 9810 + }, + { + "epoch": 0.08556452878896234, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 9811 + }, + { + "epoch": 0.08557325007413093, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 9812 + }, + { + "epoch": 0.0855819713592995, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 9813 + }, + { + "epoch": 0.08559069264446809, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 9814 + }, + { + "epoch": 0.08559941392963667, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 9815 + }, + { + "epoch": 0.08560813521480526, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 9816 + }, + { + "epoch": 0.08561685649997383, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 9817 + }, + { + "epoch": 0.08562557778514242, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 9818 + }, + { + "epoch": 0.085634299070311, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 9819 + }, + { + "epoch": 0.08564302035547958, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 9820 + }, + { + "epoch": 0.08565174164064816, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 9821 + }, + { + "epoch": 0.08566046292581675, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 9822 + }, + { + "epoch": 0.08566918421098534, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 9823 + }, + { + "epoch": 0.08567790549615391, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9824 + }, + { + "epoch": 0.0856866267813225, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9825 + }, + { + "epoch": 0.08569534806649108, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 9826 + }, + { + "epoch": 0.08570406935165967, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 9827 + }, + { + "epoch": 0.08571279063682824, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9828 + }, + { + "epoch": 0.08572151192199683, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 9829 + }, + { + "epoch": 0.08573023320716541, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 9830 + }, + { + "epoch": 0.08573895449233399, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 9831 + }, + { + "epoch": 0.08574767577750257, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 9832 + }, + { + "epoch": 0.08575639706267116, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 9833 + }, + { + "epoch": 0.08576511834783974, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 9834 + }, + { + "epoch": 0.08577383963300832, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 9835 + }, + { + "epoch": 0.0857825609181769, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 9836 + }, + { + "epoch": 0.08579128220334549, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9837 + }, + { + "epoch": 0.08580000348851406, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 9838 + }, + { + "epoch": 0.08580872477368265, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9839 + }, + { + "epoch": 0.08581744605885124, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 9840 + }, + { + "epoch": 0.08582616734401982, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 9841 + }, + { + "epoch": 0.0858348886291884, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9842 + }, + { + "epoch": 0.08584360991435698, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 9843 + }, + { + "epoch": 0.08585233119952557, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 9844 + }, + { + "epoch": 0.08586105248469414, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 9845 + }, + { + "epoch": 0.08586977376986273, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 9846 + }, + { + "epoch": 0.08587849505503131, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 9847 + }, + { + "epoch": 0.0858872163401999, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 9848 + }, + { + "epoch": 0.08589593762536847, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9849 + }, + { + "epoch": 0.08590465891053706, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 9850 + }, + { + "epoch": 0.08591338019570564, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 9851 + }, + { + "epoch": 0.08592210148087422, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 9852 + }, + { + "epoch": 0.0859308227660428, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 9853 + }, + { + "epoch": 0.08593954405121139, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 9854 + }, + { + "epoch": 0.08594826533637998, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 9855 + }, + { + "epoch": 0.08595698662154855, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 9856 + }, + { + "epoch": 0.08596570790671713, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9857 + }, + { + "epoch": 0.08597442919188572, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 9858 + }, + { + "epoch": 0.0859831504770543, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 9859 + }, + { + "epoch": 0.08599187176222288, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 9860 + }, + { + "epoch": 0.08600059304739147, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 9861 + }, + { + "epoch": 0.08600931433256005, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 9862 + }, + { + "epoch": 0.08601803561772862, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 9863 + }, + { + "epoch": 0.08602675690289721, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 9864 + }, + { + "epoch": 0.0860354781880658, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 9865 + }, + { + "epoch": 0.08604419947323437, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 9866 + }, + { + "epoch": 0.08605292075840296, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 9867 + }, + { + "epoch": 0.08606164204357154, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 9868 + }, + { + "epoch": 0.08607036332874013, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 9869 + }, + { + "epoch": 0.0860790846139087, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 9870 + }, + { + "epoch": 0.08608780589907729, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 9871 + }, + { + "epoch": 0.08609652718424587, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 9872 + }, + { + "epoch": 0.08610524846941445, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 9873 + }, + { + "epoch": 0.08611396975458303, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 9874 + }, + { + "epoch": 0.08612269103975162, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9875 + }, + { + "epoch": 0.0861314123249202, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 9876 + }, + { + "epoch": 0.08614013361008878, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 9877 + }, + { + "epoch": 0.08614885489525737, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 9878 + }, + { + "epoch": 0.08615757618042595, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 9879 + }, + { + "epoch": 0.08616629746559452, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9880 + }, + { + "epoch": 0.08617501875076311, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9881 + }, + { + "epoch": 0.0861837400359317, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 9882 + }, + { + "epoch": 0.08619246132110028, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 9883 + }, + { + "epoch": 0.08620118260626886, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9884 + }, + { + "epoch": 0.08620990389143744, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 9885 + }, + { + "epoch": 0.08621862517660603, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 9886 + }, + { + "epoch": 0.0862273464617746, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9887 + }, + { + "epoch": 0.08623606774694319, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9888 + }, + { + "epoch": 0.08624478903211177, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 9889 + }, + { + "epoch": 0.08625351031728036, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9890 + }, + { + "epoch": 0.08626223160244893, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 9891 + }, + { + "epoch": 0.08627095288761752, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 9892 + }, + { + "epoch": 0.0862796741727861, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 9893 + }, + { + "epoch": 0.08628839545795468, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 9894 + }, + { + "epoch": 0.08629711674312326, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 9895 + }, + { + "epoch": 0.08630583802829185, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 9896 + }, + { + "epoch": 0.08631455931346044, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 9897 + }, + { + "epoch": 0.08632328059862901, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 9898 + }, + { + "epoch": 0.0863320018837976, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9899 + }, + { + "epoch": 0.08634072316896618, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 9900 + }, + { + "epoch": 0.08634944445413476, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 9901 + }, + { + "epoch": 0.08635816573930334, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 9902 + }, + { + "epoch": 0.08636688702447193, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 9903 + }, + { + "epoch": 0.08637560830964051, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 9904 + }, + { + "epoch": 0.08638432959480909, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 9905 + }, + { + "epoch": 0.08639305087997767, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9906 + }, + { + "epoch": 0.08640177216514626, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 9907 + }, + { + "epoch": 0.08641049345031483, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9908 + }, + { + "epoch": 0.08641921473548342, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 9909 + }, + { + "epoch": 0.086427936020652, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 9910 + }, + { + "epoch": 0.08643665730582059, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 9911 + }, + { + "epoch": 0.08644537859098916, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 9912 + }, + { + "epoch": 0.08645409987615775, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 9913 + }, + { + "epoch": 0.08646282116132634, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 9914 + }, + { + "epoch": 0.08647154244649491, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 9915 + }, + { + "epoch": 0.0864802637316635, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 9916 + }, + { + "epoch": 0.08648898501683208, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 9917 + }, + { + "epoch": 0.08649770630200067, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 9918 + }, + { + "epoch": 0.08650642758716924, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 9919 + }, + { + "epoch": 0.08651514887233783, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 9920 + }, + { + "epoch": 0.08652387015750641, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 9921 + }, + { + "epoch": 0.08653259144267499, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 9922 + }, + { + "epoch": 0.08654131272784357, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 9923 + }, + { + "epoch": 0.08655003401301216, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 9924 + }, + { + "epoch": 0.08655875529818075, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 9925 + }, + { + "epoch": 0.08656747658334932, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 9926 + }, + { + "epoch": 0.0865761978685179, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 9927 + }, + { + "epoch": 0.08658491915368649, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 9928 + }, + { + "epoch": 0.08659364043885506, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 9929 + }, + { + "epoch": 0.08660236172402365, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 9930 + }, + { + "epoch": 0.08661108300919224, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 9931 + }, + { + "epoch": 0.08661980429436082, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 9932 + }, + { + "epoch": 0.0866285255795294, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 9933 + }, + { + "epoch": 0.08663724686469798, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9934 + }, + { + "epoch": 0.08664596814986657, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 9935 + }, + { + "epoch": 0.08665468943503514, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 9936 + }, + { + "epoch": 0.08666341072020373, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 9937 + }, + { + "epoch": 0.08667213200537231, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 9938 + }, + { + "epoch": 0.0866808532905409, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 9939 + }, + { + "epoch": 0.08668957457570947, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 9940 + }, + { + "epoch": 0.08669829586087806, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 9941 + }, + { + "epoch": 0.08670701714604664, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 9942 + }, + { + "epoch": 0.08671573843121523, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 9943 + }, + { + "epoch": 0.0867244597163838, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 9944 + }, + { + "epoch": 0.08673318100155239, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 9945 + }, + { + "epoch": 0.08674190228672098, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 9946 + }, + { + "epoch": 0.08675062357188955, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9947 + }, + { + "epoch": 0.08675934485705813, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 9948 + }, + { + "epoch": 0.08676806614222672, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 9949 + }, + { + "epoch": 0.08677678742739531, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 9950 + }, + { + "epoch": 0.08678550871256388, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 9951 + }, + { + "epoch": 0.08679422999773247, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 9952 + }, + { + "epoch": 0.08680295128290105, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 9953 + }, + { + "epoch": 0.08681167256806963, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 9954 + }, + { + "epoch": 0.08682039385323821, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 9955 + }, + { + "epoch": 0.0868291151384068, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 9956 + }, + { + "epoch": 0.08683783642357538, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 9957 + }, + { + "epoch": 0.08684655770874396, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9958 + }, + { + "epoch": 0.08685527899391254, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 9959 + }, + { + "epoch": 0.08686400027908113, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 9960 + }, + { + "epoch": 0.0868727215642497, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 9961 + }, + { + "epoch": 0.08688144284941829, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 9962 + }, + { + "epoch": 0.08689016413458688, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9963 + }, + { + "epoch": 0.08689888541975546, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 9964 + }, + { + "epoch": 0.08690760670492403, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 9965 + }, + { + "epoch": 0.08691632799009262, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 9966 + }, + { + "epoch": 0.08692504927526121, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 9967 + }, + { + "epoch": 0.08693377056042978, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 9968 + }, + { + "epoch": 0.08694249184559837, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9969 + }, + { + "epoch": 0.08695121313076695, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 9970 + }, + { + "epoch": 0.08695993441593554, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 9971 + }, + { + "epoch": 0.08696865570110411, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 9972 + }, + { + "epoch": 0.0869773769862727, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 9973 + }, + { + "epoch": 0.08698609827144128, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9974 + }, + { + "epoch": 0.08699481955660986, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 9975 + }, + { + "epoch": 0.08700354084177844, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9976 + }, + { + "epoch": 0.08701226212694703, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 9977 + }, + { + "epoch": 0.08702098341211562, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 9978 + }, + { + "epoch": 0.08702970469728419, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 9979 + }, + { + "epoch": 0.08703842598245277, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 9980 + }, + { + "epoch": 0.08704714726762136, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 9981 + }, + { + "epoch": 0.08705586855278993, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 9982 + }, + { + "epoch": 0.08706458983795852, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 9983 + }, + { + "epoch": 0.0870733111231271, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 9984 + }, + { + "epoch": 0.08708203240829569, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 9985 + }, + { + "epoch": 0.08709075369346427, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 9986 + }, + { + "epoch": 0.08709947497863285, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 9987 + }, + { + "epoch": 0.08710819626380144, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 9988 + }, + { + "epoch": 0.08711691754897001, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9989 + }, + { + "epoch": 0.0871256388341386, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 9990 + }, + { + "epoch": 0.08713436011930718, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 9991 + }, + { + "epoch": 0.08714308140447577, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 9992 + }, + { + "epoch": 0.08715180268964434, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 9993 + }, + { + "epoch": 0.08716052397481293, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9994 + }, + { + "epoch": 0.08716924525998151, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9995 + }, + { + "epoch": 0.08717796654515009, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 9996 + }, + { + "epoch": 0.08718668783031867, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 9997 + }, + { + "epoch": 0.08719540911548726, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 9998 + }, + { + "epoch": 0.08720413040065585, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 9999 + }, + { + "epoch": 0.08721285168582442, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 10000 + }, + { + "epoch": 0.087221572970993, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 10001 + }, + { + "epoch": 0.08723029425616159, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 10002 + }, + { + "epoch": 0.08723901554133016, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 10003 + }, + { + "epoch": 0.08724773682649875, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 10004 + }, + { + "epoch": 0.08725645811166734, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 10005 + }, + { + "epoch": 0.08726517939683592, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 10006 + }, + { + "epoch": 0.0872739006820045, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 10007 + }, + { + "epoch": 0.08728262196717308, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 10008 + }, + { + "epoch": 0.08729134325234167, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 10009 + }, + { + "epoch": 0.08730006453751024, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 10010 + }, + { + "epoch": 0.08730878582267883, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 10011 + }, + { + "epoch": 0.08731750710784741, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 10012 + }, + { + "epoch": 0.087326228393016, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 10013 + }, + { + "epoch": 0.08733494967818457, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 10014 + }, + { + "epoch": 0.08734367096335316, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 10015 + }, + { + "epoch": 0.08735239224852175, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 10016 + }, + { + "epoch": 0.08736111353369032, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 10017 + }, + { + "epoch": 0.0873698348188589, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 10018 + }, + { + "epoch": 0.08737855610402749, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 10019 + }, + { + "epoch": 0.08738727738919608, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 10020 + }, + { + "epoch": 0.08739599867436465, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 10021 + }, + { + "epoch": 0.08740471995953324, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 10022 + }, + { + "epoch": 0.08741344124470182, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 10023 + }, + { + "epoch": 0.0874221625298704, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 10024 + }, + { + "epoch": 0.08743088381503898, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 10025 + }, + { + "epoch": 0.08743960510020757, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 10026 + }, + { + "epoch": 0.08744832638537615, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 10027 + }, + { + "epoch": 0.08745704767054473, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 10028 + }, + { + "epoch": 0.08746576895571331, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 10029 + }, + { + "epoch": 0.0874744902408819, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 10030 + }, + { + "epoch": 0.08748321152605047, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 10031 + }, + { + "epoch": 0.08749193281121906, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 10032 + }, + { + "epoch": 0.08750065409638765, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 10033 + }, + { + "epoch": 0.08750937538155623, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 10034 + }, + { + "epoch": 0.0875180966667248, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 10035 + }, + { + "epoch": 0.08752681795189339, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 10036 + }, + { + "epoch": 0.08753553923706198, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 10037 + }, + { + "epoch": 0.08754426052223055, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 10038 + }, + { + "epoch": 0.08755298180739914, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 10039 + }, + { + "epoch": 0.08756170309256772, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 10040 + }, + { + "epoch": 0.08757042437773631, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 10041 + }, + { + "epoch": 0.08757914566290488, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 10042 + }, + { + "epoch": 0.08758786694807347, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 10043 + }, + { + "epoch": 0.08759658823324205, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 10044 + }, + { + "epoch": 0.08760530951841063, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 10045 + }, + { + "epoch": 0.08761403080357921, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 10046 + }, + { + "epoch": 0.0876227520887478, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 10047 + }, + { + "epoch": 0.08763147337391639, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 10048 + }, + { + "epoch": 0.08764019465908496, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 10049 + }, + { + "epoch": 0.08764891594425354, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 10050 + }, + { + "epoch": 0.08765763722942213, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 10051 + }, + { + "epoch": 0.0876663585145907, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 10052 + }, + { + "epoch": 0.08767507979975929, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 10053 + }, + { + "epoch": 0.08768380108492788, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 10054 + }, + { + "epoch": 0.08769252237009646, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 10055 + }, + { + "epoch": 0.08770124365526503, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 10056 + }, + { + "epoch": 0.08770996494043362, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 10057 + }, + { + "epoch": 0.08771868622560221, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 10058 + }, + { + "epoch": 0.0877274075107708, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 10059 + }, + { + "epoch": 0.08773612879593937, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 10060 + }, + { + "epoch": 0.08774485008110795, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 10061 + }, + { + "epoch": 0.08775357136627654, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 10062 + }, + { + "epoch": 0.08776229265144511, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 10063 + }, + { + "epoch": 0.0877710139366137, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 10064 + }, + { + "epoch": 0.08777973522178228, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 10065 + }, + { + "epoch": 0.08778845650695087, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 10066 + }, + { + "epoch": 0.08779717779211944, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 10067 + }, + { + "epoch": 0.08780589907728803, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 10068 + }, + { + "epoch": 0.08781462036245662, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 10069 + }, + { + "epoch": 0.08782334164762519, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 10070 + }, + { + "epoch": 0.08783206293279378, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 10071 + }, + { + "epoch": 0.08784078421796236, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 10072 + }, + { + "epoch": 0.08784950550313095, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 10073 + }, + { + "epoch": 0.08785822678829952, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 10074 + }, + { + "epoch": 0.0878669480734681, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 10075 + }, + { + "epoch": 0.0878756693586367, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 10076 + }, + { + "epoch": 0.08788439064380527, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 10077 + }, + { + "epoch": 0.08789311192897385, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 10078 + }, + { + "epoch": 0.08790183321414244, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 10079 + }, + { + "epoch": 0.08791055449931102, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 10080 + }, + { + "epoch": 0.0879192757844796, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 10081 + }, + { + "epoch": 0.08792799706964818, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 10082 + }, + { + "epoch": 0.08793671835481677, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 10083 + }, + { + "epoch": 0.08794543963998534, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 10084 + }, + { + "epoch": 0.08795416092515393, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 10085 + }, + { + "epoch": 0.08796288221032252, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 10086 + }, + { + "epoch": 0.0879716034954911, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 10087 + }, + { + "epoch": 0.08798032478065967, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 10088 + }, + { + "epoch": 0.08798904606582826, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 10089 + }, + { + "epoch": 0.08799776735099685, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 10090 + }, + { + "epoch": 0.08800648863616542, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 10091 + }, + { + "epoch": 0.088015209921334, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 10092 + }, + { + "epoch": 0.08802393120650259, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 10093 + }, + { + "epoch": 0.08803265249167118, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 10094 + }, + { + "epoch": 0.08804137377683975, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 10095 + }, + { + "epoch": 0.08805009506200834, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 10096 + }, + { + "epoch": 0.08805881634717692, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 10097 + }, + { + "epoch": 0.0880675376323455, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 10098 + }, + { + "epoch": 0.08807625891751408, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 10099 + }, + { + "epoch": 0.08808498020268267, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 10100 + }, + { + "epoch": 0.08809370148785126, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 10101 + }, + { + "epoch": 0.08810242277301983, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 10102 + }, + { + "epoch": 0.08811114405818841, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 10103 + }, + { + "epoch": 0.088119865343357, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 10104 + }, + { + "epoch": 0.08812858662852557, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 10105 + }, + { + "epoch": 0.08813730791369416, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 10106 + }, + { + "epoch": 0.08814602919886275, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 10107 + }, + { + "epoch": 0.08815475048403133, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 10108 + }, + { + "epoch": 0.0881634717691999, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 10109 + }, + { + "epoch": 0.08817219305436849, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 10110 + }, + { + "epoch": 0.08818091433953708, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 10111 + }, + { + "epoch": 0.08818963562470565, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 10112 + }, + { + "epoch": 0.08819835690987424, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 10113 + }, + { + "epoch": 0.08820707819504282, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 10114 + }, + { + "epoch": 0.08821579948021141, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 10115 + }, + { + "epoch": 0.08822452076537998, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 10116 + }, + { + "epoch": 0.08823324205054857, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 10117 + }, + { + "epoch": 0.08824196333571716, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 10118 + }, + { + "epoch": 0.08825068462088573, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 10119 + }, + { + "epoch": 0.08825940590605431, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 10120 + }, + { + "epoch": 0.0882681271912229, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 10121 + }, + { + "epoch": 0.08827684847639149, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 10122 + }, + { + "epoch": 0.08828556976156006, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 10123 + }, + { + "epoch": 0.08829429104672865, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 10124 + }, + { + "epoch": 0.08830301233189723, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 10125 + }, + { + "epoch": 0.0883117336170658, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 10126 + }, + { + "epoch": 0.08832045490223439, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 10127 + }, + { + "epoch": 0.08832917618740298, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 10128 + }, + { + "epoch": 0.08833789747257156, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 10129 + }, + { + "epoch": 0.08834661875774014, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 10130 + }, + { + "epoch": 0.08835534004290872, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 10131 + }, + { + "epoch": 0.08836406132807731, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 10132 + }, + { + "epoch": 0.08837278261324588, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 10133 + }, + { + "epoch": 0.08838150389841447, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 10134 + }, + { + "epoch": 0.08839022518358305, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 10135 + }, + { + "epoch": 0.08839894646875164, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 10136 + }, + { + "epoch": 0.08840766775392021, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 10137 + }, + { + "epoch": 0.0884163890390888, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 10138 + }, + { + "epoch": 0.08842511032425739, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 10139 + }, + { + "epoch": 0.08843383160942596, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 10140 + }, + { + "epoch": 0.08844255289459454, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 10141 + }, + { + "epoch": 0.08845127417976313, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 10142 + }, + { + "epoch": 0.08845999546493172, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 10143 + }, + { + "epoch": 0.08846871675010029, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 10144 + }, + { + "epoch": 0.08847743803526888, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 10145 + }, + { + "epoch": 0.08848615932043746, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 10146 + }, + { + "epoch": 0.08849488060560604, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 10147 + }, + { + "epoch": 0.08850360189077462, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 10148 + }, + { + "epoch": 0.08851232317594321, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 10149 + }, + { + "epoch": 0.0885210444611118, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 10150 + }, + { + "epoch": 0.08852976574628037, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 10151 + }, + { + "epoch": 0.08853848703144895, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 10152 + }, + { + "epoch": 0.08854720831661754, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 10153 + }, + { + "epoch": 0.08855592960178611, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 10154 + }, + { + "epoch": 0.0885646508869547, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 10155 + }, + { + "epoch": 0.08857337217212329, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 10156 + }, + { + "epoch": 0.08858209345729187, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 10157 + }, + { + "epoch": 0.08859081474246044, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 10158 + }, + { + "epoch": 0.08859953602762903, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 10159 + }, + { + "epoch": 0.08860825731279762, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 10160 + }, + { + "epoch": 0.08861697859796619, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 10161 + }, + { + "epoch": 0.08862569988313478, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 10162 + }, + { + "epoch": 0.08863442116830336, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 10163 + }, + { + "epoch": 0.08864314245347195, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 10164 + }, + { + "epoch": 0.08865186373864052, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 10165 + }, + { + "epoch": 0.08866058502380911, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 10166 + }, + { + "epoch": 0.0886693063089777, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 10167 + }, + { + "epoch": 0.08867802759414628, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 10168 + }, + { + "epoch": 0.08868674887931485, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 10169 + }, + { + "epoch": 0.08869547016448344, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 10170 + }, + { + "epoch": 0.08870419144965203, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 10171 + }, + { + "epoch": 0.0887129127348206, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 10172 + }, + { + "epoch": 0.08872163401998918, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 10173 + }, + { + "epoch": 0.08873035530515777, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 10174 + }, + { + "epoch": 0.08873907659032636, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 10175 + }, + { + "epoch": 0.08874779787549493, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 10176 + }, + { + "epoch": 0.08875651916066352, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 10177 + }, + { + "epoch": 0.0887652404458321, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 10178 + }, + { + "epoch": 0.08877396173100068, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 10179 + }, + { + "epoch": 0.08878268301616926, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 10180 + }, + { + "epoch": 0.08879140430133785, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 10181 + }, + { + "epoch": 0.08880012558650643, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 10182 + }, + { + "epoch": 0.088808846871675, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 10183 + }, + { + "epoch": 0.0888175681568436, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 10184 + }, + { + "epoch": 0.08882628944201218, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 10185 + }, + { + "epoch": 0.08883501072718075, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 10186 + }, + { + "epoch": 0.08884373201234934, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 10187 + }, + { + "epoch": 0.08885245329751792, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 10188 + }, + { + "epoch": 0.08886117458268651, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 10189 + }, + { + "epoch": 0.08886989586785508, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 10190 + }, + { + "epoch": 0.08887861715302367, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 10191 + }, + { + "epoch": 0.08888733843819226, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 10192 + }, + { + "epoch": 0.08889605972336083, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 10193 + }, + { + "epoch": 0.08890478100852942, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 10194 + }, + { + "epoch": 0.088913502293698, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 10195 + }, + { + "epoch": 0.08892222357886659, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 10196 + }, + { + "epoch": 0.08893094486403516, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 10197 + }, + { + "epoch": 0.08893966614920375, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 10198 + }, + { + "epoch": 0.08894838743437233, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 10199 + }, + { + "epoch": 0.0889571087195409, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 10200 + }, + { + "epoch": 0.08896583000470949, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 10201 + }, + { + "epoch": 0.08897455128987808, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 10202 + }, + { + "epoch": 0.08898327257504667, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 10203 + }, + { + "epoch": 0.08899199386021524, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 10204 + }, + { + "epoch": 0.08900071514538382, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 10205 + }, + { + "epoch": 0.08900943643055241, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 10206 + }, + { + "epoch": 0.08901815771572098, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 10207 + }, + { + "epoch": 0.08902687900088957, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 10208 + }, + { + "epoch": 0.08903560028605816, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 10209 + }, + { + "epoch": 0.08904432157122674, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 10210 + }, + { + "epoch": 0.08905304285639531, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 10211 + }, + { + "epoch": 0.0890617641415639, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 10212 + }, + { + "epoch": 0.08907048542673249, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 10213 + }, + { + "epoch": 0.08907920671190106, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 10214 + }, + { + "epoch": 0.08908792799706965, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 10215 + }, + { + "epoch": 0.08909664928223823, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 10216 + }, + { + "epoch": 0.08910537056740682, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 10217 + }, + { + "epoch": 0.08911409185257539, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 10218 + }, + { + "epoch": 0.08912281313774398, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 10219 + }, + { + "epoch": 0.08913153442291256, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 10220 + }, + { + "epoch": 0.08914025570808114, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 10221 + }, + { + "epoch": 0.08914897699324972, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 10222 + }, + { + "epoch": 0.08915769827841831, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 10223 + }, + { + "epoch": 0.0891664195635869, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 10224 + }, + { + "epoch": 0.08917514084875547, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 10225 + }, + { + "epoch": 0.08918386213392406, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 10226 + }, + { + "epoch": 0.08919258341909264, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 10227 + }, + { + "epoch": 0.08920130470426121, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 10228 + }, + { + "epoch": 0.0892100259894298, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 10229 + }, + { + "epoch": 0.08921874727459839, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 10230 + }, + { + "epoch": 0.08922746855976697, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 10231 + }, + { + "epoch": 0.08923618984493555, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 10232 + }, + { + "epoch": 0.08924491113010413, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 10233 + }, + { + "epoch": 0.08925363241527272, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 10234 + }, + { + "epoch": 0.08926235370044129, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 10235 + }, + { + "epoch": 0.08927107498560988, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 10236 + }, + { + "epoch": 0.08927979627077846, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 10237 + }, + { + "epoch": 0.08928851755594705, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 10238 + }, + { + "epoch": 0.08929723884111562, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 10239 + }, + { + "epoch": 0.08930596012628421, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 10240 + }, + { + "epoch": 0.0893146814114528, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 10241 + }, + { + "epoch": 0.08932340269662137, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 10242 + }, + { + "epoch": 0.08933212398178995, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 10243 + }, + { + "epoch": 0.08934084526695854, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 10244 + }, + { + "epoch": 0.08934956655212713, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 10245 + }, + { + "epoch": 0.0893582878372957, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 10246 + }, + { + "epoch": 0.08936700912246429, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 10247 + }, + { + "epoch": 0.08937573040763287, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 10248 + }, + { + "epoch": 0.08938445169280144, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 10249 + }, + { + "epoch": 0.08939317297797003, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 10250 + }, + { + "epoch": 0.08940189426313862, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 10251 + }, + { + "epoch": 0.0894106155483072, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 10252 + }, + { + "epoch": 0.08941933683347578, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 10253 + }, + { + "epoch": 0.08942805811864436, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 10254 + }, + { + "epoch": 0.08943677940381295, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 10255 + }, + { + "epoch": 0.08944550068898152, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 10256 + }, + { + "epoch": 0.08945422197415011, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 10257 + }, + { + "epoch": 0.0894629432593187, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 10258 + }, + { + "epoch": 0.08947166454448728, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 10259 + }, + { + "epoch": 0.08948038582965585, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 10260 + }, + { + "epoch": 0.08948910711482444, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 10261 + }, + { + "epoch": 0.08949782839999303, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 10262 + }, + { + "epoch": 0.0895065496851616, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 10263 + }, + { + "epoch": 0.08951527097033019, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 10264 + }, + { + "epoch": 0.08952399225549877, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 10265 + }, + { + "epoch": 0.08953271354066736, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 10266 + }, + { + "epoch": 0.08954143482583593, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 10267 + }, + { + "epoch": 0.08955015611100452, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 10268 + }, + { + "epoch": 0.0895588773961731, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 10269 + }, + { + "epoch": 0.08956759868134168, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 10270 + }, + { + "epoch": 0.08957631996651026, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 10271 + }, + { + "epoch": 0.08958504125167885, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 10272 + }, + { + "epoch": 0.08959376253684743, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 10273 + }, + { + "epoch": 0.08960248382201601, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 10274 + }, + { + "epoch": 0.0896112051071846, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 10275 + }, + { + "epoch": 0.08961992639235318, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 10276 + }, + { + "epoch": 0.08962864767752175, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 10277 + }, + { + "epoch": 0.08963736896269034, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 10278 + }, + { + "epoch": 0.08964609024785893, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 10279 + }, + { + "epoch": 0.08965481153302751, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 10280 + }, + { + "epoch": 0.08966353281819608, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 10281 + }, + { + "epoch": 0.08967225410336467, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 10282 + }, + { + "epoch": 0.08968097538853326, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 10283 + }, + { + "epoch": 0.08968969667370184, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 10284 + }, + { + "epoch": 0.08969841795887042, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 10285 + }, + { + "epoch": 0.089707139244039, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 10286 + }, + { + "epoch": 0.08971586052920759, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 10287 + }, + { + "epoch": 0.08972458181437616, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 10288 + }, + { + "epoch": 0.08973330309954475, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 10289 + }, + { + "epoch": 0.08974202438471333, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 10290 + }, + { + "epoch": 0.08975074566988192, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 10291 + }, + { + "epoch": 0.0897594669550505, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 10292 + }, + { + "epoch": 0.08976818824021908, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 10293 + }, + { + "epoch": 0.08977690952538767, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 10294 + }, + { + "epoch": 0.08978563081055624, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 10295 + }, + { + "epoch": 0.08979435209572482, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 10296 + }, + { + "epoch": 0.08980307338089341, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 10297 + }, + { + "epoch": 0.089811794666062, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 10298 + }, + { + "epoch": 0.08982051595123057, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 10299 + }, + { + "epoch": 0.08982923723639916, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 10300 + }, + { + "epoch": 0.08983795852156774, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 10301 + }, + { + "epoch": 0.08984667980673632, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 10302 + }, + { + "epoch": 0.0898554010919049, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 10303 + }, + { + "epoch": 0.08986412237707349, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 10304 + }, + { + "epoch": 0.08987284366224207, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 10305 + }, + { + "epoch": 0.08988156494741065, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 10306 + }, + { + "epoch": 0.08989028623257923, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 10307 + }, + { + "epoch": 0.08989900751774782, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 10308 + }, + { + "epoch": 0.08990772880291639, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 10309 + }, + { + "epoch": 0.08991645008808498, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 10310 + }, + { + "epoch": 0.08992517137325357, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 10311 + }, + { + "epoch": 0.08993389265842215, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 10312 + }, + { + "epoch": 0.08994261394359072, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 10313 + }, + { + "epoch": 0.08995133522875931, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 10314 + }, + { + "epoch": 0.0899600565139279, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 10315 + }, + { + "epoch": 0.08996877779909647, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 10316 + }, + { + "epoch": 0.08997749908426506, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 10317 + }, + { + "epoch": 0.08998622036943364, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 10318 + }, + { + "epoch": 0.08999494165460223, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 10319 + }, + { + "epoch": 0.0900036629397708, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 10320 + }, + { + "epoch": 0.09001238422493939, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 10321 + }, + { + "epoch": 0.09002110551010797, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 10322 + }, + { + "epoch": 0.09002982679527655, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 10323 + }, + { + "epoch": 0.09003854808044513, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 10324 + }, + { + "epoch": 0.09004726936561372, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 10325 + }, + { + "epoch": 0.0900559906507823, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 10326 + }, + { + "epoch": 0.09006471193595088, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 10327 + }, + { + "epoch": 0.09007343322111946, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 10328 + }, + { + "epoch": 0.09008215450628805, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 10329 + }, + { + "epoch": 0.09009087579145662, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 10330 + }, + { + "epoch": 0.09009959707662521, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 10331 + }, + { + "epoch": 0.0901083183617938, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 10332 + }, + { + "epoch": 0.09011703964696238, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 10333 + }, + { + "epoch": 0.09012576093213095, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 10334 + }, + { + "epoch": 0.09013448221729954, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 10335 + }, + { + "epoch": 0.09014320350246813, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 10336 + }, + { + "epoch": 0.0901519247876367, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 10337 + }, + { + "epoch": 0.09016064607280529, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 10338 + }, + { + "epoch": 0.09016936735797387, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 10339 + }, + { + "epoch": 0.09017808864314246, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 10340 + }, + { + "epoch": 0.09018680992831103, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 10341 + }, + { + "epoch": 0.09019553121347962, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 10342 + }, + { + "epoch": 0.0902042524986482, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 10343 + }, + { + "epoch": 0.09021297378381678, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 10344 + }, + { + "epoch": 0.09022169506898536, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 10345 + }, + { + "epoch": 0.09023041635415395, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 10346 + }, + { + "epoch": 0.09023913763932254, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 10347 + }, + { + "epoch": 0.09024785892449111, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 10348 + }, + { + "epoch": 0.0902565802096597, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 10349 + }, + { + "epoch": 0.09026530149482828, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 10350 + }, + { + "epoch": 0.09027402277999685, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 10351 + }, + { + "epoch": 0.09028274406516544, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 10352 + }, + { + "epoch": 0.09029146535033403, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 10353 + }, + { + "epoch": 0.09030018663550261, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 10354 + }, + { + "epoch": 0.09030890792067119, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 10355 + }, + { + "epoch": 0.09031762920583977, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 10356 + }, + { + "epoch": 0.09032635049100836, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 10357 + }, + { + "epoch": 0.09033507177617693, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 10358 + }, + { + "epoch": 0.09034379306134552, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 10359 + }, + { + "epoch": 0.0903525143465141, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 10360 + }, + { + "epoch": 0.09036123563168269, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 10361 + }, + { + "epoch": 0.09036995691685126, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 10362 + }, + { + "epoch": 0.09037867820201985, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 10363 + }, + { + "epoch": 0.09038739948718844, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 10364 + }, + { + "epoch": 0.09039612077235701, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 10365 + }, + { + "epoch": 0.0904048420575256, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 10366 + }, + { + "epoch": 0.09041356334269418, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 10367 + }, + { + "epoch": 0.09042228462786277, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 10368 + }, + { + "epoch": 0.09043100591303134, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 10369 + }, + { + "epoch": 0.09043972719819993, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 10370 + }, + { + "epoch": 0.09044844848336851, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 10371 + }, + { + "epoch": 0.09045716976853709, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 10372 + }, + { + "epoch": 0.09046589105370567, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 10373 + }, + { + "epoch": 0.09047461233887426, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 10374 + }, + { + "epoch": 0.09048333362404284, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 10375 + }, + { + "epoch": 0.09049205490921142, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 10376 + }, + { + "epoch": 0.09050077619438, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0742, + "step": 10377 + }, + { + "epoch": 0.09050949747954859, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 10378 + }, + { + "epoch": 0.09051821876471716, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 10379 + }, + { + "epoch": 0.09052694004988575, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 10380 + }, + { + "epoch": 0.09053566133505433, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 10381 + }, + { + "epoch": 0.09054438262022292, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 10382 + }, + { + "epoch": 0.0905531039053915, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 10383 + }, + { + "epoch": 0.09056182519056008, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 10384 + }, + { + "epoch": 0.09057054647572867, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 10385 + }, + { + "epoch": 0.09057926776089724, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 10386 + }, + { + "epoch": 0.09058798904606583, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 10387 + }, + { + "epoch": 0.09059671033123441, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 10388 + }, + { + "epoch": 0.090605431616403, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 10389 + }, + { + "epoch": 0.09061415290157157, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 10390 + }, + { + "epoch": 0.09062287418674016, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 10391 + }, + { + "epoch": 0.09063159547190874, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 10392 + }, + { + "epoch": 0.09064031675707732, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 10393 + }, + { + "epoch": 0.0906490380422459, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 10394 + }, + { + "epoch": 0.09065775932741449, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 10395 + }, + { + "epoch": 0.09066648061258308, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 10396 + }, + { + "epoch": 0.09067520189775165, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 10397 + }, + { + "epoch": 0.09068392318292023, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 10398 + }, + { + "epoch": 0.09069264446808882, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 10399 + }, + { + "epoch": 0.0907013657532574, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 10400 + }, + { + "epoch": 0.09071008703842598, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 10401 + }, + { + "epoch": 0.09071880832359457, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 10402 + }, + { + "epoch": 0.09072752960876315, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 10403 + }, + { + "epoch": 0.09073625089393172, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 10404 + }, + { + "epoch": 0.09074497217910031, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 10405 + }, + { + "epoch": 0.0907536934642689, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 10406 + }, + { + "epoch": 0.09076241474943748, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 10407 + }, + { + "epoch": 0.09077113603460606, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 10408 + }, + { + "epoch": 0.09077985731977464, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 10409 + }, + { + "epoch": 0.09078857860494323, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 10410 + }, + { + "epoch": 0.0907972998901118, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 10411 + }, + { + "epoch": 0.09080602117528039, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 10412 + }, + { + "epoch": 0.09081474246044897, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 10413 + }, + { + "epoch": 0.09082346374561756, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 10414 + }, + { + "epoch": 0.09083218503078613, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 10415 + }, + { + "epoch": 0.09084090631595472, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 10416 + }, + { + "epoch": 0.0908496276011233, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 10417 + }, + { + "epoch": 0.09085834888629188, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 10418 + }, + { + "epoch": 0.09086707017146047, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 10419 + }, + { + "epoch": 0.09087579145662905, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 10420 + }, + { + "epoch": 0.09088451274179764, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 10421 + }, + { + "epoch": 0.09089323402696621, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 10422 + }, + { + "epoch": 0.0909019553121348, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 10423 + }, + { + "epoch": 0.09091067659730338, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 10424 + }, + { + "epoch": 0.09091939788247196, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 10425 + }, + { + "epoch": 0.09092811916764054, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 10426 + }, + { + "epoch": 0.09093684045280913, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 10427 + }, + { + "epoch": 0.09094556173797771, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 10428 + }, + { + "epoch": 0.09095428302314629, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 10429 + }, + { + "epoch": 0.09096300430831487, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 10430 + }, + { + "epoch": 0.09097172559348346, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 10431 + }, + { + "epoch": 0.09098044687865203, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 10432 + }, + { + "epoch": 0.09098916816382062, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 10433 + }, + { + "epoch": 0.0909978894489892, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 10434 + }, + { + "epoch": 0.09100661073415779, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 10435 + }, + { + "epoch": 0.09101533201932636, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 10436 + }, + { + "epoch": 0.09102405330449495, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 10437 + }, + { + "epoch": 0.09103277458966354, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 10438 + }, + { + "epoch": 0.09104149587483211, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 10439 + }, + { + "epoch": 0.0910502171600007, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 10440 + }, + { + "epoch": 0.09105893844516928, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 10441 + }, + { + "epoch": 0.09106765973033787, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 10442 + }, + { + "epoch": 0.09107638101550644, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 10443 + }, + { + "epoch": 0.09108510230067503, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 10444 + }, + { + "epoch": 0.09109382358584361, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 10445 + }, + { + "epoch": 0.09110254487101219, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 10446 + }, + { + "epoch": 0.09111126615618077, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 10447 + }, + { + "epoch": 0.09111998744134936, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 10448 + }, + { + "epoch": 0.09112870872651795, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 10449 + }, + { + "epoch": 0.09113743001168652, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 10450 + }, + { + "epoch": 0.0911461512968551, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 10451 + }, + { + "epoch": 0.09115487258202369, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 10452 + }, + { + "epoch": 0.09116359386719226, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 10453 + }, + { + "epoch": 0.09117231515236085, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 10454 + }, + { + "epoch": 0.09118103643752944, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 10455 + }, + { + "epoch": 0.09118975772269802, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 10456 + }, + { + "epoch": 0.0911984790078666, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 10457 + }, + { + "epoch": 0.09120720029303518, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 10458 + }, + { + "epoch": 0.09121592157820377, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 10459 + }, + { + "epoch": 0.09122464286337234, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 10460 + }, + { + "epoch": 0.09123336414854093, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 10461 + }, + { + "epoch": 0.09124208543370951, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 10462 + }, + { + "epoch": 0.0912508067188781, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 10463 + }, + { + "epoch": 0.09125952800404667, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 10464 + }, + { + "epoch": 0.09126824928921526, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 10465 + }, + { + "epoch": 0.09127697057438384, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 10466 + }, + { + "epoch": 0.09128569185955242, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 10467 + }, + { + "epoch": 0.091294413144721, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 10468 + }, + { + "epoch": 0.09130313442988959, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 10469 + }, + { + "epoch": 0.09131185571505818, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 10470 + }, + { + "epoch": 0.09132057700022675, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 10471 + }, + { + "epoch": 0.09132929828539534, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 10472 + }, + { + "epoch": 0.09133801957056392, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 10473 + }, + { + "epoch": 0.0913467408557325, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 10474 + }, + { + "epoch": 0.09135546214090108, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 10475 + }, + { + "epoch": 0.09136418342606967, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 10476 + }, + { + "epoch": 0.09137290471123825, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 10477 + }, + { + "epoch": 0.09138162599640683, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 10478 + }, + { + "epoch": 0.09139034728157541, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 10479 + }, + { + "epoch": 0.091399068566744, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 10480 + }, + { + "epoch": 0.09140778985191257, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 10481 + }, + { + "epoch": 0.09141651113708116, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 10482 + }, + { + "epoch": 0.09142523242224974, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 10483 + }, + { + "epoch": 0.09143395370741833, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 10484 + }, + { + "epoch": 0.0914426749925869, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 10485 + }, + { + "epoch": 0.09145139627775549, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 10486 + }, + { + "epoch": 0.09146011756292408, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 10487 + }, + { + "epoch": 0.09146883884809265, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 10488 + }, + { + "epoch": 0.09147756013326123, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 10489 + }, + { + "epoch": 0.09148628141842982, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 10490 + }, + { + "epoch": 0.09149500270359841, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 10491 + }, + { + "epoch": 0.09150372398876698, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 10492 + }, + { + "epoch": 0.09151244527393557, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 10493 + }, + { + "epoch": 0.09152116655910415, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 10494 + }, + { + "epoch": 0.09152988784427273, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 10495 + }, + { + "epoch": 0.09153860912944131, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 10496 + }, + { + "epoch": 0.0915473304146099, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 10497 + }, + { + "epoch": 0.09155605169977848, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 10498 + }, + { + "epoch": 0.09156477298494706, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 10499 + }, + { + "epoch": 0.09157349427011564, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 10500 + }, + { + "epoch": 0.09158221555528423, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 10501 + }, + { + "epoch": 0.0915909368404528, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 10502 + }, + { + "epoch": 0.09159965812562139, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 10503 + }, + { + "epoch": 0.09160837941078998, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 10504 + }, + { + "epoch": 0.09161710069595856, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 10505 + }, + { + "epoch": 0.09162582198112713, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 10506 + }, + { + "epoch": 0.09163454326629572, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 10507 + }, + { + "epoch": 0.0916432645514643, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 10508 + }, + { + "epoch": 0.09165198583663288, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 10509 + }, + { + "epoch": 0.09166070712180147, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 10510 + }, + { + "epoch": 0.09166942840697005, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 10511 + }, + { + "epoch": 0.09167814969213864, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 10512 + }, + { + "epoch": 0.09168687097730721, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 10513 + }, + { + "epoch": 0.0916955922624758, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 10514 + }, + { + "epoch": 0.09170431354764438, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 10515 + }, + { + "epoch": 0.09171303483281297, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 10516 + }, + { + "epoch": 0.09172175611798154, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 10517 + }, + { + "epoch": 0.09173047740315013, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 10518 + }, + { + "epoch": 0.09173919868831872, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 10519 + }, + { + "epoch": 0.09174791997348729, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 10520 + }, + { + "epoch": 0.09175664125865587, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 10521 + }, + { + "epoch": 0.09176536254382446, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 10522 + }, + { + "epoch": 0.09177408382899305, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 10523 + }, + { + "epoch": 0.09178280511416162, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 10524 + }, + { + "epoch": 0.0917915263993302, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 10525 + }, + { + "epoch": 0.09180024768449879, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 10526 + }, + { + "epoch": 0.09180896896966736, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 10527 + }, + { + "epoch": 0.09181769025483595, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 10528 + }, + { + "epoch": 0.09182641154000454, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 10529 + }, + { + "epoch": 0.09183513282517312, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 10530 + }, + { + "epoch": 0.0918438541103417, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 10531 + }, + { + "epoch": 0.09185257539551028, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 10532 + }, + { + "epoch": 0.09186129668067887, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 10533 + }, + { + "epoch": 0.09187001796584744, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 10534 + }, + { + "epoch": 0.09187873925101603, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 10535 + }, + { + "epoch": 0.09188746053618461, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 10536 + }, + { + "epoch": 0.0918961818213532, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 10537 + }, + { + "epoch": 0.09190490310652177, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 10538 + }, + { + "epoch": 0.09191362439169036, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 10539 + }, + { + "epoch": 0.09192234567685895, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 10540 + }, + { + "epoch": 0.09193106696202752, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 10541 + }, + { + "epoch": 0.0919397882471961, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 10542 + }, + { + "epoch": 0.09194850953236469, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 10543 + }, + { + "epoch": 0.09195723081753328, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 10544 + }, + { + "epoch": 0.09196595210270185, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 10545 + }, + { + "epoch": 0.09197467338787044, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 10546 + }, + { + "epoch": 0.09198339467303902, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 10547 + }, + { + "epoch": 0.0919921159582076, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 10548 + }, + { + "epoch": 0.09200083724337618, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 10549 + }, + { + "epoch": 0.09200955852854477, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 10550 + }, + { + "epoch": 0.09201827981371336, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 10551 + }, + { + "epoch": 0.09202700109888193, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 10552 + }, + { + "epoch": 0.09203572238405051, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 10553 + }, + { + "epoch": 0.0920444436692191, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 10554 + }, + { + "epoch": 0.09205316495438767, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 10555 + }, + { + "epoch": 0.09206188623955626, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 10556 + }, + { + "epoch": 0.09207060752472485, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 10557 + }, + { + "epoch": 0.09207932880989343, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 10558 + }, + { + "epoch": 0.092088050095062, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 10559 + }, + { + "epoch": 0.09209677138023059, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 10560 + }, + { + "epoch": 0.09210549266539918, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 10561 + }, + { + "epoch": 0.09211421395056775, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 10562 + }, + { + "epoch": 0.09212293523573634, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 10563 + }, + { + "epoch": 0.09213165652090492, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 10564 + }, + { + "epoch": 0.09214037780607351, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 10565 + }, + { + "epoch": 0.09214909909124208, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 10566 + }, + { + "epoch": 0.09215782037641067, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 10567 + }, + { + "epoch": 0.09216654166157925, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 10568 + }, + { + "epoch": 0.09217526294674783, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 10569 + }, + { + "epoch": 0.09218398423191641, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 10570 + }, + { + "epoch": 0.092192705517085, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 10571 + }, + { + "epoch": 0.09220142680225359, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 10572 + }, + { + "epoch": 0.09221014808742216, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 10573 + }, + { + "epoch": 0.09221886937259074, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 10574 + }, + { + "epoch": 0.09222759065775933, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 10575 + }, + { + "epoch": 0.0922363119429279, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 10576 + }, + { + "epoch": 0.09224503322809649, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 10577 + }, + { + "epoch": 0.09225375451326508, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 10578 + }, + { + "epoch": 0.09226247579843366, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 10579 + }, + { + "epoch": 0.09227119708360224, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 10580 + }, + { + "epoch": 0.09227991836877082, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 10581 + }, + { + "epoch": 0.09228863965393941, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 10582 + }, + { + "epoch": 0.09229736093910798, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 10583 + }, + { + "epoch": 0.09230608222427657, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 10584 + }, + { + "epoch": 0.09231480350944515, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 10585 + }, + { + "epoch": 0.09232352479461374, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 10586 + }, + { + "epoch": 0.09233224607978231, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 10587 + }, + { + "epoch": 0.0923409673649509, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 10588 + }, + { + "epoch": 0.09234968865011949, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 10589 + }, + { + "epoch": 0.09235840993528806, + "grad_norm": 0.41796875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 10590 + }, + { + "epoch": 0.09236713122045664, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 10591 + }, + { + "epoch": 0.09237585250562523, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 10592 + }, + { + "epoch": 0.09238457379079382, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 10593 + }, + { + "epoch": 0.09239329507596239, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 10594 + }, + { + "epoch": 0.09240201636113098, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 10595 + }, + { + "epoch": 0.09241073764629956, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 10596 + }, + { + "epoch": 0.09241945893146813, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 10597 + }, + { + "epoch": 0.09242818021663672, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 10598 + }, + { + "epoch": 0.09243690150180531, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 10599 + }, + { + "epoch": 0.0924456227869739, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 10600 + }, + { + "epoch": 0.09245434407214247, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 10601 + }, + { + "epoch": 0.09246306535731105, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 10602 + }, + { + "epoch": 0.09247178664247964, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 10603 + }, + { + "epoch": 0.09248050792764821, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 10604 + }, + { + "epoch": 0.0924892292128168, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 10605 + }, + { + "epoch": 0.09249795049798538, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 10606 + }, + { + "epoch": 0.09250667178315397, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 10607 + }, + { + "epoch": 0.09251539306832254, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 10608 + }, + { + "epoch": 0.09252411435349113, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 10609 + }, + { + "epoch": 0.09253283563865972, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 10610 + }, + { + "epoch": 0.09254155692382829, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 10611 + }, + { + "epoch": 0.09255027820899688, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 10612 + }, + { + "epoch": 0.09255899949416546, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 10613 + }, + { + "epoch": 0.09256772077933405, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 10614 + }, + { + "epoch": 0.09257644206450262, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 10615 + }, + { + "epoch": 0.0925851633496712, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 10616 + }, + { + "epoch": 0.0925938846348398, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 10617 + }, + { + "epoch": 0.09260260592000837, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 10618 + }, + { + "epoch": 0.09261132720517695, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 10619 + }, + { + "epoch": 0.09262004849034554, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 10620 + }, + { + "epoch": 0.09262876977551412, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 10621 + }, + { + "epoch": 0.0926374910606827, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 10622 + }, + { + "epoch": 0.09264621234585128, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 10623 + }, + { + "epoch": 0.09265493363101987, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 10624 + }, + { + "epoch": 0.09266365491618844, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 10625 + }, + { + "epoch": 0.09267237620135703, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 10626 + }, + { + "epoch": 0.09268109748652562, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 10627 + }, + { + "epoch": 0.0926898187716942, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 10628 + }, + { + "epoch": 0.09269854005686277, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 10629 + }, + { + "epoch": 0.09270726134203136, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 10630 + }, + { + "epoch": 0.09271598262719995, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 10631 + }, + { + "epoch": 0.09272470391236853, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 10632 + }, + { + "epoch": 0.0927334251975371, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 10633 + }, + { + "epoch": 0.09274214648270569, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 10634 + }, + { + "epoch": 0.09275086776787428, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 10635 + }, + { + "epoch": 0.09275958905304285, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 10636 + }, + { + "epoch": 0.09276831033821144, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 10637 + }, + { + "epoch": 0.09277703162338002, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 10638 + }, + { + "epoch": 0.09278575290854861, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 10639 + }, + { + "epoch": 0.09279447419371718, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 10640 + }, + { + "epoch": 0.09280319547888577, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 10641 + }, + { + "epoch": 0.09281191676405436, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 10642 + }, + { + "epoch": 0.09282063804922293, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 10643 + }, + { + "epoch": 0.09282935933439151, + "grad_norm": 0.345703125, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 10644 + }, + { + "epoch": 0.0928380806195601, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 10645 + }, + { + "epoch": 0.09284680190472869, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 10646 + }, + { + "epoch": 0.09285552318989726, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 10647 + }, + { + "epoch": 0.09286424447506585, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 10648 + }, + { + "epoch": 0.09287296576023443, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 10649 + }, + { + "epoch": 0.092881687045403, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 10650 + }, + { + "epoch": 0.09289040833057159, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 10651 + }, + { + "epoch": 0.09289912961574018, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 10652 + }, + { + "epoch": 0.09290785090090876, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 10653 + }, + { + "epoch": 0.09291657218607734, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 10654 + }, + { + "epoch": 0.09292529347124592, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 10655 + }, + { + "epoch": 0.09293401475641451, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 10656 + }, + { + "epoch": 0.09294273604158308, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 10657 + }, + { + "epoch": 0.09295145732675167, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 10658 + }, + { + "epoch": 0.09296017861192025, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 10659 + }, + { + "epoch": 0.09296889989708884, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 10660 + }, + { + "epoch": 0.09297762118225741, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 10661 + }, + { + "epoch": 0.092986342467426, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 10662 + }, + { + "epoch": 0.09299506375259459, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 10663 + }, + { + "epoch": 0.09300378503776316, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 10664 + }, + { + "epoch": 0.09301250632293175, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 10665 + }, + { + "epoch": 0.09302122760810033, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 10666 + }, + { + "epoch": 0.09302994889326892, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 10667 + }, + { + "epoch": 0.09303867017843749, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 10668 + }, + { + "epoch": 0.09304739146360608, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 10669 + }, + { + "epoch": 0.09305611274877466, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 10670 + }, + { + "epoch": 0.09306483403394324, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 10671 + }, + { + "epoch": 0.09307355531911182, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 10672 + }, + { + "epoch": 0.09308227660428041, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 10673 + }, + { + "epoch": 0.093090997889449, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 10674 + }, + { + "epoch": 0.09309971917461757, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 10675 + }, + { + "epoch": 0.09310844045978615, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 10676 + }, + { + "epoch": 0.09311716174495474, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 10677 + }, + { + "epoch": 0.09312588303012331, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 10678 + }, + { + "epoch": 0.0931346043152919, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 10679 + }, + { + "epoch": 0.09314332560046049, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 10680 + }, + { + "epoch": 0.09315204688562907, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 10681 + }, + { + "epoch": 0.09316076817079764, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 10682 + }, + { + "epoch": 0.09316948945596623, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 10683 + }, + { + "epoch": 0.09317821074113482, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 10684 + }, + { + "epoch": 0.09318693202630339, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 10685 + }, + { + "epoch": 0.09319565331147198, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 10686 + }, + { + "epoch": 0.09320437459664056, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 10687 + }, + { + "epoch": 0.09321309588180915, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 10688 + }, + { + "epoch": 0.09322181716697772, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 10689 + }, + { + "epoch": 0.09323053845214631, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 10690 + }, + { + "epoch": 0.0932392597373149, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 10691 + }, + { + "epoch": 0.09324798102248347, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 10692 + }, + { + "epoch": 0.09325670230765205, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 10693 + }, + { + "epoch": 0.09326542359282064, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 10694 + }, + { + "epoch": 0.09327414487798923, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 10695 + }, + { + "epoch": 0.0932828661631578, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 10696 + }, + { + "epoch": 0.09329158744832639, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 10697 + }, + { + "epoch": 0.09330030873349497, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 10698 + }, + { + "epoch": 0.09330903001866354, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 10699 + }, + { + "epoch": 0.09331775130383213, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 10700 + }, + { + "epoch": 0.09332647258900072, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 10701 + }, + { + "epoch": 0.0933351938741693, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 10702 + }, + { + "epoch": 0.09334391515933788, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 10703 + }, + { + "epoch": 0.09335263644450646, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 10704 + }, + { + "epoch": 0.09336135772967505, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 10705 + }, + { + "epoch": 0.09337007901484362, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 10706 + }, + { + "epoch": 0.09337880030001221, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 10707 + }, + { + "epoch": 0.0933875215851808, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 10708 + }, + { + "epoch": 0.09339624287034938, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 10709 + }, + { + "epoch": 0.09340496415551795, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 10710 + }, + { + "epoch": 0.09341368544068654, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 10711 + }, + { + "epoch": 0.09342240672585513, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 10712 + }, + { + "epoch": 0.0934311280110237, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 10713 + }, + { + "epoch": 0.09343984929619228, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 10714 + }, + { + "epoch": 0.09344857058136087, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 10715 + }, + { + "epoch": 0.09345729186652946, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 10716 + }, + { + "epoch": 0.09346601315169803, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 10717 + }, + { + "epoch": 0.09347473443686662, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 10718 + }, + { + "epoch": 0.0934834557220352, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 10719 + }, + { + "epoch": 0.09349217700720377, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 10720 + }, + { + "epoch": 0.09350089829237236, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 10721 + }, + { + "epoch": 0.09350961957754095, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 10722 + }, + { + "epoch": 0.09351834086270953, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 10723 + }, + { + "epoch": 0.0935270621478781, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 10724 + }, + { + "epoch": 0.0935357834330467, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 10725 + }, + { + "epoch": 0.09354450471821528, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 10726 + }, + { + "epoch": 0.09355322600338385, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 10727 + }, + { + "epoch": 0.09356194728855244, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 10728 + }, + { + "epoch": 0.09357066857372102, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 10729 + }, + { + "epoch": 0.09357938985888961, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 10730 + }, + { + "epoch": 0.09358811114405818, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 10731 + }, + { + "epoch": 0.09359683242922677, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 10732 + }, + { + "epoch": 0.09360555371439536, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 10733 + }, + { + "epoch": 0.09361427499956393, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 10734 + }, + { + "epoch": 0.09362299628473252, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 10735 + }, + { + "epoch": 0.0936317175699011, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 10736 + }, + { + "epoch": 0.09364043885506969, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 10737 + }, + { + "epoch": 0.09364916014023826, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 10738 + }, + { + "epoch": 0.09365788142540685, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 10739 + }, + { + "epoch": 0.09366660271057543, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 10740 + }, + { + "epoch": 0.093675323995744, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 10741 + }, + { + "epoch": 0.09368404528091259, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 10742 + }, + { + "epoch": 0.09369276656608118, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 10743 + }, + { + "epoch": 0.09370148785124977, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 10744 + }, + { + "epoch": 0.09371020913641834, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 10745 + }, + { + "epoch": 0.09371893042158692, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 10746 + }, + { + "epoch": 0.09372765170675551, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 10747 + }, + { + "epoch": 0.0937363729919241, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 10748 + }, + { + "epoch": 0.09374509427709267, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 10749 + }, + { + "epoch": 0.09375381556226126, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 10750 + }, + { + "epoch": 0.09376253684742984, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 10751 + }, + { + "epoch": 0.09377125813259841, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 10752 + }, + { + "epoch": 0.093779979417767, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 10753 + }, + { + "epoch": 0.09378870070293559, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 10754 + }, + { + "epoch": 0.09379742198810417, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 10755 + }, + { + "epoch": 0.09380614327327275, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 10756 + }, + { + "epoch": 0.09381486455844133, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 10757 + }, + { + "epoch": 0.09382358584360992, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 10758 + }, + { + "epoch": 0.09383230712877849, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 10759 + }, + { + "epoch": 0.09384102841394708, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 10760 + }, + { + "epoch": 0.09384974969911566, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 10761 + }, + { + "epoch": 0.09385847098428425, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 10762 + }, + { + "epoch": 0.09386719226945282, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 10763 + }, + { + "epoch": 0.09387591355462141, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 10764 + }, + { + "epoch": 0.09388463483979, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 10765 + }, + { + "epoch": 0.09389335612495857, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 10766 + }, + { + "epoch": 0.09390207741012715, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 10767 + }, + { + "epoch": 0.09391079869529574, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 10768 + }, + { + "epoch": 0.09391951998046433, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 10769 + }, + { + "epoch": 0.0939282412656329, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 10770 + }, + { + "epoch": 0.09393696255080149, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 10771 + }, + { + "epoch": 0.09394568383597007, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 10772 + }, + { + "epoch": 0.09395440512113865, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 10773 + }, + { + "epoch": 0.09396312640630723, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 10774 + }, + { + "epoch": 0.09397184769147582, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 10775 + }, + { + "epoch": 0.0939805689766444, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 10776 + }, + { + "epoch": 0.09398929026181298, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 10777 + }, + { + "epoch": 0.09399801154698156, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 10778 + }, + { + "epoch": 0.09400673283215015, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 10779 + }, + { + "epoch": 0.09401545411731872, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 10780 + }, + { + "epoch": 0.09402417540248731, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 10781 + }, + { + "epoch": 0.0940328966876559, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 10782 + }, + { + "epoch": 0.09404161797282448, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 10783 + }, + { + "epoch": 0.09405033925799305, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 10784 + }, + { + "epoch": 0.09405906054316164, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 10785 + }, + { + "epoch": 0.09406778182833023, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 10786 + }, + { + "epoch": 0.0940765031134988, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 10787 + }, + { + "epoch": 0.09408522439866739, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 10788 + }, + { + "epoch": 0.09409394568383597, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 10789 + }, + { + "epoch": 0.09410266696900456, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 10790 + }, + { + "epoch": 0.09411138825417313, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 10791 + }, + { + "epoch": 0.09412010953934172, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 10792 + }, + { + "epoch": 0.0941288308245103, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 10793 + }, + { + "epoch": 0.09413755210967888, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 10794 + }, + { + "epoch": 0.09414627339484746, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 10795 + }, + { + "epoch": 0.09415499468001605, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 10796 + }, + { + "epoch": 0.09416371596518464, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 10797 + }, + { + "epoch": 0.09417243725035321, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 10798 + }, + { + "epoch": 0.0941811585355218, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 10799 + }, + { + "epoch": 0.09418987982069038, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 10800 + }, + { + "epoch": 0.09419860110585895, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 10801 + }, + { + "epoch": 0.09420732239102754, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 10802 + }, + { + "epoch": 0.09421604367619613, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 10803 + }, + { + "epoch": 0.09422476496136471, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 10804 + }, + { + "epoch": 0.09423348624653329, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 10805 + }, + { + "epoch": 0.09424220753170187, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 10806 + }, + { + "epoch": 0.09425092881687046, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 10807 + }, + { + "epoch": 0.09425965010203903, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 10808 + }, + { + "epoch": 0.09426837138720762, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 10809 + }, + { + "epoch": 0.0942770926723762, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 10810 + }, + { + "epoch": 0.09428581395754479, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 10811 + }, + { + "epoch": 0.09429453524271336, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 10812 + }, + { + "epoch": 0.09430325652788195, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 10813 + }, + { + "epoch": 0.09431197781305053, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 10814 + }, + { + "epoch": 0.09432069909821911, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 10815 + }, + { + "epoch": 0.0943294203833877, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 10816 + }, + { + "epoch": 0.09433814166855628, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 10817 + }, + { + "epoch": 0.09434686295372487, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 10818 + }, + { + "epoch": 0.09435558423889344, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 10819 + }, + { + "epoch": 0.09436430552406203, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 10820 + }, + { + "epoch": 0.09437302680923061, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 10821 + }, + { + "epoch": 0.09438174809439918, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 10822 + }, + { + "epoch": 0.09439046937956777, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 10823 + }, + { + "epoch": 0.09439919066473636, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 10824 + }, + { + "epoch": 0.09440791194990494, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 10825 + }, + { + "epoch": 0.09441663323507352, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 10826 + }, + { + "epoch": 0.0944253545202421, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 10827 + }, + { + "epoch": 0.09443407580541069, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 10828 + }, + { + "epoch": 0.09444279709057926, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 10829 + }, + { + "epoch": 0.09445151837574785, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 10830 + }, + { + "epoch": 0.09446023966091643, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 10831 + }, + { + "epoch": 0.09446896094608502, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 10832 + }, + { + "epoch": 0.09447768223125359, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 10833 + }, + { + "epoch": 0.09448640351642218, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 10834 + }, + { + "epoch": 0.09449512480159077, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 10835 + }, + { + "epoch": 0.09450384608675934, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 10836 + }, + { + "epoch": 0.09451256737192792, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 10837 + }, + { + "epoch": 0.09452128865709651, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 10838 + }, + { + "epoch": 0.0945300099422651, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 10839 + }, + { + "epoch": 0.09453873122743367, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 10840 + }, + { + "epoch": 0.09454745251260226, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 10841 + }, + { + "epoch": 0.09455617379777084, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 10842 + }, + { + "epoch": 0.09456489508293942, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 10843 + }, + { + "epoch": 0.094573616368108, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 10844 + }, + { + "epoch": 0.09458233765327659, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 10845 + }, + { + "epoch": 0.09459105893844517, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 10846 + }, + { + "epoch": 0.09459978022361375, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 10847 + }, + { + "epoch": 0.09460850150878233, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 10848 + }, + { + "epoch": 0.09461722279395092, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 10849 + }, + { + "epoch": 0.09462594407911949, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 10850 + }, + { + "epoch": 0.09463466536428808, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 10851 + }, + { + "epoch": 0.09464338664945667, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 10852 + }, + { + "epoch": 0.09465210793462525, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 10853 + }, + { + "epoch": 0.09466082921979382, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 10854 + }, + { + "epoch": 0.09466955050496241, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 10855 + }, + { + "epoch": 0.094678271790131, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 10856 + }, + { + "epoch": 0.09468699307529957, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 10857 + }, + { + "epoch": 0.09469571436046816, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 10858 + }, + { + "epoch": 0.09470443564563674, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 10859 + }, + { + "epoch": 0.09471315693080533, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 10860 + }, + { + "epoch": 0.0947218782159739, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 10861 + }, + { + "epoch": 0.09473059950114249, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 10862 + }, + { + "epoch": 0.09473932078631107, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 10863 + }, + { + "epoch": 0.09474804207147966, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 10864 + }, + { + "epoch": 0.09475676335664823, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 10865 + }, + { + "epoch": 0.09476548464181682, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 10866 + }, + { + "epoch": 0.0947742059269854, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 10867 + }, + { + "epoch": 0.09478292721215398, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 10868 + }, + { + "epoch": 0.09479164849732256, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 10869 + }, + { + "epoch": 0.09480036978249115, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 10870 + }, + { + "epoch": 0.09480909106765974, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 10871 + }, + { + "epoch": 0.09481781235282831, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 10872 + }, + { + "epoch": 0.0948265336379969, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 10873 + }, + { + "epoch": 0.09483525492316548, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 10874 + }, + { + "epoch": 0.09484397620833405, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 10875 + }, + { + "epoch": 0.09485269749350264, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 10876 + }, + { + "epoch": 0.09486141877867123, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 10877 + }, + { + "epoch": 0.09487014006383981, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 10878 + }, + { + "epoch": 0.09487886134900839, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 10879 + }, + { + "epoch": 0.09488758263417697, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 10880 + }, + { + "epoch": 0.09489630391934556, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 10881 + }, + { + "epoch": 0.09490502520451413, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 10882 + }, + { + "epoch": 0.09491374648968272, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 10883 + }, + { + "epoch": 0.0949224677748513, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 10884 + }, + { + "epoch": 0.09493118906001989, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 10885 + }, + { + "epoch": 0.09493991034518846, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 10886 + }, + { + "epoch": 0.09494863163035705, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 10887 + }, + { + "epoch": 0.09495735291552564, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 10888 + }, + { + "epoch": 0.09496607420069421, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 10889 + }, + { + "epoch": 0.0949747954858628, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 10890 + }, + { + "epoch": 0.09498351677103138, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 10891 + }, + { + "epoch": 0.09499223805619997, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 10892 + }, + { + "epoch": 0.09500095934136854, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 10893 + }, + { + "epoch": 0.09500968062653713, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 10894 + }, + { + "epoch": 0.09501840191170571, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 10895 + }, + { + "epoch": 0.09502712319687429, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 10896 + }, + { + "epoch": 0.09503584448204287, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 10897 + }, + { + "epoch": 0.09504456576721146, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 10898 + }, + { + "epoch": 0.09505328705238004, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 10899 + }, + { + "epoch": 0.09506200833754862, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 10900 + }, + { + "epoch": 0.0950707296227172, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 10901 + }, + { + "epoch": 0.09507945090788579, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 10902 + }, + { + "epoch": 0.09508817219305436, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 10903 + }, + { + "epoch": 0.09509689347822295, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 10904 + }, + { + "epoch": 0.09510561476339154, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 10905 + }, + { + "epoch": 0.09511433604856012, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 10906 + }, + { + "epoch": 0.0951230573337287, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 10907 + }, + { + "epoch": 0.09513177861889728, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 10908 + }, + { + "epoch": 0.09514049990406587, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 10909 + }, + { + "epoch": 0.09514922118923444, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 10910 + }, + { + "epoch": 0.09515794247440303, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 10911 + }, + { + "epoch": 0.09516666375957161, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 10912 + }, + { + "epoch": 0.0951753850447402, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 10913 + }, + { + "epoch": 0.09518410632990877, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 10914 + }, + { + "epoch": 0.09519282761507736, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 10915 + }, + { + "epoch": 0.09520154890024594, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 10916 + }, + { + "epoch": 0.09521027018541452, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 10917 + }, + { + "epoch": 0.0952189914705831, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 10918 + }, + { + "epoch": 0.09522771275575169, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 10919 + }, + { + "epoch": 0.09523643404092028, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 10920 + }, + { + "epoch": 0.09524515532608885, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 10921 + }, + { + "epoch": 0.09525387661125743, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 10922 + }, + { + "epoch": 0.09526259789642602, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 10923 + }, + { + "epoch": 0.0952713191815946, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 10924 + }, + { + "epoch": 0.09528004046676318, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 10925 + }, + { + "epoch": 0.09528876175193177, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 10926 + }, + { + "epoch": 0.09529748303710035, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 10927 + }, + { + "epoch": 0.09530620432226893, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 10928 + }, + { + "epoch": 0.09531492560743751, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 10929 + }, + { + "epoch": 0.0953236468926061, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 10930 + }, + { + "epoch": 0.09533236817777467, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 10931 + }, + { + "epoch": 0.09534108946294326, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 10932 + }, + { + "epoch": 0.09534981074811184, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 10933 + }, + { + "epoch": 0.09535853203328043, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 10934 + }, + { + "epoch": 0.095367253318449, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 10935 + }, + { + "epoch": 0.09537597460361759, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 10936 + }, + { + "epoch": 0.09538469588878618, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 10937 + }, + { + "epoch": 0.09539341717395475, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 10938 + }, + { + "epoch": 0.09540213845912333, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 10939 + }, + { + "epoch": 0.09541085974429192, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 10940 + }, + { + "epoch": 0.0954195810294605, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 10941 + }, + { + "epoch": 0.09542830231462908, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 10942 + }, + { + "epoch": 0.09543702359979767, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 10943 + }, + { + "epoch": 0.09544574488496625, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 10944 + }, + { + "epoch": 0.09545446617013482, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 10945 + }, + { + "epoch": 0.09546318745530341, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 10946 + }, + { + "epoch": 0.095471908740472, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 10947 + }, + { + "epoch": 0.09548063002564058, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 10948 + }, + { + "epoch": 0.09548935131080916, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 10949 + }, + { + "epoch": 0.09549807259597774, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 10950 + }, + { + "epoch": 0.09550679388114633, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 10951 + }, + { + "epoch": 0.0955155151663149, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 10952 + }, + { + "epoch": 0.09552423645148349, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 10953 + }, + { + "epoch": 0.09553295773665207, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 10954 + }, + { + "epoch": 0.09554167902182066, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 10955 + }, + { + "epoch": 0.09555040030698923, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 10956 + }, + { + "epoch": 0.09555912159215782, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 10957 + }, + { + "epoch": 0.0955678428773264, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 10958 + }, + { + "epoch": 0.09557656416249498, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 10959 + }, + { + "epoch": 0.09558528544766356, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 10960 + }, + { + "epoch": 0.09559400673283215, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 10961 + }, + { + "epoch": 0.09560272801800074, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 10962 + }, + { + "epoch": 0.09561144930316931, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 10963 + }, + { + "epoch": 0.0956201705883379, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 10964 + }, + { + "epoch": 0.09562889187350648, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 10965 + }, + { + "epoch": 0.09563761315867506, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 10966 + }, + { + "epoch": 0.09564633444384364, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 10967 + }, + { + "epoch": 0.09565505572901223, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 10968 + }, + { + "epoch": 0.09566377701418081, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 10969 + }, + { + "epoch": 0.09567249829934939, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 10970 + }, + { + "epoch": 0.09568121958451797, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 10971 + }, + { + "epoch": 0.09568994086968656, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 10972 + }, + { + "epoch": 0.09569866215485515, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 10973 + }, + { + "epoch": 0.09570738344002372, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 10974 + }, + { + "epoch": 0.0957161047251923, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 10975 + }, + { + "epoch": 0.09572482601036089, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 10976 + }, + { + "epoch": 0.09573354729552946, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 10977 + }, + { + "epoch": 0.09574226858069805, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 10978 + }, + { + "epoch": 0.09575098986586664, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 10979 + }, + { + "epoch": 0.09575971115103522, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 10980 + }, + { + "epoch": 0.0957684324362038, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 10981 + }, + { + "epoch": 0.09577715372137238, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 10982 + }, + { + "epoch": 0.09578587500654097, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 10983 + }, + { + "epoch": 0.09579459629170954, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 10984 + }, + { + "epoch": 0.09580331757687813, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 10985 + }, + { + "epoch": 0.09581203886204671, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 10986 + }, + { + "epoch": 0.0958207601472153, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 10987 + }, + { + "epoch": 0.09582948143238387, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 10988 + }, + { + "epoch": 0.09583820271755246, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 10989 + }, + { + "epoch": 0.09584692400272105, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 10990 + }, + { + "epoch": 0.09585564528788962, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 10991 + }, + { + "epoch": 0.0958643665730582, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 10992 + }, + { + "epoch": 0.09587308785822679, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 10993 + }, + { + "epoch": 0.09588180914339538, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 10994 + }, + { + "epoch": 0.09589053042856395, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 10995 + }, + { + "epoch": 0.09589925171373254, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 10996 + }, + { + "epoch": 0.09590797299890112, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 10997 + }, + { + "epoch": 0.0959166942840697, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 10998 + }, + { + "epoch": 0.09592541556923828, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 10999 + }, + { + "epoch": 0.09593413685440687, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 11000 + }, + { + "epoch": 0.09594285813957545, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 11001 + }, + { + "epoch": 0.09595157942474403, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 11002 + }, + { + "epoch": 0.09596030070991261, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 11003 + }, + { + "epoch": 0.0959690219950812, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 11004 + }, + { + "epoch": 0.09597774328024977, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 11005 + }, + { + "epoch": 0.09598646456541836, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 11006 + }, + { + "epoch": 0.09599518585058694, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 11007 + }, + { + "epoch": 0.09600390713575553, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 11008 + }, + { + "epoch": 0.0960126284209241, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 11009 + }, + { + "epoch": 0.09602134970609269, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 11010 + }, + { + "epoch": 0.09603007099126128, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 11011 + }, + { + "epoch": 0.09603879227642985, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 11012 + }, + { + "epoch": 0.09604751356159844, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 11013 + }, + { + "epoch": 0.09605623484676702, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 11014 + }, + { + "epoch": 0.09606495613193561, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 11015 + }, + { + "epoch": 0.09607367741710418, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 11016 + }, + { + "epoch": 0.09608239870227277, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 11017 + }, + { + "epoch": 0.09609111998744135, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 11018 + }, + { + "epoch": 0.09609984127260993, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 11019 + }, + { + "epoch": 0.09610856255777851, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 11020 + }, + { + "epoch": 0.0961172838429471, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 11021 + }, + { + "epoch": 0.09612600512811569, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 11022 + }, + { + "epoch": 0.09613472641328426, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 11023 + }, + { + "epoch": 0.09614344769845284, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 11024 + }, + { + "epoch": 0.09615216898362143, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 11025 + }, + { + "epoch": 0.09616089026879, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 11026 + }, + { + "epoch": 0.09616961155395859, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 11027 + }, + { + "epoch": 0.09617833283912718, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 11028 + }, + { + "epoch": 0.09618705412429576, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 11029 + }, + { + "epoch": 0.09619577540946433, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 11030 + }, + { + "epoch": 0.09620449669463292, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 11031 + }, + { + "epoch": 0.09621321797980151, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 11032 + }, + { + "epoch": 0.09622193926497008, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 11033 + }, + { + "epoch": 0.09623066055013867, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 11034 + }, + { + "epoch": 0.09623938183530725, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 11035 + }, + { + "epoch": 0.09624810312047584, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 11036 + }, + { + "epoch": 0.09625682440564441, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 11037 + }, + { + "epoch": 0.096265545690813, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 11038 + }, + { + "epoch": 0.09627426697598158, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 11039 + }, + { + "epoch": 0.09628298826115016, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 11040 + }, + { + "epoch": 0.09629170954631874, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 11041 + }, + { + "epoch": 0.09630043083148733, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 11042 + }, + { + "epoch": 0.09630915211665592, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 11043 + }, + { + "epoch": 0.09631787340182449, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 11044 + }, + { + "epoch": 0.09632659468699308, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 11045 + }, + { + "epoch": 0.09633531597216166, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 11046 + }, + { + "epoch": 0.09634403725733023, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 11047 + }, + { + "epoch": 0.09635275854249882, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 11048 + }, + { + "epoch": 0.0963614798276674, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 11049 + }, + { + "epoch": 0.096370201112836, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 11050 + }, + { + "epoch": 0.09637892239800457, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 11051 + }, + { + "epoch": 0.09638764368317315, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 11052 + }, + { + "epoch": 0.09639636496834174, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 11053 + }, + { + "epoch": 0.09640508625351031, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 11054 + }, + { + "epoch": 0.0964138075386789, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 11055 + }, + { + "epoch": 0.09642252882384748, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 11056 + }, + { + "epoch": 0.09643125010901607, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 11057 + }, + { + "epoch": 0.09643997139418464, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 11058 + }, + { + "epoch": 0.09644869267935323, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 11059 + }, + { + "epoch": 0.09645741396452182, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 11060 + }, + { + "epoch": 0.09646613524969039, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 11061 + }, + { + "epoch": 0.09647485653485897, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 11062 + }, + { + "epoch": 0.09648357782002756, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 11063 + }, + { + "epoch": 0.09649229910519615, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 11064 + }, + { + "epoch": 0.09650102039036472, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 11065 + }, + { + "epoch": 0.0965097416755333, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 11066 + }, + { + "epoch": 0.09651846296070189, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 11067 + }, + { + "epoch": 0.09652718424587046, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 11068 + }, + { + "epoch": 0.09653590553103905, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 11069 + }, + { + "epoch": 0.09654462681620764, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 11070 + }, + { + "epoch": 0.09655334810137622, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 11071 + }, + { + "epoch": 0.0965620693865448, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 11072 + }, + { + "epoch": 0.09657079067171338, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 11073 + }, + { + "epoch": 0.09657951195688197, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 11074 + }, + { + "epoch": 0.09658823324205054, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 11075 + }, + { + "epoch": 0.09659695452721913, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 11076 + }, + { + "epoch": 0.09660567581238771, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 11077 + }, + { + "epoch": 0.0966143970975563, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 11078 + }, + { + "epoch": 0.09662311838272487, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 11079 + }, + { + "epoch": 0.09663183966789346, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 11080 + }, + { + "epoch": 0.09664056095306205, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 11081 + }, + { + "epoch": 0.09664928223823062, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 11082 + }, + { + "epoch": 0.0966580035233992, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 11083 + }, + { + "epoch": 0.09666672480856779, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 11084 + }, + { + "epoch": 0.09667544609373638, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 11085 + }, + { + "epoch": 0.09668416737890495, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 11086 + }, + { + "epoch": 0.09669288866407354, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 11087 + }, + { + "epoch": 0.09670160994924212, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 11088 + }, + { + "epoch": 0.09671033123441071, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 11089 + }, + { + "epoch": 0.09671905251957928, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 11090 + }, + { + "epoch": 0.09672777380474787, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 11091 + }, + { + "epoch": 0.09673649508991645, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 11092 + }, + { + "epoch": 0.09674521637508503, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 11093 + }, + { + "epoch": 0.09675393766025361, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 11094 + }, + { + "epoch": 0.0967626589454222, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 11095 + }, + { + "epoch": 0.09677138023059079, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 11096 + }, + { + "epoch": 0.09678010151575936, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 11097 + }, + { + "epoch": 0.09678882280092795, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 11098 + }, + { + "epoch": 0.09679754408609653, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 11099 + }, + { + "epoch": 0.0968062653712651, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 11100 + }, + { + "epoch": 0.09681498665643369, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 11101 + }, + { + "epoch": 0.09682370794160228, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 11102 + }, + { + "epoch": 0.09683242922677086, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 11103 + }, + { + "epoch": 0.09684115051193944, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 11104 + }, + { + "epoch": 0.09684987179710802, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 11105 + }, + { + "epoch": 0.09685859308227661, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 11106 + }, + { + "epoch": 0.09686731436744518, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 11107 + }, + { + "epoch": 0.09687603565261377, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 11108 + }, + { + "epoch": 0.09688475693778235, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 11109 + }, + { + "epoch": 0.09689347822295094, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 11110 + }, + { + "epoch": 0.09690219950811951, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 11111 + }, + { + "epoch": 0.0969109207932881, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 11112 + }, + { + "epoch": 0.09691964207845669, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 11113 + }, + { + "epoch": 0.09692836336362526, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 11114 + }, + { + "epoch": 0.09693708464879384, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 11115 + }, + { + "epoch": 0.09694580593396243, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 11116 + }, + { + "epoch": 0.09695452721913102, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 11117 + }, + { + "epoch": 0.09696324850429959, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 11118 + }, + { + "epoch": 0.09697196978946818, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 11119 + }, + { + "epoch": 0.09698069107463676, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 11120 + }, + { + "epoch": 0.09698941235980534, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 11121 + }, + { + "epoch": 0.09699813364497392, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 11122 + }, + { + "epoch": 0.09700685493014251, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 11123 + }, + { + "epoch": 0.0970155762153111, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 11124 + }, + { + "epoch": 0.09702429750047967, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 11125 + }, + { + "epoch": 0.09703301878564825, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 11126 + }, + { + "epoch": 0.09704174007081684, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 11127 + }, + { + "epoch": 0.09705046135598541, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 11128 + }, + { + "epoch": 0.097059182641154, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 11129 + }, + { + "epoch": 0.09706790392632259, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 11130 + }, + { + "epoch": 0.09707662521149117, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 11131 + }, + { + "epoch": 0.09708534649665974, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 11132 + }, + { + "epoch": 0.09709406778182833, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 11133 + }, + { + "epoch": 0.09710278906699692, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 11134 + }, + { + "epoch": 0.09711151035216549, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 11135 + }, + { + "epoch": 0.09712023163733408, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 11136 + }, + { + "epoch": 0.09712895292250266, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 11137 + }, + { + "epoch": 0.09713767420767125, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 11138 + }, + { + "epoch": 0.09714639549283982, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 11139 + }, + { + "epoch": 0.09715511677800841, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 11140 + }, + { + "epoch": 0.097163838063177, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 11141 + }, + { + "epoch": 0.09717255934834557, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 11142 + }, + { + "epoch": 0.09718128063351415, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 11143 + }, + { + "epoch": 0.09719000191868274, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 11144 + }, + { + "epoch": 0.09719872320385133, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 11145 + }, + { + "epoch": 0.0972074444890199, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 11146 + }, + { + "epoch": 0.09721616577418848, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 11147 + }, + { + "epoch": 0.09722488705935707, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 11148 + }, + { + "epoch": 0.09723360834452564, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 11149 + }, + { + "epoch": 0.09724232962969423, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 11150 + }, + { + "epoch": 0.09725105091486282, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 11151 + }, + { + "epoch": 0.0972597722000314, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 11152 + }, + { + "epoch": 0.09726849348519997, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 11153 + }, + { + "epoch": 0.09727721477036856, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 11154 + }, + { + "epoch": 0.09728593605553715, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 11155 + }, + { + "epoch": 0.09729465734070572, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 11156 + }, + { + "epoch": 0.0973033786258743, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 11157 + }, + { + "epoch": 0.09731209991104289, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 11158 + }, + { + "epoch": 0.09732082119621148, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 11159 + }, + { + "epoch": 0.09732954248138005, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 11160 + }, + { + "epoch": 0.09733826376654864, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 11161 + }, + { + "epoch": 0.09734698505171722, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 11162 + }, + { + "epoch": 0.0973557063368858, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 11163 + }, + { + "epoch": 0.09736442762205438, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 11164 + }, + { + "epoch": 0.09737314890722297, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 11165 + }, + { + "epoch": 0.09738187019239156, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 11166 + }, + { + "epoch": 0.09739059147756013, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 11167 + }, + { + "epoch": 0.09739931276272872, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 11168 + }, + { + "epoch": 0.0974080340478973, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 11169 + }, + { + "epoch": 0.09741675533306587, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 11170 + }, + { + "epoch": 0.09742547661823446, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 11171 + }, + { + "epoch": 0.09743419790340305, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 11172 + }, + { + "epoch": 0.09744291918857163, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 11173 + }, + { + "epoch": 0.0974516404737402, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 11174 + }, + { + "epoch": 0.09746036175890879, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 11175 + }, + { + "epoch": 0.09746908304407738, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 11176 + }, + { + "epoch": 0.09747780432924595, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 11177 + }, + { + "epoch": 0.09748652561441454, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 11178 + }, + { + "epoch": 0.09749524689958312, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 11179 + }, + { + "epoch": 0.09750396818475171, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 11180 + }, + { + "epoch": 0.09751268946992028, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 11181 + }, + { + "epoch": 0.09752141075508887, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 11182 + }, + { + "epoch": 0.09753013204025746, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 11183 + }, + { + "epoch": 0.09753885332542603, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 11184 + }, + { + "epoch": 0.09754757461059461, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 11185 + }, + { + "epoch": 0.0975562958957632, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 11186 + }, + { + "epoch": 0.09756501718093179, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 11187 + }, + { + "epoch": 0.09757373846610036, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 11188 + }, + { + "epoch": 0.09758245975126895, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 11189 + }, + { + "epoch": 0.09759118103643753, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 11190 + }, + { + "epoch": 0.0975999023216061, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 11191 + }, + { + "epoch": 0.09760862360677469, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 11192 + }, + { + "epoch": 0.09761734489194328, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 11193 + }, + { + "epoch": 0.09762606617711186, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 11194 + }, + { + "epoch": 0.09763478746228044, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 11195 + }, + { + "epoch": 0.09764350874744902, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 11196 + }, + { + "epoch": 0.09765223003261761, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 11197 + }, + { + "epoch": 0.09766095131778618, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 11198 + }, + { + "epoch": 0.09766967260295477, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 11199 + }, + { + "epoch": 0.09767839388812335, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 11200 + }, + { + "epoch": 0.09768711517329194, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 11201 + }, + { + "epoch": 0.09769583645846051, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 11202 + }, + { + "epoch": 0.0977045577436291, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 11203 + }, + { + "epoch": 0.09771327902879769, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 11204 + }, + { + "epoch": 0.09772200031396627, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 11205 + }, + { + "epoch": 0.09773072159913485, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 11206 + }, + { + "epoch": 0.09773944288430343, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 11207 + }, + { + "epoch": 0.09774816416947202, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 11208 + }, + { + "epoch": 0.09775688545464059, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 11209 + }, + { + "epoch": 0.09776560673980918, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 11210 + }, + { + "epoch": 0.09777432802497776, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 11211 + }, + { + "epoch": 0.09778304931014635, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 11212 + }, + { + "epoch": 0.09779177059531492, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 11213 + }, + { + "epoch": 0.09780049188048351, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 11214 + }, + { + "epoch": 0.0978092131656521, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 11215 + }, + { + "epoch": 0.09781793445082067, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 11216 + }, + { + "epoch": 0.09782665573598925, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 11217 + }, + { + "epoch": 0.09783537702115784, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 11218 + }, + { + "epoch": 0.09784409830632643, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 11219 + }, + { + "epoch": 0.097852819591495, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 11220 + }, + { + "epoch": 0.09786154087666359, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 11221 + }, + { + "epoch": 0.09787026216183217, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 11222 + }, + { + "epoch": 0.09787898344700074, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 11223 + }, + { + "epoch": 0.09788770473216933, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 11224 + }, + { + "epoch": 0.09789642601733792, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 11225 + }, + { + "epoch": 0.0979051473025065, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 11226 + }, + { + "epoch": 0.09791386858767508, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 11227 + }, + { + "epoch": 0.09792258987284366, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 11228 + }, + { + "epoch": 0.09793131115801225, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 11229 + }, + { + "epoch": 0.09794003244318082, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 11230 + }, + { + "epoch": 0.09794875372834941, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 11231 + }, + { + "epoch": 0.097957475013518, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 11232 + }, + { + "epoch": 0.09796619629868658, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 11233 + }, + { + "epoch": 0.09797491758385515, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 11234 + }, + { + "epoch": 0.09798363886902374, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 11235 + }, + { + "epoch": 0.09799236015419233, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 11236 + }, + { + "epoch": 0.0980010814393609, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 11237 + }, + { + "epoch": 0.09800980272452949, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 11238 + }, + { + "epoch": 0.09801852400969807, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 11239 + }, + { + "epoch": 0.09802724529486666, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 11240 + }, + { + "epoch": 0.09803596658003523, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 11241 + }, + { + "epoch": 0.09804468786520382, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 11242 + }, + { + "epoch": 0.0980534091503724, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 11243 + }, + { + "epoch": 0.09806213043554098, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 11244 + }, + { + "epoch": 0.09807085172070956, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 11245 + }, + { + "epoch": 0.09807957300587815, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 11246 + }, + { + "epoch": 0.09808829429104673, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0853, + "step": 11247 + }, + { + "epoch": 0.09809701557621531, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 11248 + }, + { + "epoch": 0.0981057368613839, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 11249 + }, + { + "epoch": 0.09811445814655248, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 11250 + }, + { + "epoch": 0.09812317943172105, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 11251 + }, + { + "epoch": 0.09813190071688964, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 11252 + }, + { + "epoch": 0.09814062200205823, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 11253 + }, + { + "epoch": 0.09814934328722681, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 11254 + }, + { + "epoch": 0.09815806457239538, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 11255 + }, + { + "epoch": 0.09816678585756397, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 11256 + }, + { + "epoch": 0.09817550714273256, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 11257 + }, + { + "epoch": 0.09818422842790113, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 11258 + }, + { + "epoch": 0.09819294971306972, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 11259 + }, + { + "epoch": 0.0982016709982383, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 11260 + }, + { + "epoch": 0.09821039228340689, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 11261 + }, + { + "epoch": 0.09821911356857546, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 11262 + }, + { + "epoch": 0.09822783485374405, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 11263 + }, + { + "epoch": 0.09823655613891263, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 11264 + }, + { + "epoch": 0.0982452774240812, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 11265 + }, + { + "epoch": 0.09825399870924979, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 11266 + }, + { + "epoch": 0.09826271999441838, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 11267 + }, + { + "epoch": 0.09827144127958697, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 11268 + }, + { + "epoch": 0.09828016256475554, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 11269 + }, + { + "epoch": 0.09828888384992412, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 11270 + }, + { + "epoch": 0.09829760513509271, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 11271 + }, + { + "epoch": 0.09830632642026128, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 11272 + }, + { + "epoch": 0.09831504770542987, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 11273 + }, + { + "epoch": 0.09832376899059846, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 11274 + }, + { + "epoch": 0.09833249027576704, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 11275 + }, + { + "epoch": 0.09834121156093562, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 11276 + }, + { + "epoch": 0.0983499328461042, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 11277 + }, + { + "epoch": 0.09835865413127279, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 11278 + }, + { + "epoch": 0.09836737541644136, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 11279 + }, + { + "epoch": 0.09837609670160995, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 11280 + }, + { + "epoch": 0.09838481798677853, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 11281 + }, + { + "epoch": 0.09839353927194712, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 11282 + }, + { + "epoch": 0.09840226055711569, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 11283 + }, + { + "epoch": 0.09841098184228428, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 11284 + }, + { + "epoch": 0.09841970312745286, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 11285 + }, + { + "epoch": 0.09842842441262144, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 11286 + }, + { + "epoch": 0.09843714569779002, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 11287 + }, + { + "epoch": 0.09844586698295861, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 11288 + }, + { + "epoch": 0.0984545882681272, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 11289 + }, + { + "epoch": 0.09846330955329577, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 11290 + }, + { + "epoch": 0.09847203083846436, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 11291 + }, + { + "epoch": 0.09848075212363294, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 11292 + }, + { + "epoch": 0.09848947340880151, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 11293 + }, + { + "epoch": 0.0984981946939701, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 11294 + }, + { + "epoch": 0.09850691597913869, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 11295 + }, + { + "epoch": 0.09851563726430727, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 11296 + }, + { + "epoch": 0.09852435854947585, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 11297 + }, + { + "epoch": 0.09853307983464443, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 11298 + }, + { + "epoch": 0.09854180111981302, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 11299 + }, + { + "epoch": 0.09855052240498159, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 11300 + }, + { + "epoch": 0.09855924369015018, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 11301 + }, + { + "epoch": 0.09856796497531876, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 11302 + }, + { + "epoch": 0.09857668626048735, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 11303 + }, + { + "epoch": 0.09858540754565592, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 11304 + }, + { + "epoch": 0.09859412883082451, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 11305 + }, + { + "epoch": 0.0986028501159931, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 11306 + }, + { + "epoch": 0.09861157140116167, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 11307 + }, + { + "epoch": 0.09862029268633025, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 11308 + }, + { + "epoch": 0.09862901397149884, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 11309 + }, + { + "epoch": 0.09863773525666743, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 11310 + }, + { + "epoch": 0.098646456541836, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 11311 + }, + { + "epoch": 0.09865517782700459, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 11312 + }, + { + "epoch": 0.09866389911217317, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 11313 + }, + { + "epoch": 0.09867262039734175, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 11314 + }, + { + "epoch": 0.09868134168251033, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 11315 + }, + { + "epoch": 0.09869006296767892, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 11316 + }, + { + "epoch": 0.0986987842528475, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 11317 + }, + { + "epoch": 0.09870750553801608, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 11318 + }, + { + "epoch": 0.09871622682318466, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 11319 + }, + { + "epoch": 0.09872494810835325, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 11320 + }, + { + "epoch": 0.09873366939352184, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 11321 + }, + { + "epoch": 0.09874239067869041, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 11322 + }, + { + "epoch": 0.098751111963859, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 11323 + }, + { + "epoch": 0.09875983324902758, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 11324 + }, + { + "epoch": 0.09876855453419615, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 11325 + }, + { + "epoch": 0.09877727581936474, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 11326 + }, + { + "epoch": 0.09878599710453333, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 11327 + }, + { + "epoch": 0.09879471838970191, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 11328 + }, + { + "epoch": 0.09880343967487049, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 11329 + }, + { + "epoch": 0.09881216096003907, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 11330 + }, + { + "epoch": 0.09882088224520766, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 11331 + }, + { + "epoch": 0.09882960353037623, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 11332 + }, + { + "epoch": 0.09883832481554482, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 11333 + }, + { + "epoch": 0.0988470461007134, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 11334 + }, + { + "epoch": 0.09885576738588199, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 11335 + }, + { + "epoch": 0.09886448867105056, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 11336 + }, + { + "epoch": 0.09887320995621915, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 11337 + }, + { + "epoch": 0.09888193124138774, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 11338 + }, + { + "epoch": 0.09889065252655631, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 11339 + }, + { + "epoch": 0.0988993738117249, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 11340 + }, + { + "epoch": 0.09890809509689348, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 11341 + }, + { + "epoch": 0.09891681638206207, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 11342 + }, + { + "epoch": 0.09892553766723064, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 11343 + }, + { + "epoch": 0.09893425895239923, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 11344 + }, + { + "epoch": 0.09894298023756781, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 11345 + }, + { + "epoch": 0.09895170152273638, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 11346 + }, + { + "epoch": 0.09896042280790497, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 11347 + }, + { + "epoch": 0.09896914409307356, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 11348 + }, + { + "epoch": 0.09897786537824214, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 11349 + }, + { + "epoch": 0.09898658666341072, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 11350 + }, + { + "epoch": 0.0989953079485793, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 11351 + }, + { + "epoch": 0.09900402923374789, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 11352 + }, + { + "epoch": 0.09901275051891646, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 11353 + }, + { + "epoch": 0.09902147180408505, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 11354 + }, + { + "epoch": 0.09903019308925363, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 11355 + }, + { + "epoch": 0.09903891437442222, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 11356 + }, + { + "epoch": 0.0990476356595908, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 11357 + }, + { + "epoch": 0.09905635694475938, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 11358 + }, + { + "epoch": 0.09906507822992797, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 11359 + }, + { + "epoch": 0.09907379951509654, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 11360 + }, + { + "epoch": 0.09908252080026513, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 11361 + }, + { + "epoch": 0.09909124208543371, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 11362 + }, + { + "epoch": 0.0990999633706023, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 11363 + }, + { + "epoch": 0.09910868465577087, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 11364 + }, + { + "epoch": 0.09911740594093946, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 11365 + }, + { + "epoch": 0.09912612722610804, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 11366 + }, + { + "epoch": 0.09913484851127662, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 11367 + }, + { + "epoch": 0.0991435697964452, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 11368 + }, + { + "epoch": 0.09915229108161379, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 11369 + }, + { + "epoch": 0.09916101236678238, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 11370 + }, + { + "epoch": 0.09916973365195095, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 11371 + }, + { + "epoch": 0.09917845493711953, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 11372 + }, + { + "epoch": 0.09918717622228812, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 11373 + }, + { + "epoch": 0.09919589750745669, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 11374 + }, + { + "epoch": 0.09920461879262528, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 11375 + }, + { + "epoch": 0.09921334007779387, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 11376 + }, + { + "epoch": 0.09922206136296245, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 11377 + }, + { + "epoch": 0.09923078264813102, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 11378 + }, + { + "epoch": 0.09923950393329961, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 11379 + }, + { + "epoch": 0.0992482252184682, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 11380 + }, + { + "epoch": 0.09925694650363677, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 11381 + }, + { + "epoch": 0.09926566778880536, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 11382 + }, + { + "epoch": 0.09927438907397394, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 11383 + }, + { + "epoch": 0.09928311035914253, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 11384 + }, + { + "epoch": 0.0992918316443111, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 11385 + }, + { + "epoch": 0.09930055292947969, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 11386 + }, + { + "epoch": 0.09930927421464827, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 11387 + }, + { + "epoch": 0.09931799549981685, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 11388 + }, + { + "epoch": 0.09932671678498543, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 11389 + }, + { + "epoch": 0.09933543807015402, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 11390 + }, + { + "epoch": 0.0993441593553226, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 11391 + }, + { + "epoch": 0.09935288064049118, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 11392 + }, + { + "epoch": 0.09936160192565976, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 11393 + }, + { + "epoch": 0.09937032321082835, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 11394 + }, + { + "epoch": 0.09937904449599692, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 11395 + }, + { + "epoch": 0.09938776578116551, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 11396 + }, + { + "epoch": 0.0993964870663341, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 11397 + }, + { + "epoch": 0.09940520835150268, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 11398 + }, + { + "epoch": 0.09941392963667126, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 11399 + }, + { + "epoch": 0.09942265092183984, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 11400 + }, + { + "epoch": 0.09943137220700843, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 11401 + }, + { + "epoch": 0.099440093492177, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 11402 + }, + { + "epoch": 0.09944881477734559, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 11403 + }, + { + "epoch": 0.09945753606251417, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 11404 + }, + { + "epoch": 0.09946625734768276, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 11405 + }, + { + "epoch": 0.09947497863285133, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 11406 + }, + { + "epoch": 0.09948369991801992, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 11407 + }, + { + "epoch": 0.0994924212031885, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 11408 + }, + { + "epoch": 0.09950114248835708, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 11409 + }, + { + "epoch": 0.09950986377352566, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 11410 + }, + { + "epoch": 0.09951858505869425, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 11411 + }, + { + "epoch": 0.09952730634386284, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 11412 + }, + { + "epoch": 0.09953602762903141, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 11413 + }, + { + "epoch": 0.0995447489142, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 11414 + }, + { + "epoch": 0.09955347019936858, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 11415 + }, + { + "epoch": 0.09956219148453715, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 11416 + }, + { + "epoch": 0.09957091276970574, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 11417 + }, + { + "epoch": 0.09957963405487433, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 11418 + }, + { + "epoch": 0.09958835534004291, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 11419 + }, + { + "epoch": 0.09959707662521149, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 11420 + }, + { + "epoch": 0.09960579791038007, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 11421 + }, + { + "epoch": 0.09961451919554866, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 11422 + }, + { + "epoch": 0.09962324048071723, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 11423 + }, + { + "epoch": 0.09963196176588582, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 11424 + }, + { + "epoch": 0.0996406830510544, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 11425 + }, + { + "epoch": 0.09964940433622299, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 11426 + }, + { + "epoch": 0.09965812562139156, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 11427 + }, + { + "epoch": 0.09966684690656015, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 11428 + }, + { + "epoch": 0.09967556819172874, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 11429 + }, + { + "epoch": 0.09968428947689731, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 11430 + }, + { + "epoch": 0.0996930107620659, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 11431 + }, + { + "epoch": 0.09970173204723448, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 11432 + }, + { + "epoch": 0.09971045333240307, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 11433 + }, + { + "epoch": 0.09971917461757164, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 11434 + }, + { + "epoch": 0.09972789590274023, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 11435 + }, + { + "epoch": 0.09973661718790881, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 11436 + }, + { + "epoch": 0.0997453384730774, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 11437 + }, + { + "epoch": 0.09975405975824597, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 11438 + }, + { + "epoch": 0.09976278104341456, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 11439 + }, + { + "epoch": 0.09977150232858314, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 11440 + }, + { + "epoch": 0.09978022361375172, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 11441 + }, + { + "epoch": 0.0997889448989203, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 11442 + }, + { + "epoch": 0.09979766618408889, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 11443 + }, + { + "epoch": 0.09980638746925748, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 11444 + }, + { + "epoch": 0.09981510875442605, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 11445 + }, + { + "epoch": 0.09982383003959464, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 11446 + }, + { + "epoch": 0.09983255132476322, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 11447 + }, + { + "epoch": 0.0998412726099318, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 11448 + }, + { + "epoch": 0.09984999389510038, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 11449 + }, + { + "epoch": 0.09985871518026897, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 11450 + }, + { + "epoch": 0.09986743646543755, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 11451 + }, + { + "epoch": 0.09987615775060613, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 11452 + }, + { + "epoch": 0.09988487903577471, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 11453 + }, + { + "epoch": 0.0998936003209433, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 11454 + }, + { + "epoch": 0.09990232160611187, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 11455 + }, + { + "epoch": 0.09991104289128046, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 11456 + }, + { + "epoch": 0.09991976417644904, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 11457 + }, + { + "epoch": 0.09992848546161763, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 11458 + }, + { + "epoch": 0.0999372067467862, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 11459 + }, + { + "epoch": 0.09994592803195479, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 11460 + }, + { + "epoch": 0.09995464931712338, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 11461 + }, + { + "epoch": 0.09996337060229195, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 11462 + }, + { + "epoch": 0.09997209188746053, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 11463 + }, + { + "epoch": 0.09998081317262912, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 11464 + }, + { + "epoch": 0.09998953445779771, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 11465 + }, + { + "epoch": 0.09999825574296628, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 11466 + }, + { + "epoch": 0.10000697702813487, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 11467 + }, + { + "epoch": 0.10001569831330345, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 11468 + }, + { + "epoch": 0.10002441959847203, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 11469 + }, + { + "epoch": 0.10003314088364061, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 11470 + }, + { + "epoch": 0.1000418621688092, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 11471 + }, + { + "epoch": 0.10005058345397778, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 11472 + }, + { + "epoch": 0.10005930473914636, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 11473 + }, + { + "epoch": 0.10006802602431494, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 11474 + }, + { + "epoch": 0.10007674730948353, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 11475 + }, + { + "epoch": 0.1000854685946521, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 11476 + }, + { + "epoch": 0.10009418987982069, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 11477 + }, + { + "epoch": 0.10010291116498927, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 11478 + }, + { + "epoch": 0.10011163245015786, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 11479 + }, + { + "epoch": 0.10012035373532643, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 11480 + }, + { + "epoch": 0.10012907502049502, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 11481 + }, + { + "epoch": 0.1001377963056636, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 11482 + }, + { + "epoch": 0.10014651759083218, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 11483 + }, + { + "epoch": 0.10015523887600077, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 11484 + }, + { + "epoch": 0.10016396016116935, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 11485 + }, + { + "epoch": 0.10017268144633794, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 11486 + }, + { + "epoch": 0.10018140273150651, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 11487 + }, + { + "epoch": 0.1001901240166751, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 11488 + }, + { + "epoch": 0.10019884530184368, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 11489 + }, + { + "epoch": 0.10020756658701226, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 11490 + }, + { + "epoch": 0.10021628787218084, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 11491 + }, + { + "epoch": 0.10022500915734943, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 11492 + }, + { + "epoch": 0.10023373044251802, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 11493 + }, + { + "epoch": 0.10024245172768659, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 11494 + }, + { + "epoch": 0.10025117301285517, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 11495 + }, + { + "epoch": 0.10025989429802376, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 11496 + }, + { + "epoch": 0.10026861558319233, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 11497 + }, + { + "epoch": 0.10027733686836092, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 11498 + }, + { + "epoch": 0.1002860581535295, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 11499 + }, + { + "epoch": 0.10029477943869809, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 11500 + }, + { + "epoch": 0.10030350072386666, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 11501 + }, + { + "epoch": 0.10031222200903525, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 11502 + }, + { + "epoch": 0.10032094329420384, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 11503 + }, + { + "epoch": 0.10032966457937241, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 11504 + }, + { + "epoch": 0.100338385864541, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 11505 + }, + { + "epoch": 0.10034710714970958, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 11506 + }, + { + "epoch": 0.10035582843487817, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 11507 + }, + { + "epoch": 0.10036454972004674, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 11508 + }, + { + "epoch": 0.10037327100521533, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 11509 + }, + { + "epoch": 0.10038199229038391, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 11510 + }, + { + "epoch": 0.10039071357555249, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 11511 + }, + { + "epoch": 0.10039943486072107, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 11512 + }, + { + "epoch": 0.10040815614588966, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 11513 + }, + { + "epoch": 0.10041687743105825, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 11514 + }, + { + "epoch": 0.10042559871622682, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 11515 + }, + { + "epoch": 0.1004343200013954, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 11516 + }, + { + "epoch": 0.10044304128656399, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 11517 + }, + { + "epoch": 0.10045176257173256, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 11518 + }, + { + "epoch": 0.10046048385690115, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 11519 + }, + { + "epoch": 0.10046920514206974, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 11520 + }, + { + "epoch": 0.10047792642723832, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 11521 + }, + { + "epoch": 0.1004866477124069, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 11522 + }, + { + "epoch": 0.10049536899757548, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 11523 + }, + { + "epoch": 0.10050409028274407, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 11524 + }, + { + "epoch": 0.10051281156791264, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 11525 + }, + { + "epoch": 0.10052153285308123, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 11526 + }, + { + "epoch": 0.10053025413824981, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 11527 + }, + { + "epoch": 0.1005389754234184, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 11528 + }, + { + "epoch": 0.10054769670858697, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 11529 + }, + { + "epoch": 0.10055641799375556, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 11530 + }, + { + "epoch": 0.10056513927892415, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 11531 + }, + { + "epoch": 0.10057386056409272, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 11532 + }, + { + "epoch": 0.1005825818492613, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 11533 + }, + { + "epoch": 0.10059130313442989, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 11534 + }, + { + "epoch": 0.10060002441959848, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 11535 + }, + { + "epoch": 0.10060874570476705, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 11536 + }, + { + "epoch": 0.10061746698993564, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 11537 + }, + { + "epoch": 0.10062618827510422, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 11538 + }, + { + "epoch": 0.1006349095602728, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 11539 + }, + { + "epoch": 0.10064363084544138, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 11540 + }, + { + "epoch": 0.10065235213060997, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 11541 + }, + { + "epoch": 0.10066107341577855, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 11542 + }, + { + "epoch": 0.10066979470094713, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 11543 + }, + { + "epoch": 0.10067851598611571, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 11544 + }, + { + "epoch": 0.1006872372712843, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 11545 + }, + { + "epoch": 0.10069595855645287, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 11546 + }, + { + "epoch": 0.10070467984162146, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 11547 + }, + { + "epoch": 0.10071340112679004, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 11548 + }, + { + "epoch": 0.10072212241195863, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 11549 + }, + { + "epoch": 0.1007308436971272, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 11550 + }, + { + "epoch": 0.10073956498229579, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 11551 + }, + { + "epoch": 0.10074828626746438, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 11552 + }, + { + "epoch": 0.10075700755263296, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 11553 + }, + { + "epoch": 0.10076572883780154, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 11554 + }, + { + "epoch": 0.10077445012297012, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 11555 + }, + { + "epoch": 0.10078317140813871, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 11556 + }, + { + "epoch": 0.10079189269330728, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 11557 + }, + { + "epoch": 0.10080061397847587, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 11558 + }, + { + "epoch": 0.10080933526364445, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 11559 + }, + { + "epoch": 0.10081805654881304, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 11560 + }, + { + "epoch": 0.10082677783398161, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 11561 + }, + { + "epoch": 0.1008354991191502, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 11562 + }, + { + "epoch": 0.10084422040431879, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 11563 + }, + { + "epoch": 0.10085294168948736, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 11564 + }, + { + "epoch": 0.10086166297465594, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 11565 + }, + { + "epoch": 0.10087038425982453, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 11566 + }, + { + "epoch": 0.10087910554499312, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 11567 + }, + { + "epoch": 0.10088782683016169, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 11568 + }, + { + "epoch": 0.10089654811533028, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 11569 + }, + { + "epoch": 0.10090526940049886, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 11570 + }, + { + "epoch": 0.10091399068566743, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 11571 + }, + { + "epoch": 0.10092271197083602, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 11572 + }, + { + "epoch": 0.10093143325600461, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 11573 + }, + { + "epoch": 0.1009401545411732, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 11574 + }, + { + "epoch": 0.10094887582634177, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 11575 + }, + { + "epoch": 0.10095759711151035, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 11576 + }, + { + "epoch": 0.10096631839667894, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 11577 + }, + { + "epoch": 0.10097503968184751, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 11578 + }, + { + "epoch": 0.1009837609670161, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 11579 + }, + { + "epoch": 0.10099248225218468, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 11580 + }, + { + "epoch": 0.10100120353735327, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 11581 + }, + { + "epoch": 0.10100992482252184, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 11582 + }, + { + "epoch": 0.10101864610769043, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 11583 + }, + { + "epoch": 0.10102736739285902, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 11584 + }, + { + "epoch": 0.10103608867802759, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 11585 + }, + { + "epoch": 0.10104480996319617, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 11586 + }, + { + "epoch": 0.10105353124836476, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 11587 + }, + { + "epoch": 0.10106225253353335, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 11588 + }, + { + "epoch": 0.10107097381870192, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 11589 + }, + { + "epoch": 0.1010796951038705, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 11590 + }, + { + "epoch": 0.10108841638903909, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 11591 + }, + { + "epoch": 0.10109713767420767, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 11592 + }, + { + "epoch": 0.10110585895937625, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 11593 + }, + { + "epoch": 0.10111458024454484, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 11594 + }, + { + "epoch": 0.10112330152971342, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 11595 + }, + { + "epoch": 0.101132022814882, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 11596 + }, + { + "epoch": 0.10114074410005058, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 11597 + }, + { + "epoch": 0.10114946538521917, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 11598 + }, + { + "epoch": 0.10115818667038774, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 11599 + }, + { + "epoch": 0.10116690795555633, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 11600 + }, + { + "epoch": 0.10117562924072492, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 11601 + }, + { + "epoch": 0.1011843505258935, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 11602 + }, + { + "epoch": 0.10119307181106207, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 11603 + }, + { + "epoch": 0.10120179309623066, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 11604 + }, + { + "epoch": 0.10121051438139925, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 11605 + }, + { + "epoch": 0.10121923566656782, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 11606 + }, + { + "epoch": 0.1012279569517364, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 11607 + }, + { + "epoch": 0.10123667823690499, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 11608 + }, + { + "epoch": 0.10124539952207358, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 11609 + }, + { + "epoch": 0.10125412080724215, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 11610 + }, + { + "epoch": 0.10126284209241074, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 11611 + }, + { + "epoch": 0.10127156337757932, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 11612 + }, + { + "epoch": 0.1012802846627479, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 11613 + }, + { + "epoch": 0.10128900594791648, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 11614 + }, + { + "epoch": 0.10129772723308507, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 11615 + }, + { + "epoch": 0.10130644851825366, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 11616 + }, + { + "epoch": 0.10131516980342223, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 11617 + }, + { + "epoch": 0.10132389108859081, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 11618 + }, + { + "epoch": 0.1013326123737594, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 11619 + }, + { + "epoch": 0.10134133365892797, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 11620 + }, + { + "epoch": 0.10135005494409656, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 11621 + }, + { + "epoch": 0.10135877622926515, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 11622 + }, + { + "epoch": 0.10136749751443373, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 11623 + }, + { + "epoch": 0.1013762187996023, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 11624 + }, + { + "epoch": 0.10138494008477089, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 11625 + }, + { + "epoch": 0.10139366136993948, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 11626 + }, + { + "epoch": 0.10140238265510805, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 11627 + }, + { + "epoch": 0.10141110394027664, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 11628 + }, + { + "epoch": 0.10141982522544522, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 11629 + }, + { + "epoch": 0.10142854651061381, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 11630 + }, + { + "epoch": 0.10143726779578238, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 11631 + }, + { + "epoch": 0.10144598908095097, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 11632 + }, + { + "epoch": 0.10145471036611955, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 11633 + }, + { + "epoch": 0.10146343165128813, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 11634 + }, + { + "epoch": 0.10147215293645671, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 11635 + }, + { + "epoch": 0.1014808742216253, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 11636 + }, + { + "epoch": 0.10148959550679389, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 11637 + }, + { + "epoch": 0.10149831679196246, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 11638 + }, + { + "epoch": 0.10150703807713105, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 11639 + }, + { + "epoch": 0.10151575936229963, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 11640 + }, + { + "epoch": 0.1015244806474682, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 11641 + }, + { + "epoch": 0.10153320193263679, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 11642 + }, + { + "epoch": 0.10154192321780538, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 11643 + }, + { + "epoch": 0.10155064450297396, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 11644 + }, + { + "epoch": 0.10155936578814254, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 11645 + }, + { + "epoch": 0.10156808707331112, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 11646 + }, + { + "epoch": 0.10157680835847971, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 11647 + }, + { + "epoch": 0.10158552964364828, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 11648 + }, + { + "epoch": 0.10159425092881687, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 11649 + }, + { + "epoch": 0.10160297221398545, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 11650 + }, + { + "epoch": 0.10161169349915404, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 11651 + }, + { + "epoch": 0.10162041478432261, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 11652 + }, + { + "epoch": 0.1016291360694912, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 11653 + }, + { + "epoch": 0.10163785735465979, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 11654 + }, + { + "epoch": 0.10164657863982836, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 11655 + }, + { + "epoch": 0.10165529992499694, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 11656 + }, + { + "epoch": 0.10166402121016553, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 11657 + }, + { + "epoch": 0.10167274249533412, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 11658 + }, + { + "epoch": 0.10168146378050269, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 11659 + }, + { + "epoch": 0.10169018506567128, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 11660 + }, + { + "epoch": 0.10169890635083986, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 11661 + }, + { + "epoch": 0.10170762763600845, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 11662 + }, + { + "epoch": 0.10171634892117702, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 11663 + }, + { + "epoch": 0.10172507020634561, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 11664 + }, + { + "epoch": 0.1017337914915142, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 11665 + }, + { + "epoch": 0.10174251277668277, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 11666 + }, + { + "epoch": 0.10175123406185135, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 11667 + }, + { + "epoch": 0.10175995534701994, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 11668 + }, + { + "epoch": 0.10176867663218853, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 11669 + }, + { + "epoch": 0.1017773979173571, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 11670 + }, + { + "epoch": 0.10178611920252569, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 11671 + }, + { + "epoch": 0.10179484048769427, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 11672 + }, + { + "epoch": 0.10180356177286284, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 11673 + }, + { + "epoch": 0.10181228305803143, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 11674 + }, + { + "epoch": 0.10182100434320002, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 11675 + }, + { + "epoch": 0.1018297256283686, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 11676 + }, + { + "epoch": 0.10183844691353718, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 11677 + }, + { + "epoch": 0.10184716819870576, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 11678 + }, + { + "epoch": 0.10185588948387435, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 11679 + }, + { + "epoch": 0.10186461076904292, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 11680 + }, + { + "epoch": 0.10187333205421151, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 11681 + }, + { + "epoch": 0.1018820533393801, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 11682 + }, + { + "epoch": 0.10189077462454868, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 11683 + }, + { + "epoch": 0.10189949590971725, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 11684 + }, + { + "epoch": 0.10190821719488584, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 11685 + }, + { + "epoch": 0.10191693848005443, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 11686 + }, + { + "epoch": 0.101925659765223, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 11687 + }, + { + "epoch": 0.10193438105039158, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 11688 + }, + { + "epoch": 0.10194310233556017, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 11689 + }, + { + "epoch": 0.10195182362072876, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 11690 + }, + { + "epoch": 0.10196054490589733, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 11691 + }, + { + "epoch": 0.10196926619106592, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 11692 + }, + { + "epoch": 0.1019779874762345, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 11693 + }, + { + "epoch": 0.10198670876140307, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 11694 + }, + { + "epoch": 0.10199543004657166, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 11695 + }, + { + "epoch": 0.10200415133174025, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 11696 + }, + { + "epoch": 0.10201287261690883, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 11697 + }, + { + "epoch": 0.1020215939020774, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 11698 + }, + { + "epoch": 0.10203031518724599, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 11699 + }, + { + "epoch": 0.10203903647241458, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 11700 + }, + { + "epoch": 0.10204775775758315, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 11701 + }, + { + "epoch": 0.10205647904275174, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 11702 + }, + { + "epoch": 0.10206520032792032, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 11703 + }, + { + "epoch": 0.10207392161308891, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 11704 + }, + { + "epoch": 0.10208264289825748, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 11705 + }, + { + "epoch": 0.10209136418342607, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 11706 + }, + { + "epoch": 0.10210008546859466, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 11707 + }, + { + "epoch": 0.10210880675376323, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 11708 + }, + { + "epoch": 0.10211752803893182, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 11709 + }, + { + "epoch": 0.1021262493241004, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 11710 + }, + { + "epoch": 0.10213497060926899, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 11711 + }, + { + "epoch": 0.10214369189443756, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 11712 + }, + { + "epoch": 0.10215241317960615, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 11713 + }, + { + "epoch": 0.10216113446477473, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 11714 + }, + { + "epoch": 0.1021698557499433, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 11715 + }, + { + "epoch": 0.10217857703511189, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 11716 + }, + { + "epoch": 0.10218729832028048, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 11717 + }, + { + "epoch": 0.10219601960544906, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 11718 + }, + { + "epoch": 0.10220474089061764, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 11719 + }, + { + "epoch": 0.10221346217578622, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 11720 + }, + { + "epoch": 0.10222218346095481, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 11721 + }, + { + "epoch": 0.10223090474612338, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 11722 + }, + { + "epoch": 0.10223962603129197, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 11723 + }, + { + "epoch": 0.10224834731646056, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 11724 + }, + { + "epoch": 0.10225706860162914, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 11725 + }, + { + "epoch": 0.10226578988679771, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 11726 + }, + { + "epoch": 0.1022745111719663, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 11727 + }, + { + "epoch": 0.10228323245713489, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 11728 + }, + { + "epoch": 0.10229195374230346, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 11729 + }, + { + "epoch": 0.10230067502747205, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 11730 + }, + { + "epoch": 0.10230939631264063, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 11731 + }, + { + "epoch": 0.10231811759780922, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 11732 + }, + { + "epoch": 0.10232683888297779, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 11733 + }, + { + "epoch": 0.10233556016814638, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 11734 + }, + { + "epoch": 0.10234428145331496, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 11735 + }, + { + "epoch": 0.10235300273848354, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 11736 + }, + { + "epoch": 0.10236172402365212, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 11737 + }, + { + "epoch": 0.10237044530882071, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 11738 + }, + { + "epoch": 0.1023791665939893, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 11739 + }, + { + "epoch": 0.10238788787915787, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 11740 + }, + { + "epoch": 0.10239660916432645, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 11741 + }, + { + "epoch": 0.10240533044949504, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 11742 + }, + { + "epoch": 0.10241405173466361, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 11743 + }, + { + "epoch": 0.1024227730198322, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 11744 + }, + { + "epoch": 0.10243149430500079, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 11745 + }, + { + "epoch": 0.10244021559016937, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 11746 + }, + { + "epoch": 0.10244893687533795, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 11747 + }, + { + "epoch": 0.10245765816050653, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 11748 + }, + { + "epoch": 0.10246637944567512, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 11749 + }, + { + "epoch": 0.10247510073084369, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 11750 + }, + { + "epoch": 0.10248382201601228, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 11751 + }, + { + "epoch": 0.10249254330118086, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 11752 + }, + { + "epoch": 0.10250126458634945, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 11753 + }, + { + "epoch": 0.10250998587151802, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 11754 + }, + { + "epoch": 0.10251870715668661, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 11755 + }, + { + "epoch": 0.1025274284418552, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 11756 + }, + { + "epoch": 0.10253614972702377, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 11757 + }, + { + "epoch": 0.10254487101219235, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 11758 + }, + { + "epoch": 0.10255359229736094, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 11759 + }, + { + "epoch": 0.10256231358252953, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 11760 + }, + { + "epoch": 0.1025710348676981, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 11761 + }, + { + "epoch": 0.10257975615286669, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 11762 + }, + { + "epoch": 0.10258847743803527, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 11763 + }, + { + "epoch": 0.10259719872320384, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 11764 + }, + { + "epoch": 0.10260592000837243, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 11765 + }, + { + "epoch": 0.10261464129354102, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 11766 + }, + { + "epoch": 0.1026233625787096, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 11767 + }, + { + "epoch": 0.10263208386387818, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 11768 + }, + { + "epoch": 0.10264080514904676, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 11769 + }, + { + "epoch": 0.10264952643421535, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 11770 + }, + { + "epoch": 0.10265824771938392, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 11771 + }, + { + "epoch": 0.10266696900455251, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 11772 + }, + { + "epoch": 0.1026756902897211, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 11773 + }, + { + "epoch": 0.10268441157488968, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 11774 + }, + { + "epoch": 0.10269313286005825, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 11775 + }, + { + "epoch": 0.10270185414522684, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 11776 + }, + { + "epoch": 0.10271057543039543, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 11777 + }, + { + "epoch": 0.10271929671556401, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 11778 + }, + { + "epoch": 0.10272801800073258, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 11779 + }, + { + "epoch": 0.10273673928590117, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 11780 + }, + { + "epoch": 0.10274546057106976, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 11781 + }, + { + "epoch": 0.10275418185623833, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 11782 + }, + { + "epoch": 0.10276290314140692, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 11783 + }, + { + "epoch": 0.1027716244265755, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 11784 + }, + { + "epoch": 0.10278034571174409, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 11785 + }, + { + "epoch": 0.10278906699691266, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 11786 + }, + { + "epoch": 0.10279778828208125, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 11787 + }, + { + "epoch": 0.10280650956724983, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 11788 + }, + { + "epoch": 0.10281523085241841, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 11789 + }, + { + "epoch": 0.102823952137587, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 11790 + }, + { + "epoch": 0.10283267342275558, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 11791 + }, + { + "epoch": 0.10284139470792417, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 11792 + }, + { + "epoch": 0.10285011599309274, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 11793 + }, + { + "epoch": 0.10285883727826133, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 11794 + }, + { + "epoch": 0.10286755856342991, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 11795 + }, + { + "epoch": 0.10287627984859848, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 11796 + }, + { + "epoch": 0.10288500113376707, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 11797 + }, + { + "epoch": 0.10289372241893566, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 11798 + }, + { + "epoch": 0.10290244370410424, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 11799 + }, + { + "epoch": 0.10291116498927282, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 11800 + }, + { + "epoch": 0.1029198862744414, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 11801 + }, + { + "epoch": 0.10292860755960999, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 11802 + }, + { + "epoch": 0.10293732884477856, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 11803 + }, + { + "epoch": 0.10294605012994715, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 11804 + }, + { + "epoch": 0.10295477141511573, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 11805 + }, + { + "epoch": 0.10296349270028432, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 11806 + }, + { + "epoch": 0.10297221398545289, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 11807 + }, + { + "epoch": 0.10298093527062148, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 11808 + }, + { + "epoch": 0.10298965655579007, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 11809 + }, + { + "epoch": 0.10299837784095864, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 11810 + }, + { + "epoch": 0.10300709912612722, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 11811 + }, + { + "epoch": 0.10301582041129581, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 11812 + }, + { + "epoch": 0.1030245416964644, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 11813 + }, + { + "epoch": 0.10303326298163297, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 11814 + }, + { + "epoch": 0.10304198426680156, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 11815 + }, + { + "epoch": 0.10305070555197014, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 11816 + }, + { + "epoch": 0.10305942683713872, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 11817 + }, + { + "epoch": 0.1030681481223073, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 11818 + }, + { + "epoch": 0.10307686940747589, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 11819 + }, + { + "epoch": 0.10308559069264447, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 11820 + }, + { + "epoch": 0.10309431197781305, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 11821 + }, + { + "epoch": 0.10310303326298163, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 11822 + }, + { + "epoch": 0.10311175454815022, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 11823 + }, + { + "epoch": 0.10312047583331879, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 11824 + }, + { + "epoch": 0.10312919711848738, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 11825 + }, + { + "epoch": 0.10313791840365596, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 11826 + }, + { + "epoch": 0.10314663968882455, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 11827 + }, + { + "epoch": 0.10315536097399312, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 11828 + }, + { + "epoch": 0.10316408225916171, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 11829 + }, + { + "epoch": 0.1031728035443303, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 11830 + }, + { + "epoch": 0.10318152482949887, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 11831 + }, + { + "epoch": 0.10319024611466746, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 11832 + }, + { + "epoch": 0.10319896739983604, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 11833 + }, + { + "epoch": 0.10320768868500463, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 11834 + }, + { + "epoch": 0.1032164099701732, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 11835 + }, + { + "epoch": 0.10322513125534179, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 11836 + }, + { + "epoch": 0.10323385254051037, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 11837 + }, + { + "epoch": 0.10324257382567895, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 11838 + }, + { + "epoch": 0.10325129511084753, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 11839 + }, + { + "epoch": 0.10326001639601612, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 11840 + }, + { + "epoch": 0.1032687376811847, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 11841 + }, + { + "epoch": 0.10327745896635328, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 11842 + }, + { + "epoch": 0.10328618025152186, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 11843 + }, + { + "epoch": 0.10329490153669045, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 11844 + }, + { + "epoch": 0.10330362282185902, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 11845 + }, + { + "epoch": 0.10331234410702761, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 11846 + }, + { + "epoch": 0.1033210653921962, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 11847 + }, + { + "epoch": 0.10332978667736478, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 11848 + }, + { + "epoch": 0.10333850796253335, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 11849 + }, + { + "epoch": 0.10334722924770194, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 11850 + }, + { + "epoch": 0.10335595053287053, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 11851 + }, + { + "epoch": 0.1033646718180391, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 11852 + }, + { + "epoch": 0.10337339310320769, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 11853 + }, + { + "epoch": 0.10338211438837627, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 11854 + }, + { + "epoch": 0.10339083567354486, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 11855 + }, + { + "epoch": 0.10339955695871343, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 11856 + }, + { + "epoch": 0.10340827824388202, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 11857 + }, + { + "epoch": 0.1034169995290506, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 11858 + }, + { + "epoch": 0.10342572081421918, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 11859 + }, + { + "epoch": 0.10343444209938776, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 11860 + }, + { + "epoch": 0.10344316338455635, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 11861 + }, + { + "epoch": 0.10345188466972494, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 11862 + }, + { + "epoch": 0.10346060595489351, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 11863 + }, + { + "epoch": 0.1034693272400621, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 11864 + }, + { + "epoch": 0.10347804852523068, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 11865 + }, + { + "epoch": 0.10348676981039925, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 11866 + }, + { + "epoch": 0.10349549109556784, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 11867 + }, + { + "epoch": 0.10350421238073643, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 11868 + }, + { + "epoch": 0.10351293366590501, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 11869 + }, + { + "epoch": 0.10352165495107359, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 11870 + }, + { + "epoch": 0.10353037623624217, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 11871 + }, + { + "epoch": 0.10353909752141076, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 11872 + }, + { + "epoch": 0.10354781880657933, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 11873 + }, + { + "epoch": 0.10355654009174792, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 11874 + }, + { + "epoch": 0.1035652613769165, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 11875 + }, + { + "epoch": 0.10357398266208509, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 11876 + }, + { + "epoch": 0.10358270394725366, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 11877 + }, + { + "epoch": 0.10359142523242225, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 11878 + }, + { + "epoch": 0.10360014651759084, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 11879 + }, + { + "epoch": 0.10360886780275941, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 11880 + }, + { + "epoch": 0.103617589087928, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 11881 + }, + { + "epoch": 0.10362631037309658, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 11882 + }, + { + "epoch": 0.10363503165826517, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 11883 + }, + { + "epoch": 0.10364375294343374, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 11884 + }, + { + "epoch": 0.10365247422860233, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 11885 + }, + { + "epoch": 0.10366119551377091, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 11886 + }, + { + "epoch": 0.10366991679893948, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 11887 + }, + { + "epoch": 0.10367863808410807, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 11888 + }, + { + "epoch": 0.10368735936927666, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 11889 + }, + { + "epoch": 0.10369608065444524, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 11890 + }, + { + "epoch": 0.10370480193961382, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 11891 + }, + { + "epoch": 0.1037135232247824, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 11892 + }, + { + "epoch": 0.10372224450995099, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 11893 + }, + { + "epoch": 0.10373096579511958, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 11894 + }, + { + "epoch": 0.10373968708028815, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 11895 + }, + { + "epoch": 0.10374840836545673, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 11896 + }, + { + "epoch": 0.10375712965062532, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 11897 + }, + { + "epoch": 0.1037658509357939, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 11898 + }, + { + "epoch": 0.10377457222096248, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 11899 + }, + { + "epoch": 0.10378329350613107, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 11900 + }, + { + "epoch": 0.10379201479129965, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 11901 + }, + { + "epoch": 0.10380073607646823, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 11902 + }, + { + "epoch": 0.10380945736163681, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 11903 + }, + { + "epoch": 0.1038181786468054, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 11904 + }, + { + "epoch": 0.10382689993197397, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 11905 + }, + { + "epoch": 0.10383562121714256, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 11906 + }, + { + "epoch": 0.10384434250231114, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 11907 + }, + { + "epoch": 0.10385306378747973, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 11908 + }, + { + "epoch": 0.1038617850726483, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 11909 + }, + { + "epoch": 0.10387050635781689, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 11910 + }, + { + "epoch": 0.10387922764298547, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 11911 + }, + { + "epoch": 0.10388794892815405, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 11912 + }, + { + "epoch": 0.10389667021332263, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 11913 + }, + { + "epoch": 0.10390539149849122, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 11914 + }, + { + "epoch": 0.1039141127836598, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 11915 + }, + { + "epoch": 0.10392283406882838, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 11916 + }, + { + "epoch": 0.10393155535399697, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 11917 + }, + { + "epoch": 0.10394027663916555, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 11918 + }, + { + "epoch": 0.10394899792433412, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 11919 + }, + { + "epoch": 0.10395771920950271, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 11920 + }, + { + "epoch": 0.1039664404946713, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 11921 + }, + { + "epoch": 0.10397516177983988, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 11922 + }, + { + "epoch": 0.10398388306500846, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 11923 + }, + { + "epoch": 0.10399260435017704, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 11924 + }, + { + "epoch": 0.10400132563534563, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 11925 + }, + { + "epoch": 0.1040100469205142, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 11926 + }, + { + "epoch": 0.10401876820568279, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 11927 + }, + { + "epoch": 0.10402748949085137, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 11928 + }, + { + "epoch": 0.10403621077601996, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 11929 + }, + { + "epoch": 0.10404493206118853, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 11930 + }, + { + "epoch": 0.10405365334635712, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 11931 + }, + { + "epoch": 0.1040623746315257, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 11932 + }, + { + "epoch": 0.10407109591669428, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 11933 + }, + { + "epoch": 0.10407981720186286, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 11934 + }, + { + "epoch": 0.10408853848703145, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 11935 + }, + { + "epoch": 0.10409725977220004, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 11936 + }, + { + "epoch": 0.10410598105736861, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 11937 + }, + { + "epoch": 0.1041147023425372, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 11938 + }, + { + "epoch": 0.10412342362770578, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 11939 + }, + { + "epoch": 0.10413214491287436, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 11940 + }, + { + "epoch": 0.10414086619804294, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 11941 + }, + { + "epoch": 0.10414958748321153, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 11942 + }, + { + "epoch": 0.10415830876838011, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 11943 + }, + { + "epoch": 0.10416703005354869, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 11944 + }, + { + "epoch": 0.10417575133871727, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 11945 + }, + { + "epoch": 0.10418447262388586, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 11946 + }, + { + "epoch": 0.10419319390905443, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 11947 + }, + { + "epoch": 0.10420191519422302, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 11948 + }, + { + "epoch": 0.1042106364793916, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 11949 + }, + { + "epoch": 0.10421935776456019, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 11950 + }, + { + "epoch": 0.10422807904972876, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 11951 + }, + { + "epoch": 0.10423680033489735, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 11952 + }, + { + "epoch": 0.10424552162006594, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 11953 + }, + { + "epoch": 0.10425424290523451, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 11954 + }, + { + "epoch": 0.1042629641904031, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 11955 + }, + { + "epoch": 0.10427168547557168, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 11956 + }, + { + "epoch": 0.10428040676074027, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 11957 + }, + { + "epoch": 0.10428912804590884, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 11958 + }, + { + "epoch": 0.10429784933107743, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 11959 + }, + { + "epoch": 0.10430657061624601, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 11960 + }, + { + "epoch": 0.10431529190141459, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 11961 + }, + { + "epoch": 0.10432401318658317, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 11962 + }, + { + "epoch": 0.10433273447175176, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 11963 + }, + { + "epoch": 0.10434145575692035, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 11964 + }, + { + "epoch": 0.10435017704208892, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 11965 + }, + { + "epoch": 0.1043588983272575, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 11966 + }, + { + "epoch": 0.10436761961242609, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 11967 + }, + { + "epoch": 0.10437634089759466, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 11968 + }, + { + "epoch": 0.10438506218276325, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 11969 + }, + { + "epoch": 0.10439378346793184, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 11970 + }, + { + "epoch": 0.10440250475310042, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 11971 + }, + { + "epoch": 0.104411226038269, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 11972 + }, + { + "epoch": 0.10441994732343758, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 11973 + }, + { + "epoch": 0.10442866860860617, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 11974 + }, + { + "epoch": 0.10443738989377474, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 11975 + }, + { + "epoch": 0.10444611117894333, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 11976 + }, + { + "epoch": 0.10445483246411191, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 11977 + }, + { + "epoch": 0.1044635537492805, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 11978 + }, + { + "epoch": 0.10447227503444907, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 11979 + }, + { + "epoch": 0.10448099631961766, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 11980 + }, + { + "epoch": 0.10448971760478624, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 11981 + }, + { + "epoch": 0.10449843888995482, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 11982 + }, + { + "epoch": 0.1045071601751234, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 11983 + }, + { + "epoch": 0.10451588146029199, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 11984 + }, + { + "epoch": 0.10452460274546058, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 11985 + }, + { + "epoch": 0.10453332403062915, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 11986 + }, + { + "epoch": 0.10454204531579774, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 11987 + }, + { + "epoch": 0.10455076660096632, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 11988 + }, + { + "epoch": 0.1045594878861349, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 11989 + }, + { + "epoch": 0.10456820917130348, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 11990 + }, + { + "epoch": 0.10457693045647207, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 11991 + }, + { + "epoch": 0.10458565174164065, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 11992 + }, + { + "epoch": 0.10459437302680923, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 11993 + }, + { + "epoch": 0.10460309431197781, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 11994 + }, + { + "epoch": 0.1046118155971464, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 11995 + }, + { + "epoch": 0.10462053688231497, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 11996 + }, + { + "epoch": 0.10462925816748356, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 11997 + }, + { + "epoch": 0.10463797945265214, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 11998 + }, + { + "epoch": 0.10464670073782073, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 11999 + }, + { + "epoch": 0.1046554220229893, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 12000 + }, + { + "epoch": 0.10466414330815789, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 12001 + }, + { + "epoch": 0.10467286459332648, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 12002 + }, + { + "epoch": 0.10468158587849505, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 12003 + }, + { + "epoch": 0.10469030716366363, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 12004 + }, + { + "epoch": 0.10469902844883222, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 12005 + }, + { + "epoch": 0.10470774973400081, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 12006 + }, + { + "epoch": 0.10471647101916938, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 12007 + }, + { + "epoch": 0.10472519230433797, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 12008 + }, + { + "epoch": 0.10473391358950655, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 12009 + }, + { + "epoch": 0.10474263487467514, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 12010 + }, + { + "epoch": 0.10475135615984371, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 12011 + }, + { + "epoch": 0.1047600774450123, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 12012 + }, + { + "epoch": 0.10476879873018088, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 12013 + }, + { + "epoch": 0.10477752001534946, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 12014 + }, + { + "epoch": 0.10478624130051804, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 12015 + }, + { + "epoch": 0.10479496258568663, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 12016 + }, + { + "epoch": 0.10480368387085522, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 12017 + }, + { + "epoch": 0.10481240515602379, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 12018 + }, + { + "epoch": 0.10482112644119237, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 12019 + }, + { + "epoch": 0.10482984772636096, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 12020 + }, + { + "epoch": 0.10483856901152953, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 12021 + }, + { + "epoch": 0.10484729029669812, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 12022 + }, + { + "epoch": 0.1048560115818667, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 12023 + }, + { + "epoch": 0.10486473286703529, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 12024 + }, + { + "epoch": 0.10487345415220387, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 12025 + }, + { + "epoch": 0.10488217543737245, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 12026 + }, + { + "epoch": 0.10489089672254104, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 12027 + }, + { + "epoch": 0.10489961800770961, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 12028 + }, + { + "epoch": 0.1049083392928782, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 12029 + }, + { + "epoch": 0.10491706057804678, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 12030 + }, + { + "epoch": 0.10492578186321537, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 12031 + }, + { + "epoch": 0.10493450314838394, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 12032 + }, + { + "epoch": 0.10494322443355253, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 12033 + }, + { + "epoch": 0.10495194571872112, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 12034 + }, + { + "epoch": 0.10496066700388969, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 12035 + }, + { + "epoch": 0.10496938828905827, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 12036 + }, + { + "epoch": 0.10497810957422686, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 12037 + }, + { + "epoch": 0.10498683085939545, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 12038 + }, + { + "epoch": 0.10499555214456402, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 12039 + }, + { + "epoch": 0.1050042734297326, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 12040 + }, + { + "epoch": 0.10501299471490119, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 12041 + }, + { + "epoch": 0.10502171600006976, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 12042 + }, + { + "epoch": 0.10503043728523835, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 12043 + }, + { + "epoch": 0.10503915857040694, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 12044 + }, + { + "epoch": 0.10504787985557552, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 12045 + }, + { + "epoch": 0.1050566011407441, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 12046 + }, + { + "epoch": 0.10506532242591268, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 12047 + }, + { + "epoch": 0.10507404371108127, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 12048 + }, + { + "epoch": 0.10508276499624984, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 12049 + }, + { + "epoch": 0.10509148628141843, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 12050 + }, + { + "epoch": 0.10510020756658701, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 12051 + }, + { + "epoch": 0.1051089288517556, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 12052 + }, + { + "epoch": 0.10511765013692417, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 12053 + }, + { + "epoch": 0.10512637142209276, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 12054 + }, + { + "epoch": 0.10513509270726135, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 12055 + }, + { + "epoch": 0.10514381399242992, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 12056 + }, + { + "epoch": 0.1051525352775985, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 12057 + }, + { + "epoch": 0.10516125656276709, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 12058 + }, + { + "epoch": 0.10516997784793568, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 12059 + }, + { + "epoch": 0.10517869913310425, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 12060 + }, + { + "epoch": 0.10518742041827284, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 12061 + }, + { + "epoch": 0.10519614170344142, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 12062 + }, + { + "epoch": 0.10520486298861, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 12063 + }, + { + "epoch": 0.10521358427377858, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 12064 + }, + { + "epoch": 0.10522230555894717, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 12065 + }, + { + "epoch": 0.10523102684411575, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 12066 + }, + { + "epoch": 0.10523974812928433, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 12067 + }, + { + "epoch": 0.10524846941445291, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 12068 + }, + { + "epoch": 0.1052571906996215, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 12069 + }, + { + "epoch": 0.10526591198479007, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 12070 + }, + { + "epoch": 0.10527463326995866, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 12071 + }, + { + "epoch": 0.10528335455512725, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 12072 + }, + { + "epoch": 0.10529207584029583, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 12073 + }, + { + "epoch": 0.1053007971254644, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 12074 + }, + { + "epoch": 0.10530951841063299, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 12075 + }, + { + "epoch": 0.10531823969580158, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 12076 + }, + { + "epoch": 0.10532696098097015, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 12077 + }, + { + "epoch": 0.10533568226613874, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 12078 + }, + { + "epoch": 0.10534440355130732, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 12079 + }, + { + "epoch": 0.10535312483647591, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 12080 + }, + { + "epoch": 0.10536184612164448, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 12081 + }, + { + "epoch": 0.10537056740681307, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 12082 + }, + { + "epoch": 0.10537928869198165, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 12083 + }, + { + "epoch": 0.10538800997715023, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 12084 + }, + { + "epoch": 0.10539673126231881, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 12085 + }, + { + "epoch": 0.1054054525474874, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 12086 + }, + { + "epoch": 0.10541417383265599, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 12087 + }, + { + "epoch": 0.10542289511782456, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 12088 + }, + { + "epoch": 0.10543161640299314, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 12089 + }, + { + "epoch": 0.10544033768816173, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 12090 + }, + { + "epoch": 0.1054490589733303, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 12091 + }, + { + "epoch": 0.10545778025849889, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 12092 + }, + { + "epoch": 0.10546650154366748, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 12093 + }, + { + "epoch": 0.10547522282883606, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 12094 + }, + { + "epoch": 0.10548394411400464, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 12095 + }, + { + "epoch": 0.10549266539917322, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 12096 + }, + { + "epoch": 0.10550138668434181, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 12097 + }, + { + "epoch": 0.10551010796951038, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 12098 + }, + { + "epoch": 0.10551882925467897, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 12099 + }, + { + "epoch": 0.10552755053984755, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 12100 + }, + { + "epoch": 0.10553627182501614, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 12101 + }, + { + "epoch": 0.10554499311018471, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 12102 + }, + { + "epoch": 0.1055537143953533, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 12103 + }, + { + "epoch": 0.10556243568052188, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 12104 + }, + { + "epoch": 0.10557115696569046, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 12105 + }, + { + "epoch": 0.10557987825085904, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 12106 + }, + { + "epoch": 0.10558859953602763, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 12107 + }, + { + "epoch": 0.10559732082119622, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 12108 + }, + { + "epoch": 0.10560604210636479, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 12109 + }, + { + "epoch": 0.10561476339153338, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 12110 + }, + { + "epoch": 0.10562348467670196, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 12111 + }, + { + "epoch": 0.10563220596187053, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 12112 + }, + { + "epoch": 0.10564092724703912, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 12113 + }, + { + "epoch": 0.10564964853220771, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 12114 + }, + { + "epoch": 0.1056583698173763, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 12115 + }, + { + "epoch": 0.10566709110254487, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 12116 + }, + { + "epoch": 0.10567581238771345, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 12117 + }, + { + "epoch": 0.10568453367288204, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 12118 + }, + { + "epoch": 0.10569325495805061, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 12119 + }, + { + "epoch": 0.1057019762432192, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 12120 + }, + { + "epoch": 0.10571069752838778, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 12121 + }, + { + "epoch": 0.10571941881355637, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 12122 + }, + { + "epoch": 0.10572814009872494, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 12123 + }, + { + "epoch": 0.10573686138389353, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 12124 + }, + { + "epoch": 0.10574558266906212, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 12125 + }, + { + "epoch": 0.1057543039542307, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 12126 + }, + { + "epoch": 0.10576302523939927, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 12127 + }, + { + "epoch": 0.10577174652456786, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 12128 + }, + { + "epoch": 0.10578046780973645, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 12129 + }, + { + "epoch": 0.10578918909490502, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 12130 + }, + { + "epoch": 0.1057979103800736, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 12131 + }, + { + "epoch": 0.10580663166524219, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 12132 + }, + { + "epoch": 0.10581535295041078, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 12133 + }, + { + "epoch": 0.10582407423557935, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 12134 + }, + { + "epoch": 0.10583279552074794, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 12135 + }, + { + "epoch": 0.10584151680591652, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 12136 + }, + { + "epoch": 0.1058502380910851, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 12137 + }, + { + "epoch": 0.10585895937625368, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 12138 + }, + { + "epoch": 0.10586768066142227, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 12139 + }, + { + "epoch": 0.10587640194659086, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 12140 + }, + { + "epoch": 0.10588512323175943, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 12141 + }, + { + "epoch": 0.10589384451692802, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 12142 + }, + { + "epoch": 0.1059025658020966, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 12143 + }, + { + "epoch": 0.10591128708726517, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 12144 + }, + { + "epoch": 0.10592000837243376, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 12145 + }, + { + "epoch": 0.10592872965760235, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 12146 + }, + { + "epoch": 0.10593745094277093, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 12147 + }, + { + "epoch": 0.1059461722279395, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 12148 + }, + { + "epoch": 0.10595489351310809, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 12149 + }, + { + "epoch": 0.10596361479827668, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 12150 + }, + { + "epoch": 0.10597233608344525, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 12151 + }, + { + "epoch": 0.10598105736861384, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 12152 + }, + { + "epoch": 0.10598977865378242, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 12153 + }, + { + "epoch": 0.10599849993895101, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 12154 + }, + { + "epoch": 0.10600722122411958, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 12155 + }, + { + "epoch": 0.10601594250928817, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 12156 + }, + { + "epoch": 0.10602466379445676, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 12157 + }, + { + "epoch": 0.10603338507962533, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 12158 + }, + { + "epoch": 0.10604210636479391, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 12159 + }, + { + "epoch": 0.1060508276499625, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 12160 + }, + { + "epoch": 0.10605954893513109, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 12161 + }, + { + "epoch": 0.10606827022029966, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 12162 + }, + { + "epoch": 0.10607699150546825, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 12163 + }, + { + "epoch": 0.10608571279063683, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 12164 + }, + { + "epoch": 0.1060944340758054, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 12165 + }, + { + "epoch": 0.10610315536097399, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 12166 + }, + { + "epoch": 0.10611187664614258, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 12167 + }, + { + "epoch": 0.10612059793131116, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 12168 + }, + { + "epoch": 0.10612931921647974, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 12169 + }, + { + "epoch": 0.10613804050164832, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 12170 + }, + { + "epoch": 0.10614676178681691, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 12171 + }, + { + "epoch": 0.10615548307198548, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 12172 + }, + { + "epoch": 0.10616420435715407, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 12173 + }, + { + "epoch": 0.10617292564232265, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 12174 + }, + { + "epoch": 0.10618164692749124, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 12175 + }, + { + "epoch": 0.10619036821265981, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 12176 + }, + { + "epoch": 0.1061990894978284, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 12177 + }, + { + "epoch": 0.10620781078299699, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 12178 + }, + { + "epoch": 0.10621653206816556, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 12179 + }, + { + "epoch": 0.10622525335333415, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 12180 + }, + { + "epoch": 0.10623397463850273, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 12181 + }, + { + "epoch": 0.10624269592367132, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 12182 + }, + { + "epoch": 0.10625141720883989, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 12183 + }, + { + "epoch": 0.10626013849400848, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 12184 + }, + { + "epoch": 0.10626885977917706, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 12185 + }, + { + "epoch": 0.10627758106434564, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 12186 + }, + { + "epoch": 0.10628630234951422, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 12187 + }, + { + "epoch": 0.10629502363468281, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 12188 + }, + { + "epoch": 0.1063037449198514, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 12189 + }, + { + "epoch": 0.10631246620501997, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 12190 + }, + { + "epoch": 0.10632118749018855, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 12191 + }, + { + "epoch": 0.10632990877535714, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 12192 + }, + { + "epoch": 0.10633863006052571, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 12193 + }, + { + "epoch": 0.1063473513456943, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 12194 + }, + { + "epoch": 0.10635607263086289, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 12195 + }, + { + "epoch": 0.10636479391603147, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 12196 + }, + { + "epoch": 0.10637351520120004, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 12197 + }, + { + "epoch": 0.10638223648636863, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 12198 + }, + { + "epoch": 0.10639095777153722, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 12199 + }, + { + "epoch": 0.10639967905670579, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 12200 + }, + { + "epoch": 0.10640840034187438, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 12201 + }, + { + "epoch": 0.10641712162704296, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 12202 + }, + { + "epoch": 0.10642584291221155, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 12203 + }, + { + "epoch": 0.10643456419738012, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 12204 + }, + { + "epoch": 0.10644328548254871, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 12205 + }, + { + "epoch": 0.1064520067677173, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 12206 + }, + { + "epoch": 0.10646072805288587, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 12207 + }, + { + "epoch": 0.10646944933805445, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 12208 + }, + { + "epoch": 0.10647817062322304, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 12209 + }, + { + "epoch": 0.10648689190839163, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 12210 + }, + { + "epoch": 0.1064956131935602, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 12211 + }, + { + "epoch": 0.10650433447872878, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 12212 + }, + { + "epoch": 0.10651305576389737, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 12213 + }, + { + "epoch": 0.10652177704906594, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 12214 + }, + { + "epoch": 0.10653049833423453, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 12215 + }, + { + "epoch": 0.10653921961940312, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 12216 + }, + { + "epoch": 0.1065479409045717, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 12217 + }, + { + "epoch": 0.10655666218974028, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 12218 + }, + { + "epoch": 0.10656538347490886, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 12219 + }, + { + "epoch": 0.10657410476007745, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 12220 + }, + { + "epoch": 0.10658282604524602, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 12221 + }, + { + "epoch": 0.10659154733041461, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 12222 + }, + { + "epoch": 0.1066002686155832, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 12223 + }, + { + "epoch": 0.10660898990075178, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 12224 + }, + { + "epoch": 0.10661771118592035, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 12225 + }, + { + "epoch": 0.10662643247108894, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 12226 + }, + { + "epoch": 0.10663515375625753, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 12227 + }, + { + "epoch": 0.1066438750414261, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 12228 + }, + { + "epoch": 0.10665259632659468, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 12229 + }, + { + "epoch": 0.10666131761176327, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 12230 + }, + { + "epoch": 0.10667003889693186, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 12231 + }, + { + "epoch": 0.10667876018210043, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 12232 + }, + { + "epoch": 0.10668748146726902, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 12233 + }, + { + "epoch": 0.1066962027524376, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 12234 + }, + { + "epoch": 0.10670492403760617, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 12235 + }, + { + "epoch": 0.10671364532277476, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 12236 + }, + { + "epoch": 0.10672236660794335, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 12237 + }, + { + "epoch": 0.10673108789311193, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 12238 + }, + { + "epoch": 0.1067398091782805, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 12239 + }, + { + "epoch": 0.10674853046344909, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 12240 + }, + { + "epoch": 0.10675725174861768, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 12241 + }, + { + "epoch": 0.10676597303378627, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 12242 + }, + { + "epoch": 0.10677469431895484, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 12243 + }, + { + "epoch": 0.10678341560412342, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 12244 + }, + { + "epoch": 0.10679213688929201, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 12245 + }, + { + "epoch": 0.10680085817446058, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 12246 + }, + { + "epoch": 0.10680957945962917, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 12247 + }, + { + "epoch": 0.10681830074479776, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 12248 + }, + { + "epoch": 0.10682702202996634, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 12249 + }, + { + "epoch": 0.10683574331513492, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 12250 + }, + { + "epoch": 0.1068444646003035, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 12251 + }, + { + "epoch": 0.10685318588547209, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 12252 + }, + { + "epoch": 0.10686190717064066, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 12253 + }, + { + "epoch": 0.10687062845580925, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 12254 + }, + { + "epoch": 0.10687934974097783, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 12255 + }, + { + "epoch": 0.10688807102614642, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 12256 + }, + { + "epoch": 0.10689679231131499, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 12257 + }, + { + "epoch": 0.10690551359648358, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 12258 + }, + { + "epoch": 0.10691423488165216, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 12259 + }, + { + "epoch": 0.10692295616682074, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 12260 + }, + { + "epoch": 0.10693167745198932, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 12261 + }, + { + "epoch": 0.10694039873715791, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 12262 + }, + { + "epoch": 0.1069491200223265, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 12263 + }, + { + "epoch": 0.10695784130749507, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 12264 + }, + { + "epoch": 0.10696656259266366, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 12265 + }, + { + "epoch": 0.10697528387783224, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 12266 + }, + { + "epoch": 0.10698400516300081, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 12267 + }, + { + "epoch": 0.1069927264481694, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 12268 + }, + { + "epoch": 0.10700144773333799, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 12269 + }, + { + "epoch": 0.10701016901850657, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 12270 + }, + { + "epoch": 0.10701889030367515, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 12271 + }, + { + "epoch": 0.10702761158884373, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 12272 + }, + { + "epoch": 0.10703633287401232, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 12273 + }, + { + "epoch": 0.10704505415918089, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 12274 + }, + { + "epoch": 0.10705377544434948, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 12275 + }, + { + "epoch": 0.10706249672951806, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 12276 + }, + { + "epoch": 0.10707121801468665, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 12277 + }, + { + "epoch": 0.10707993929985522, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 12278 + }, + { + "epoch": 0.10708866058502381, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 12279 + }, + { + "epoch": 0.1070973818701924, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 12280 + }, + { + "epoch": 0.10710610315536097, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 12281 + }, + { + "epoch": 0.10711482444052955, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 12282 + }, + { + "epoch": 0.10712354572569814, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 12283 + }, + { + "epoch": 0.10713226701086673, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 12284 + }, + { + "epoch": 0.1071409882960353, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 12285 + }, + { + "epoch": 0.10714970958120389, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 12286 + }, + { + "epoch": 0.10715843086637247, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 12287 + }, + { + "epoch": 0.10716715215154105, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 12288 + }, + { + "epoch": 0.10717587343670963, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 12289 + }, + { + "epoch": 0.10718459472187822, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 12290 + }, + { + "epoch": 0.1071933160070468, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 12291 + }, + { + "epoch": 0.10720203729221538, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 12292 + }, + { + "epoch": 0.10721075857738396, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 12293 + }, + { + "epoch": 0.10721947986255255, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 12294 + }, + { + "epoch": 0.10722820114772112, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 12295 + }, + { + "epoch": 0.10723692243288971, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 12296 + }, + { + "epoch": 0.1072456437180583, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 12297 + }, + { + "epoch": 0.10725436500322688, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 12298 + }, + { + "epoch": 0.10726308628839545, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 12299 + }, + { + "epoch": 0.10727180757356404, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 12300 + }, + { + "epoch": 0.10728052885873263, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 12301 + }, + { + "epoch": 0.1072892501439012, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 12302 + }, + { + "epoch": 0.10729797142906979, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 12303 + }, + { + "epoch": 0.10730669271423837, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 12304 + }, + { + "epoch": 0.10731541399940696, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 12305 + }, + { + "epoch": 0.10732413528457553, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 12306 + }, + { + "epoch": 0.10733285656974412, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 12307 + }, + { + "epoch": 0.1073415778549127, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 12308 + }, + { + "epoch": 0.10735029914008128, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 12309 + }, + { + "epoch": 0.10735902042524986, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 12310 + }, + { + "epoch": 0.10736774171041845, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 12311 + }, + { + "epoch": 0.10737646299558704, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 12312 + }, + { + "epoch": 0.10738518428075561, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 12313 + }, + { + "epoch": 0.1073939055659242, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 12314 + }, + { + "epoch": 0.10740262685109278, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 12315 + }, + { + "epoch": 0.10741134813626135, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 12316 + }, + { + "epoch": 0.10742006942142994, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 12317 + }, + { + "epoch": 0.10742879070659853, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 12318 + }, + { + "epoch": 0.10743751199176711, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 12319 + }, + { + "epoch": 0.10744623327693568, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 12320 + }, + { + "epoch": 0.10745495456210427, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 12321 + }, + { + "epoch": 0.10746367584727286, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 12322 + }, + { + "epoch": 0.10747239713244143, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 12323 + }, + { + "epoch": 0.10748111841761002, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 12324 + }, + { + "epoch": 0.1074898397027786, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 12325 + }, + { + "epoch": 0.10749856098794719, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 12326 + }, + { + "epoch": 0.10750728227311576, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 12327 + }, + { + "epoch": 0.10751600355828435, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 12328 + }, + { + "epoch": 0.10752472484345293, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 12329 + }, + { + "epoch": 0.10753344612862151, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 12330 + }, + { + "epoch": 0.1075421674137901, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 12331 + }, + { + "epoch": 0.10755088869895868, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 12332 + }, + { + "epoch": 0.10755960998412727, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 12333 + }, + { + "epoch": 0.10756833126929584, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 12334 + }, + { + "epoch": 0.10757705255446443, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 12335 + }, + { + "epoch": 0.10758577383963301, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 12336 + }, + { + "epoch": 0.10759449512480158, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 12337 + }, + { + "epoch": 0.10760321640997017, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 12338 + }, + { + "epoch": 0.10761193769513876, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 12339 + }, + { + "epoch": 0.10762065898030734, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 12340 + }, + { + "epoch": 0.10762938026547592, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 12341 + }, + { + "epoch": 0.1076381015506445, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 12342 + }, + { + "epoch": 0.10764682283581309, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 12343 + }, + { + "epoch": 0.10765554412098166, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 12344 + }, + { + "epoch": 0.10766426540615025, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 12345 + }, + { + "epoch": 0.10767298669131883, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 12346 + }, + { + "epoch": 0.10768170797648742, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 12347 + }, + { + "epoch": 0.10769042926165599, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 12348 + }, + { + "epoch": 0.10769915054682458, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 12349 + }, + { + "epoch": 0.10770787183199317, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 12350 + }, + { + "epoch": 0.10771659311716175, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 12351 + }, + { + "epoch": 0.10772531440233032, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 12352 + }, + { + "epoch": 0.10773403568749891, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 12353 + }, + { + "epoch": 0.1077427569726675, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 12354 + }, + { + "epoch": 0.10775147825783607, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 12355 + }, + { + "epoch": 0.10776019954300466, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 12356 + }, + { + "epoch": 0.10776892082817324, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 12357 + }, + { + "epoch": 0.10777764211334183, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 12358 + }, + { + "epoch": 0.1077863633985104, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 12359 + }, + { + "epoch": 0.10779508468367899, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 12360 + }, + { + "epoch": 0.10780380596884757, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 12361 + }, + { + "epoch": 0.10781252725401615, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 12362 + }, + { + "epoch": 0.10782124853918473, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 12363 + }, + { + "epoch": 0.10782996982435332, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 12364 + }, + { + "epoch": 0.1078386911095219, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 12365 + }, + { + "epoch": 0.10784741239469048, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 12366 + }, + { + "epoch": 0.10785613367985906, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 12367 + }, + { + "epoch": 0.10786485496502765, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 12368 + }, + { + "epoch": 0.10787357625019622, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 12369 + }, + { + "epoch": 0.10788229753536481, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 12370 + }, + { + "epoch": 0.1078910188205334, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 12371 + }, + { + "epoch": 0.10789974010570198, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 12372 + }, + { + "epoch": 0.10790846139087056, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 12373 + }, + { + "epoch": 0.10791718267603914, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 12374 + }, + { + "epoch": 0.10792590396120773, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 12375 + }, + { + "epoch": 0.1079346252463763, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 12376 + }, + { + "epoch": 0.10794334653154489, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 12377 + }, + { + "epoch": 0.10795206781671347, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 12378 + }, + { + "epoch": 0.10796078910188206, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 12379 + }, + { + "epoch": 0.10796951038705063, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 12380 + }, + { + "epoch": 0.10797823167221922, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 12381 + }, + { + "epoch": 0.1079869529573878, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 12382 + }, + { + "epoch": 0.10799567424255638, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 12383 + }, + { + "epoch": 0.10800439552772496, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 12384 + }, + { + "epoch": 0.10801311681289355, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 12385 + }, + { + "epoch": 0.10802183809806214, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 12386 + }, + { + "epoch": 0.10803055938323071, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 12387 + }, + { + "epoch": 0.1080392806683993, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 12388 + }, + { + "epoch": 0.10804800195356788, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 12389 + }, + { + "epoch": 0.10805672323873645, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 12390 + }, + { + "epoch": 0.10806544452390504, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 12391 + }, + { + "epoch": 0.10807416580907363, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 12392 + }, + { + "epoch": 0.10808288709424221, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 12393 + }, + { + "epoch": 0.10809160837941079, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 12394 + }, + { + "epoch": 0.10810032966457937, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 12395 + }, + { + "epoch": 0.10810905094974796, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 12396 + }, + { + "epoch": 0.10811777223491653, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 12397 + }, + { + "epoch": 0.10812649352008512, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 12398 + }, + { + "epoch": 0.1081352148052537, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 12399 + }, + { + "epoch": 0.10814393609042229, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 12400 + }, + { + "epoch": 0.10815265737559086, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 12401 + }, + { + "epoch": 0.10816137866075945, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 12402 + }, + { + "epoch": 0.10817009994592804, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 12403 + }, + { + "epoch": 0.10817882123109661, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 12404 + }, + { + "epoch": 0.1081875425162652, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 12405 + }, + { + "epoch": 0.10819626380143378, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 12406 + }, + { + "epoch": 0.10820498508660237, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 12407 + }, + { + "epoch": 0.10821370637177094, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 12408 + }, + { + "epoch": 0.10822242765693953, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 12409 + }, + { + "epoch": 0.10823114894210811, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 12410 + }, + { + "epoch": 0.10823987022727669, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 12411 + }, + { + "epoch": 0.10824859151244527, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 12412 + }, + { + "epoch": 0.10825731279761386, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 12413 + }, + { + "epoch": 0.10826603408278244, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 12414 + }, + { + "epoch": 0.10827475536795102, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 12415 + }, + { + "epoch": 0.1082834766531196, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 12416 + }, + { + "epoch": 0.10829219793828819, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 12417 + }, + { + "epoch": 0.10830091922345676, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 12418 + }, + { + "epoch": 0.10830964050862535, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 12419 + }, + { + "epoch": 0.10831836179379394, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 12420 + }, + { + "epoch": 0.10832708307896252, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 12421 + }, + { + "epoch": 0.1083358043641311, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 12422 + }, + { + "epoch": 0.10834452564929968, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 12423 + }, + { + "epoch": 0.10835324693446827, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 12424 + }, + { + "epoch": 0.10836196821963684, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 12425 + }, + { + "epoch": 0.10837068950480543, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 12426 + }, + { + "epoch": 0.10837941078997401, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 12427 + }, + { + "epoch": 0.1083881320751426, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 12428 + }, + { + "epoch": 0.10839685336031117, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 12429 + }, + { + "epoch": 0.10840557464547976, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 12430 + }, + { + "epoch": 0.10841429593064834, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 12431 + }, + { + "epoch": 0.10842301721581692, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 12432 + }, + { + "epoch": 0.1084317385009855, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 12433 + }, + { + "epoch": 0.10844045978615409, + "grad_norm": 0.416015625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 12434 + }, + { + "epoch": 0.10844918107132268, + "grad_norm": 0.37109375, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 12435 + }, + { + "epoch": 0.10845790235649125, + "grad_norm": 0.57421875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 12436 + }, + { + "epoch": 0.10846662364165983, + "grad_norm": 0.365234375, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 12437 + }, + { + "epoch": 0.10847534492682842, + "grad_norm": 0.78125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 12438 + }, + { + "epoch": 0.108484066211997, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 12439 + }, + { + "epoch": 0.10849278749716558, + "grad_norm": 0.7109375, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 12440 + }, + { + "epoch": 0.10850150878233417, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 12441 + }, + { + "epoch": 0.10851023006750275, + "grad_norm": 0.91015625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 12442 + }, + { + "epoch": 0.10851895135267133, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 12443 + }, + { + "epoch": 0.10852767263783991, + "grad_norm": 0.73046875, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 12444 + }, + { + "epoch": 0.1085363939230085, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 12445 + }, + { + "epoch": 0.10854511520817707, + "grad_norm": 0.640625, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 12446 + }, + { + "epoch": 0.10855383649334566, + "grad_norm": 0.58203125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 12447 + }, + { + "epoch": 0.10856255777851424, + "grad_norm": 0.77734375, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 12448 + }, + { + "epoch": 0.10857127906368283, + "grad_norm": 0.5703125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 12449 + }, + { + "epoch": 0.1085800003488514, + "grad_norm": 0.58203125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 12450 + }, + { + "epoch": 0.10858872163401999, + "grad_norm": 0.8125, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 12451 + }, + { + "epoch": 0.10859744291918857, + "grad_norm": 0.5625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 12452 + }, + { + "epoch": 0.10860616420435715, + "grad_norm": 0.72265625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 12453 + }, + { + "epoch": 0.10861488548952573, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 12454 + }, + { + "epoch": 0.10862360677469432, + "grad_norm": 0.61328125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 12455 + }, + { + "epoch": 0.1086323280598629, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 12456 + }, + { + "epoch": 0.10864104934503148, + "grad_norm": 0.388671875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 12457 + }, + { + "epoch": 0.10864977063020007, + "grad_norm": 0.353515625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 12458 + }, + { + "epoch": 0.10865849191536865, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 12459 + }, + { + "epoch": 0.10866721320053722, + "grad_norm": 0.361328125, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 12460 + }, + { + "epoch": 0.10867593448570581, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 12461 + }, + { + "epoch": 0.1086846557708744, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 12462 + }, + { + "epoch": 0.10869337705604298, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 12463 + }, + { + "epoch": 0.10870209834121156, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 12464 + }, + { + "epoch": 0.10871081962638014, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 12465 + }, + { + "epoch": 0.10871954091154873, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 12466 + }, + { + "epoch": 0.10872826219671732, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 12467 + }, + { + "epoch": 0.10873698348188589, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 12468 + }, + { + "epoch": 0.10874570476705447, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 12469 + }, + { + "epoch": 0.10875442605222306, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 12470 + }, + { + "epoch": 0.10876314733739163, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 12471 + }, + { + "epoch": 0.10877186862256022, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 12472 + }, + { + "epoch": 0.1087805899077288, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 12473 + }, + { + "epoch": 0.10878931119289739, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 12474 + }, + { + "epoch": 0.10879803247806596, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 12475 + }, + { + "epoch": 0.10880675376323455, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 12476 + }, + { + "epoch": 0.10881547504840314, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 12477 + }, + { + "epoch": 0.10882419633357171, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 12478 + }, + { + "epoch": 0.1088329176187403, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 12479 + }, + { + "epoch": 0.10884163890390888, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 12480 + }, + { + "epoch": 0.10885036018907747, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 12481 + }, + { + "epoch": 0.10885908147424604, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 12482 + }, + { + "epoch": 0.10886780275941463, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 12483 + }, + { + "epoch": 0.10887652404458321, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 12484 + }, + { + "epoch": 0.10888524532975179, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 12485 + }, + { + "epoch": 0.10889396661492037, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 12486 + }, + { + "epoch": 0.10890268790008896, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 12487 + }, + { + "epoch": 0.10891140918525755, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 12488 + }, + { + "epoch": 0.10892013047042612, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 12489 + }, + { + "epoch": 0.1089288517555947, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 12490 + }, + { + "epoch": 0.10893757304076329, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 12491 + }, + { + "epoch": 0.10894629432593186, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 12492 + }, + { + "epoch": 0.10895501561110045, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 12493 + }, + { + "epoch": 0.10896373689626904, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 12494 + }, + { + "epoch": 0.10897245818143762, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 12495 + }, + { + "epoch": 0.1089811794666062, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 12496 + }, + { + "epoch": 0.10898990075177478, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 12497 + }, + { + "epoch": 0.10899862203694337, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 12498 + }, + { + "epoch": 0.10900734332211194, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 12499 + }, + { + "epoch": 0.10901606460728053, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 12500 + }, + { + "epoch": 0.10902478589244911, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 12501 + }, + { + "epoch": 0.1090335071776177, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 12502 + }, + { + "epoch": 0.10904222846278627, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 12503 + }, + { + "epoch": 0.10905094974795486, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 12504 + }, + { + "epoch": 0.10905967103312345, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 12505 + }, + { + "epoch": 0.10906839231829202, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 12506 + }, + { + "epoch": 0.1090771136034606, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 12507 + }, + { + "epoch": 0.10908583488862919, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 12508 + }, + { + "epoch": 0.10909455617379778, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 12509 + }, + { + "epoch": 0.10910327745896635, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 12510 + }, + { + "epoch": 0.10911199874413494, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 12511 + }, + { + "epoch": 0.10912072002930352, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 12512 + }, + { + "epoch": 0.1091294413144721, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 12513 + }, + { + "epoch": 0.10913816259964068, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 12514 + }, + { + "epoch": 0.10914688388480927, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 12515 + }, + { + "epoch": 0.10915560516997785, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 12516 + }, + { + "epoch": 0.10916432645514643, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 12517 + }, + { + "epoch": 0.10917304774031501, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 12518 + }, + { + "epoch": 0.1091817690254836, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 12519 + }, + { + "epoch": 0.10919049031065217, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 12520 + }, + { + "epoch": 0.10919921159582076, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 12521 + }, + { + "epoch": 0.10920793288098934, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 12522 + }, + { + "epoch": 0.10921665416615793, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 12523 + }, + { + "epoch": 0.1092253754513265, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 12524 + }, + { + "epoch": 0.10923409673649509, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 12525 + }, + { + "epoch": 0.10924281802166368, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 12526 + }, + { + "epoch": 0.10925153930683225, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 12527 + }, + { + "epoch": 0.10926026059200084, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 12528 + }, + { + "epoch": 0.10926898187716942, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 12529 + }, + { + "epoch": 0.10927770316233801, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 12530 + }, + { + "epoch": 0.10928642444750658, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 12531 + }, + { + "epoch": 0.10929514573267517, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 12532 + }, + { + "epoch": 0.10930386701784375, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 12533 + }, + { + "epoch": 0.10931258830301233, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 12534 + }, + { + "epoch": 0.10932130958818091, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 12535 + }, + { + "epoch": 0.1093300308733495, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 12536 + }, + { + "epoch": 0.10933875215851808, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 12537 + }, + { + "epoch": 0.10934747344368666, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 12538 + }, + { + "epoch": 0.10935619472885524, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 12539 + }, + { + "epoch": 0.10936491601402383, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 12540 + }, + { + "epoch": 0.1093736372991924, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 12541 + }, + { + "epoch": 0.10938235858436099, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 12542 + }, + { + "epoch": 0.10939107986952958, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 12543 + }, + { + "epoch": 0.10939980115469816, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 12544 + }, + { + "epoch": 0.10940852243986673, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 12545 + }, + { + "epoch": 0.10941724372503532, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 12546 + }, + { + "epoch": 0.10942596501020391, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 12547 + }, + { + "epoch": 0.10943468629537248, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 12548 + }, + { + "epoch": 0.10944340758054107, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 12549 + }, + { + "epoch": 0.10945212886570965, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 12550 + }, + { + "epoch": 0.10946085015087824, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 12551 + }, + { + "epoch": 0.10946957143604681, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 12552 + }, + { + "epoch": 0.1094782927212154, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 12553 + }, + { + "epoch": 0.10948701400638398, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 12554 + }, + { + "epoch": 0.10949573529155256, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 12555 + }, + { + "epoch": 0.10950445657672114, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 12556 + }, + { + "epoch": 0.10951317786188973, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 12557 + }, + { + "epoch": 0.10952189914705832, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 12558 + }, + { + "epoch": 0.10953062043222689, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 12559 + }, + { + "epoch": 0.10953934171739547, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 12560 + }, + { + "epoch": 0.10954806300256406, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 12561 + }, + { + "epoch": 0.10955678428773263, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 12562 + }, + { + "epoch": 0.10956550557290122, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 12563 + }, + { + "epoch": 0.1095742268580698, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 12564 + }, + { + "epoch": 0.10958294814323839, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 12565 + }, + { + "epoch": 0.10959166942840697, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 12566 + }, + { + "epoch": 0.10960039071357555, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 12567 + }, + { + "epoch": 0.10960911199874414, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 12568 + }, + { + "epoch": 0.10961783328391271, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 12569 + }, + { + "epoch": 0.1096265545690813, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 12570 + }, + { + "epoch": 0.10963527585424988, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 12571 + }, + { + "epoch": 0.10964399713941847, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 12572 + }, + { + "epoch": 0.10965271842458704, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 12573 + }, + { + "epoch": 0.10966143970975563, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 12574 + }, + { + "epoch": 0.10967016099492422, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 12575 + }, + { + "epoch": 0.10967888228009279, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 12576 + }, + { + "epoch": 0.10968760356526137, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 12577 + }, + { + "epoch": 0.10969632485042996, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 12578 + }, + { + "epoch": 0.10970504613559855, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 12579 + }, + { + "epoch": 0.10971376742076712, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 12580 + }, + { + "epoch": 0.1097224887059357, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 12581 + }, + { + "epoch": 0.10973120999110429, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 12582 + }, + { + "epoch": 0.10973993127627288, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 12583 + }, + { + "epoch": 0.10974865256144145, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 12584 + }, + { + "epoch": 0.10975737384661004, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 12585 + }, + { + "epoch": 0.10976609513177862, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 12586 + }, + { + "epoch": 0.1097748164169472, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 12587 + }, + { + "epoch": 0.10978353770211578, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 12588 + }, + { + "epoch": 0.10979225898728437, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 12589 + }, + { + "epoch": 0.10980098027245296, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 12590 + }, + { + "epoch": 0.10980970155762153, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 12591 + }, + { + "epoch": 0.10981842284279011, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 12592 + }, + { + "epoch": 0.1098271441279587, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 12593 + }, + { + "epoch": 0.10983586541312727, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 12594 + }, + { + "epoch": 0.10984458669829586, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 12595 + }, + { + "epoch": 0.10985330798346445, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 12596 + }, + { + "epoch": 0.10986202926863303, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 12597 + }, + { + "epoch": 0.1098707505538016, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 12598 + }, + { + "epoch": 0.10987947183897019, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 12599 + }, + { + "epoch": 0.10988819312413878, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 12600 + }, + { + "epoch": 0.10989691440930735, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 12601 + }, + { + "epoch": 0.10990563569447594, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 12602 + }, + { + "epoch": 0.10991435697964452, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 12603 + }, + { + "epoch": 0.10992307826481311, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 12604 + }, + { + "epoch": 0.10993179954998168, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 12605 + }, + { + "epoch": 0.10994052083515027, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 12606 + }, + { + "epoch": 0.10994924212031885, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 12607 + }, + { + "epoch": 0.10995796340548743, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 12608 + }, + { + "epoch": 0.10996668469065601, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 12609 + }, + { + "epoch": 0.1099754059758246, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 12610 + }, + { + "epoch": 0.10998412726099319, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 12611 + }, + { + "epoch": 0.10999284854616176, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 12612 + }, + { + "epoch": 0.11000156983133035, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 12613 + }, + { + "epoch": 0.11001029111649893, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 12614 + }, + { + "epoch": 0.1100190124016675, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 12615 + }, + { + "epoch": 0.11002773368683609, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 12616 + }, + { + "epoch": 0.11003645497200468, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 12617 + }, + { + "epoch": 0.11004517625717326, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 12618 + }, + { + "epoch": 0.11005389754234184, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 12619 + }, + { + "epoch": 0.11006261882751042, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 12620 + }, + { + "epoch": 0.11007134011267901, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 12621 + }, + { + "epoch": 0.11008006139784758, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 12622 + }, + { + "epoch": 0.11008878268301617, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 12623 + }, + { + "epoch": 0.11009750396818475, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 12624 + }, + { + "epoch": 0.11010622525335334, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 12625 + }, + { + "epoch": 0.11011494653852191, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 12626 + }, + { + "epoch": 0.1101236678236905, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 12627 + }, + { + "epoch": 0.11013238910885909, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 12628 + }, + { + "epoch": 0.11014111039402766, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 12629 + }, + { + "epoch": 0.11014983167919624, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 12630 + }, + { + "epoch": 0.11015855296436483, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 12631 + }, + { + "epoch": 0.11016727424953342, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 12632 + }, + { + "epoch": 0.11017599553470199, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 12633 + }, + { + "epoch": 0.11018471681987058, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 12634 + }, + { + "epoch": 0.11019343810503916, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 12635 + }, + { + "epoch": 0.11020215939020774, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 12636 + }, + { + "epoch": 0.11021088067537632, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 12637 + }, + { + "epoch": 0.11021960196054491, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 12638 + }, + { + "epoch": 0.1102283232457135, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 12639 + }, + { + "epoch": 0.11023704453088207, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 12640 + }, + { + "epoch": 0.11024576581605065, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 12641 + }, + { + "epoch": 0.11025448710121924, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 12642 + }, + { + "epoch": 0.11026320838638781, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 12643 + }, + { + "epoch": 0.1102719296715564, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 12644 + }, + { + "epoch": 0.11028065095672498, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 12645 + }, + { + "epoch": 0.11028937224189357, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 12646 + }, + { + "epoch": 0.11029809352706214, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 12647 + }, + { + "epoch": 0.11030681481223073, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 12648 + }, + { + "epoch": 0.11031553609739932, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 12649 + }, + { + "epoch": 0.11032425738256789, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 12650 + }, + { + "epoch": 0.11033297866773648, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 12651 + }, + { + "epoch": 0.11034169995290506, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 12652 + }, + { + "epoch": 0.11035042123807365, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 12653 + }, + { + "epoch": 0.11035914252324222, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 12654 + }, + { + "epoch": 0.11036786380841081, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 12655 + }, + { + "epoch": 0.1103765850935794, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 12656 + }, + { + "epoch": 0.11038530637874797, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 12657 + }, + { + "epoch": 0.11039402766391655, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 12658 + }, + { + "epoch": 0.11040274894908514, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 12659 + }, + { + "epoch": 0.11041147023425373, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 12660 + }, + { + "epoch": 0.1104201915194223, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 12661 + }, + { + "epoch": 0.11042891280459088, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 12662 + }, + { + "epoch": 0.11043763408975947, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 12663 + }, + { + "epoch": 0.11044635537492804, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 12664 + }, + { + "epoch": 0.11045507666009663, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 12665 + }, + { + "epoch": 0.11046379794526522, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 12666 + }, + { + "epoch": 0.1104725192304338, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 12667 + }, + { + "epoch": 0.11048124051560237, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 12668 + }, + { + "epoch": 0.11048996180077096, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 12669 + }, + { + "epoch": 0.11049868308593955, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 12670 + }, + { + "epoch": 0.11050740437110812, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 12671 + }, + { + "epoch": 0.1105161256562767, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 12672 + }, + { + "epoch": 0.11052484694144529, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 12673 + }, + { + "epoch": 0.11053356822661388, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 12674 + }, + { + "epoch": 0.11054228951178245, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 12675 + }, + { + "epoch": 0.11055101079695104, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 12676 + }, + { + "epoch": 0.11055973208211962, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 12677 + }, + { + "epoch": 0.1105684533672882, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 12678 + }, + { + "epoch": 0.11057717465245678, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 12679 + }, + { + "epoch": 0.11058589593762537, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 12680 + }, + { + "epoch": 0.11059461722279396, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 12681 + }, + { + "epoch": 0.11060333850796253, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 12682 + }, + { + "epoch": 0.11061205979313112, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 12683 + }, + { + "epoch": 0.1106207810782997, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 12684 + }, + { + "epoch": 0.11062950236346827, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 12685 + }, + { + "epoch": 0.11063822364863686, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 12686 + }, + { + "epoch": 0.11064694493380545, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 12687 + }, + { + "epoch": 0.11065566621897403, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 12688 + }, + { + "epoch": 0.1106643875041426, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 12689 + }, + { + "epoch": 0.11067310878931119, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 12690 + }, + { + "epoch": 0.11068183007447978, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 12691 + }, + { + "epoch": 0.11069055135964835, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 12692 + }, + { + "epoch": 0.11069927264481694, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 12693 + }, + { + "epoch": 0.11070799392998552, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 12694 + }, + { + "epoch": 0.11071671521515411, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 12695 + }, + { + "epoch": 0.11072543650032268, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 12696 + }, + { + "epoch": 0.11073415778549127, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 12697 + }, + { + "epoch": 0.11074287907065986, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 12698 + }, + { + "epoch": 0.11075160035582844, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 12699 + }, + { + "epoch": 0.11076032164099701, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 12700 + }, + { + "epoch": 0.1107690429261656, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 12701 + }, + { + "epoch": 0.11077776421133419, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 12702 + }, + { + "epoch": 0.11078648549650276, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 12703 + }, + { + "epoch": 0.11079520678167135, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 12704 + }, + { + "epoch": 0.11080392806683993, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 12705 + }, + { + "epoch": 0.11081264935200852, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 12706 + }, + { + "epoch": 0.11082137063717709, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 12707 + }, + { + "epoch": 0.11083009192234568, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 12708 + }, + { + "epoch": 0.11083881320751426, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 12709 + }, + { + "epoch": 0.11084753449268284, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 12710 + }, + { + "epoch": 0.11085625577785142, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 12711 + }, + { + "epoch": 0.11086497706302001, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 12712 + }, + { + "epoch": 0.1108736983481886, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 12713 + }, + { + "epoch": 0.11088241963335717, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 12714 + }, + { + "epoch": 0.11089114091852575, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 12715 + }, + { + "epoch": 0.11089986220369434, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 12716 + }, + { + "epoch": 0.11090858348886291, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 12717 + }, + { + "epoch": 0.1109173047740315, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 12718 + }, + { + "epoch": 0.11092602605920009, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 12719 + }, + { + "epoch": 0.11093474734436867, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 12720 + }, + { + "epoch": 0.11094346862953725, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 12721 + }, + { + "epoch": 0.11095218991470583, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 12722 + }, + { + "epoch": 0.11096091119987442, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 12723 + }, + { + "epoch": 0.11096963248504299, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 12724 + }, + { + "epoch": 0.11097835377021158, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 12725 + }, + { + "epoch": 0.11098707505538016, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 12726 + }, + { + "epoch": 0.11099579634054875, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 12727 + }, + { + "epoch": 0.11100451762571732, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 12728 + }, + { + "epoch": 0.11101323891088591, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 12729 + }, + { + "epoch": 0.1110219601960545, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 12730 + }, + { + "epoch": 0.11103068148122307, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 12731 + }, + { + "epoch": 0.11103940276639165, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 12732 + }, + { + "epoch": 0.11104812405156024, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 12733 + }, + { + "epoch": 0.11105684533672883, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 12734 + }, + { + "epoch": 0.1110655666218974, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 12735 + }, + { + "epoch": 0.11107428790706599, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 12736 + }, + { + "epoch": 0.11108300919223457, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 12737 + }, + { + "epoch": 0.11109173047740314, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 12738 + }, + { + "epoch": 0.11110045176257173, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 12739 + }, + { + "epoch": 0.11110917304774032, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 12740 + }, + { + "epoch": 0.1111178943329089, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 12741 + }, + { + "epoch": 0.11112661561807748, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 12742 + }, + { + "epoch": 0.11113533690324606, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 12743 + }, + { + "epoch": 0.11114405818841465, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 12744 + }, + { + "epoch": 0.11115277947358322, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 12745 + }, + { + "epoch": 0.11116150075875181, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 12746 + }, + { + "epoch": 0.1111702220439204, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 12747 + }, + { + "epoch": 0.11117894332908898, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 12748 + }, + { + "epoch": 0.11118766461425755, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 12749 + }, + { + "epoch": 0.11119638589942614, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 12750 + }, + { + "epoch": 0.11120510718459473, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 12751 + }, + { + "epoch": 0.1112138284697633, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 12752 + }, + { + "epoch": 0.11122254975493188, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 12753 + }, + { + "epoch": 0.11123127104010047, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 12754 + }, + { + "epoch": 0.11123999232526906, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 12755 + }, + { + "epoch": 0.11124871361043763, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 12756 + }, + { + "epoch": 0.11125743489560622, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 12757 + }, + { + "epoch": 0.1112661561807748, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 12758 + }, + { + "epoch": 0.11127487746594338, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 12759 + }, + { + "epoch": 0.11128359875111196, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 12760 + }, + { + "epoch": 0.11129232003628055, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 12761 + }, + { + "epoch": 0.11130104132144913, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 12762 + }, + { + "epoch": 0.11130976260661771, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 12763 + }, + { + "epoch": 0.1113184838917863, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 12764 + }, + { + "epoch": 0.11132720517695488, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 12765 + }, + { + "epoch": 0.11133592646212345, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 12766 + }, + { + "epoch": 0.11134464774729204, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 12767 + }, + { + "epoch": 0.11135336903246063, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 12768 + }, + { + "epoch": 0.11136209031762921, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 12769 + }, + { + "epoch": 0.11137081160279778, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 12770 + }, + { + "epoch": 0.11137953288796637, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 12771 + }, + { + "epoch": 0.11138825417313496, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 12772 + }, + { + "epoch": 0.11139697545830353, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 12773 + }, + { + "epoch": 0.11140569674347212, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 12774 + }, + { + "epoch": 0.1114144180286407, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 12775 + }, + { + "epoch": 0.11142313931380929, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 12776 + }, + { + "epoch": 0.11143186059897786, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 12777 + }, + { + "epoch": 0.11144058188414645, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 12778 + }, + { + "epoch": 0.11144930316931503, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 12779 + }, + { + "epoch": 0.1114580244544836, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 12780 + }, + { + "epoch": 0.11146674573965219, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 12781 + }, + { + "epoch": 0.11147546702482078, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 12782 + }, + { + "epoch": 0.11148418830998937, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 12783 + }, + { + "epoch": 0.11149290959515794, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 12784 + }, + { + "epoch": 0.11150163088032652, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 12785 + }, + { + "epoch": 0.11151035216549511, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 12786 + }, + { + "epoch": 0.11151907345066368, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 12787 + }, + { + "epoch": 0.11152779473583227, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 12788 + }, + { + "epoch": 0.11153651602100086, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 12789 + }, + { + "epoch": 0.11154523730616944, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 12790 + }, + { + "epoch": 0.11155395859133801, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 12791 + }, + { + "epoch": 0.1115626798765066, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 12792 + }, + { + "epoch": 0.11157140116167519, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 12793 + }, + { + "epoch": 0.11158012244684376, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 12794 + }, + { + "epoch": 0.11158884373201235, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 12795 + }, + { + "epoch": 0.11159756501718093, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 12796 + }, + { + "epoch": 0.11160628630234952, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 12797 + }, + { + "epoch": 0.11161500758751809, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 12798 + }, + { + "epoch": 0.11162372887268668, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 12799 + }, + { + "epoch": 0.11163245015785526, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 12800 + }, + { + "epoch": 0.11164117144302384, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 12801 + }, + { + "epoch": 0.11164989272819242, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 12802 + }, + { + "epoch": 0.11165861401336101, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 12803 + }, + { + "epoch": 0.1116673352985296, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 12804 + }, + { + "epoch": 0.11167605658369817, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 12805 + }, + { + "epoch": 0.11168477786886676, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 12806 + }, + { + "epoch": 0.11169349915403534, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 12807 + }, + { + "epoch": 0.11170222043920391, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 12808 + }, + { + "epoch": 0.1117109417243725, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 12809 + }, + { + "epoch": 0.11171966300954109, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 12810 + }, + { + "epoch": 0.11172838429470967, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 12811 + }, + { + "epoch": 0.11173710557987825, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 12812 + }, + { + "epoch": 0.11174582686504683, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 12813 + }, + { + "epoch": 0.11175454815021542, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 12814 + }, + { + "epoch": 0.111763269435384, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 12815 + }, + { + "epoch": 0.11177199072055258, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 12816 + }, + { + "epoch": 0.11178071200572116, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 12817 + }, + { + "epoch": 0.11178943329088975, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 12818 + }, + { + "epoch": 0.11179815457605832, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 12819 + }, + { + "epoch": 0.11180687586122691, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 12820 + }, + { + "epoch": 0.1118155971463955, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 12821 + }, + { + "epoch": 0.11182431843156408, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 12822 + }, + { + "epoch": 0.11183303971673265, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 12823 + }, + { + "epoch": 0.11184176100190124, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 12824 + }, + { + "epoch": 0.11185048228706983, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 12825 + }, + { + "epoch": 0.1118592035722384, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 12826 + }, + { + "epoch": 0.11186792485740699, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 12827 + }, + { + "epoch": 0.11187664614257557, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 12828 + }, + { + "epoch": 0.11188536742774416, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 12829 + }, + { + "epoch": 0.11189408871291273, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 12830 + }, + { + "epoch": 0.11190280999808132, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 12831 + }, + { + "epoch": 0.1119115312832499, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 12832 + }, + { + "epoch": 0.11192025256841848, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 12833 + }, + { + "epoch": 0.11192897385358706, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 12834 + }, + { + "epoch": 0.11193769513875565, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 12835 + }, + { + "epoch": 0.11194641642392424, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 12836 + }, + { + "epoch": 0.11195513770909281, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 12837 + }, + { + "epoch": 0.1119638589942614, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 12838 + }, + { + "epoch": 0.11197258027942998, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 12839 + }, + { + "epoch": 0.11198130156459855, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 12840 + }, + { + "epoch": 0.11199002284976714, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 12841 + }, + { + "epoch": 0.11199874413493573, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 12842 + }, + { + "epoch": 0.11200746542010431, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 12843 + }, + { + "epoch": 0.11201618670527289, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 12844 + }, + { + "epoch": 0.11202490799044147, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 12845 + }, + { + "epoch": 0.11203362927561006, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 12846 + }, + { + "epoch": 0.11204235056077863, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 12847 + }, + { + "epoch": 0.11205107184594722, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 12848 + }, + { + "epoch": 0.1120597931311158, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 12849 + }, + { + "epoch": 0.11206851441628439, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 12850 + }, + { + "epoch": 0.11207723570145296, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 12851 + }, + { + "epoch": 0.11208595698662155, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 12852 + }, + { + "epoch": 0.11209467827179014, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 12853 + }, + { + "epoch": 0.11210339955695871, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 12854 + }, + { + "epoch": 0.1121121208421273, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 12855 + }, + { + "epoch": 0.11212084212729588, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 12856 + }, + { + "epoch": 0.11212956341246447, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 12857 + }, + { + "epoch": 0.11213828469763304, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 12858 + }, + { + "epoch": 0.11214700598280163, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 12859 + }, + { + "epoch": 0.11215572726797021, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 12860 + }, + { + "epoch": 0.11216444855313878, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 12861 + }, + { + "epoch": 0.11217316983830737, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 12862 + }, + { + "epoch": 0.11218189112347596, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 12863 + }, + { + "epoch": 0.11219061240864454, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 12864 + }, + { + "epoch": 0.11219933369381312, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 12865 + }, + { + "epoch": 0.1122080549789817, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 12866 + }, + { + "epoch": 0.11221677626415029, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 12867 + }, + { + "epoch": 0.11222549754931886, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 12868 + }, + { + "epoch": 0.11223421883448745, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 12869 + }, + { + "epoch": 0.11224294011965603, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 12870 + }, + { + "epoch": 0.11225166140482462, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 12871 + }, + { + "epoch": 0.1122603826899932, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 12872 + }, + { + "epoch": 0.11226910397516178, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 12873 + }, + { + "epoch": 0.11227782526033037, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 12874 + }, + { + "epoch": 0.11228654654549894, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 12875 + }, + { + "epoch": 0.11229526783066753, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 12876 + }, + { + "epoch": 0.11230398911583611, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 12877 + }, + { + "epoch": 0.1123127104010047, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 12878 + }, + { + "epoch": 0.11232143168617327, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 12879 + }, + { + "epoch": 0.11233015297134186, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 12880 + }, + { + "epoch": 0.11233887425651044, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 12881 + }, + { + "epoch": 0.11234759554167902, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 12882 + }, + { + "epoch": 0.1123563168268476, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 12883 + }, + { + "epoch": 0.11236503811201619, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 12884 + }, + { + "epoch": 0.11237375939718477, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 12885 + }, + { + "epoch": 0.11238248068235335, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 12886 + }, + { + "epoch": 0.11239120196752193, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 12887 + }, + { + "epoch": 0.11239992325269052, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 12888 + }, + { + "epoch": 0.11240864453785909, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 12889 + }, + { + "epoch": 0.11241736582302768, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 12890 + }, + { + "epoch": 0.11242608710819627, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 12891 + }, + { + "epoch": 0.11243480839336485, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 12892 + }, + { + "epoch": 0.11244352967853342, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 12893 + }, + { + "epoch": 0.11245225096370201, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 12894 + }, + { + "epoch": 0.1124609722488706, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 12895 + }, + { + "epoch": 0.11246969353403917, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 12896 + }, + { + "epoch": 0.11247841481920776, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 12897 + }, + { + "epoch": 0.11248713610437634, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 12898 + }, + { + "epoch": 0.11249585738954493, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 12899 + }, + { + "epoch": 0.1125045786747135, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 12900 + }, + { + "epoch": 0.11251329995988209, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 12901 + }, + { + "epoch": 0.11252202124505067, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 12902 + }, + { + "epoch": 0.11253074253021925, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 12903 + }, + { + "epoch": 0.11253946381538783, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 12904 + }, + { + "epoch": 0.11254818510055642, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 12905 + }, + { + "epoch": 0.112556906385725, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 12906 + }, + { + "epoch": 0.11256562767089358, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 12907 + }, + { + "epoch": 0.11257434895606216, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 12908 + }, + { + "epoch": 0.11258307024123075, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 12909 + }, + { + "epoch": 0.11259179152639932, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 12910 + }, + { + "epoch": 0.11260051281156791, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 12911 + }, + { + "epoch": 0.1126092340967365, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 12912 + }, + { + "epoch": 0.11261795538190508, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 12913 + }, + { + "epoch": 0.11262667666707366, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 12914 + }, + { + "epoch": 0.11263539795224224, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 12915 + }, + { + "epoch": 0.11264411923741083, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 12916 + }, + { + "epoch": 0.1126528405225794, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 12917 + }, + { + "epoch": 0.11266156180774799, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 12918 + }, + { + "epoch": 0.11267028309291657, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 12919 + }, + { + "epoch": 0.11267900437808516, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 12920 + }, + { + "epoch": 0.11268772566325373, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 12921 + }, + { + "epoch": 0.11269644694842232, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 12922 + }, + { + "epoch": 0.1127051682335909, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 12923 + }, + { + "epoch": 0.11271388951875948, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 12924 + }, + { + "epoch": 0.11272261080392806, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 12925 + }, + { + "epoch": 0.11273133208909665, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 12926 + }, + { + "epoch": 0.11274005337426524, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 12927 + }, + { + "epoch": 0.11274877465943381, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 12928 + }, + { + "epoch": 0.1127574959446024, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 12929 + }, + { + "epoch": 0.11276621722977098, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 12930 + }, + { + "epoch": 0.11277493851493957, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 12931 + }, + { + "epoch": 0.11278365980010814, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 12932 + }, + { + "epoch": 0.11279238108527673, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 12933 + }, + { + "epoch": 0.11280110237044531, + "grad_norm": 0.33984375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 12934 + }, + { + "epoch": 0.11280982365561389, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 12935 + }, + { + "epoch": 0.11281854494078247, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 12936 + }, + { + "epoch": 0.11282726622595106, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 12937 + }, + { + "epoch": 0.11283598751111965, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 12938 + }, + { + "epoch": 0.11284470879628822, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 12939 + }, + { + "epoch": 0.1128534300814568, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 12940 + }, + { + "epoch": 0.11286215136662539, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 12941 + }, + { + "epoch": 0.11287087265179396, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 12942 + }, + { + "epoch": 0.11287959393696255, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 12943 + }, + { + "epoch": 0.11288831522213114, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 12944 + }, + { + "epoch": 0.11289703650729972, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 12945 + }, + { + "epoch": 0.1129057577924683, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 12946 + }, + { + "epoch": 0.11291447907763688, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 12947 + }, + { + "epoch": 0.11292320036280547, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 12948 + }, + { + "epoch": 0.11293192164797404, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 12949 + }, + { + "epoch": 0.11294064293314263, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 12950 + }, + { + "epoch": 0.11294936421831121, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 12951 + }, + { + "epoch": 0.1129580855034798, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 12952 + }, + { + "epoch": 0.11296680678864837, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 12953 + }, + { + "epoch": 0.11297552807381696, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 12954 + }, + { + "epoch": 0.11298424935898554, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 12955 + }, + { + "epoch": 0.11299297064415412, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 12956 + }, + { + "epoch": 0.1130016919293227, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 12957 + }, + { + "epoch": 0.11301041321449129, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 12958 + }, + { + "epoch": 0.11301913449965988, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 12959 + }, + { + "epoch": 0.11302785578482845, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 12960 + }, + { + "epoch": 0.11303657706999704, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 12961 + }, + { + "epoch": 0.11304529835516562, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 12962 + }, + { + "epoch": 0.1130540196403342, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 12963 + }, + { + "epoch": 0.11306274092550278, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 12964 + }, + { + "epoch": 0.11307146221067137, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 12965 + }, + { + "epoch": 0.11308018349583995, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 12966 + }, + { + "epoch": 0.11308890478100853, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 12967 + }, + { + "epoch": 0.11309762606617711, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 12968 + }, + { + "epoch": 0.1131063473513457, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 12969 + }, + { + "epoch": 0.11311506863651427, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 12970 + }, + { + "epoch": 0.11312378992168286, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 12971 + }, + { + "epoch": 0.11313251120685144, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 12972 + }, + { + "epoch": 0.11314123249202003, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 12973 + }, + { + "epoch": 0.1131499537771886, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 12974 + }, + { + "epoch": 0.11315867506235719, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 12975 + }, + { + "epoch": 0.11316739634752578, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 12976 + }, + { + "epoch": 0.11317611763269435, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 12977 + }, + { + "epoch": 0.11318483891786293, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 12978 + }, + { + "epoch": 0.11319356020303152, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 12979 + }, + { + "epoch": 0.11320228148820011, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 12980 + }, + { + "epoch": 0.11321100277336868, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 12981 + }, + { + "epoch": 0.11321972405853727, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 12982 + }, + { + "epoch": 0.11322844534370585, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 12983 + }, + { + "epoch": 0.11323716662887442, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 12984 + }, + { + "epoch": 0.11324588791404301, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 12985 + }, + { + "epoch": 0.1132546091992116, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 12986 + }, + { + "epoch": 0.11326333048438018, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 12987 + }, + { + "epoch": 0.11327205176954876, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 12988 + }, + { + "epoch": 0.11328077305471734, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 12989 + }, + { + "epoch": 0.11328949433988593, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 12990 + }, + { + "epoch": 0.1132982156250545, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 12991 + }, + { + "epoch": 0.11330693691022309, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 12992 + }, + { + "epoch": 0.11331565819539167, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 12993 + }, + { + "epoch": 0.11332437948056026, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 12994 + }, + { + "epoch": 0.11333310076572883, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 12995 + }, + { + "epoch": 0.11334182205089742, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 12996 + }, + { + "epoch": 0.113350543336066, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 12997 + }, + { + "epoch": 0.11335926462123458, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 12998 + }, + { + "epoch": 0.11336798590640317, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 12999 + }, + { + "epoch": 0.11337670719157175, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 13000 + }, + { + "epoch": 0.11338542847674034, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 13001 + }, + { + "epoch": 0.11339414976190891, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 13002 + }, + { + "epoch": 0.1134028710470775, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 13003 + }, + { + "epoch": 0.11341159233224608, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 13004 + }, + { + "epoch": 0.11342031361741466, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 13005 + }, + { + "epoch": 0.11342903490258324, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 13006 + }, + { + "epoch": 0.11343775618775183, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 13007 + }, + { + "epoch": 0.11344647747292042, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 13008 + }, + { + "epoch": 0.11345519875808899, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 13009 + }, + { + "epoch": 0.11346392004325757, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 13010 + }, + { + "epoch": 0.11347264132842616, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 13011 + }, + { + "epoch": 0.11348136261359473, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 13012 + }, + { + "epoch": 0.11349008389876332, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 13013 + }, + { + "epoch": 0.1134988051839319, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 13014 + }, + { + "epoch": 0.11350752646910049, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 13015 + }, + { + "epoch": 0.11351624775426906, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 13016 + }, + { + "epoch": 0.11352496903943765, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 13017 + }, + { + "epoch": 0.11353369032460624, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 13018 + }, + { + "epoch": 0.11354241160977481, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 13019 + }, + { + "epoch": 0.1135511328949434, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 13020 + }, + { + "epoch": 0.11355985418011198, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 13021 + }, + { + "epoch": 0.11356857546528057, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 13022 + }, + { + "epoch": 0.11357729675044914, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 13023 + }, + { + "epoch": 0.11358601803561773, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 13024 + }, + { + "epoch": 0.11359473932078631, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 13025 + }, + { + "epoch": 0.11360346060595489, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 13026 + }, + { + "epoch": 0.11361218189112347, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 13027 + }, + { + "epoch": 0.11362090317629206, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 13028 + }, + { + "epoch": 0.11362962446146065, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 13029 + }, + { + "epoch": 0.11363834574662922, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 13030 + }, + { + "epoch": 0.1136470670317978, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 13031 + }, + { + "epoch": 0.11365578831696639, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 13032 + }, + { + "epoch": 0.11366450960213496, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 13033 + }, + { + "epoch": 0.11367323088730355, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 13034 + }, + { + "epoch": 0.11368195217247214, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 13035 + }, + { + "epoch": 0.11369067345764072, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 13036 + }, + { + "epoch": 0.1136993947428093, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 13037 + }, + { + "epoch": 0.11370811602797788, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 13038 + }, + { + "epoch": 0.11371683731314647, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 13039 + }, + { + "epoch": 0.11372555859831504, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 13040 + }, + { + "epoch": 0.11373427988348363, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 13041 + }, + { + "epoch": 0.11374300116865221, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 13042 + }, + { + "epoch": 0.1137517224538208, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 13043 + }, + { + "epoch": 0.11376044373898937, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 13044 + }, + { + "epoch": 0.11376916502415796, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 13045 + }, + { + "epoch": 0.11377788630932655, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 13046 + }, + { + "epoch": 0.11378660759449513, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 13047 + }, + { + "epoch": 0.1137953288796637, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 13048 + }, + { + "epoch": 0.11380405016483229, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 13049 + }, + { + "epoch": 0.11381277145000088, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 13050 + }, + { + "epoch": 0.11382149273516945, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 13051 + }, + { + "epoch": 0.11383021402033804, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 13052 + }, + { + "epoch": 0.11383893530550662, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 13053 + }, + { + "epoch": 0.11384765659067521, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 13054 + }, + { + "epoch": 0.11385637787584378, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 13055 + }, + { + "epoch": 0.11386509916101237, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 13056 + }, + { + "epoch": 0.11387382044618095, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 13057 + }, + { + "epoch": 0.11388254173134953, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 13058 + }, + { + "epoch": 0.11389126301651811, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 13059 + }, + { + "epoch": 0.1138999843016867, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 13060 + }, + { + "epoch": 0.11390870558685529, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 13061 + }, + { + "epoch": 0.11391742687202386, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 13062 + }, + { + "epoch": 0.11392614815719244, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 13063 + }, + { + "epoch": 0.11393486944236103, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 13064 + }, + { + "epoch": 0.1139435907275296, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 13065 + }, + { + "epoch": 0.11395231201269819, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 13066 + }, + { + "epoch": 0.11396103329786678, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 13067 + }, + { + "epoch": 0.11396975458303536, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 13068 + }, + { + "epoch": 0.11397847586820394, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 13069 + }, + { + "epoch": 0.11398719715337252, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 13070 + }, + { + "epoch": 0.11399591843854111, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 13071 + }, + { + "epoch": 0.11400463972370968, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 13072 + }, + { + "epoch": 0.11401336100887827, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 13073 + }, + { + "epoch": 0.11402208229404685, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 13074 + }, + { + "epoch": 0.11403080357921544, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 13075 + }, + { + "epoch": 0.11403952486438401, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 13076 + }, + { + "epoch": 0.1140482461495526, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 13077 + }, + { + "epoch": 0.11405696743472118, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 13078 + }, + { + "epoch": 0.11406568871988976, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 13079 + }, + { + "epoch": 0.11407441000505834, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 13080 + }, + { + "epoch": 0.11408313129022693, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 13081 + }, + { + "epoch": 0.11409185257539552, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 13082 + }, + { + "epoch": 0.11410057386056409, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 13083 + }, + { + "epoch": 0.11410929514573268, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 13084 + }, + { + "epoch": 0.11411801643090126, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 13085 + }, + { + "epoch": 0.11412673771606983, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 13086 + }, + { + "epoch": 0.11413545900123842, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 13087 + }, + { + "epoch": 0.11414418028640701, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 13088 + }, + { + "epoch": 0.1141529015715756, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 13089 + }, + { + "epoch": 0.11416162285674417, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 13090 + }, + { + "epoch": 0.11417034414191275, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 13091 + }, + { + "epoch": 0.11417906542708134, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 13092 + }, + { + "epoch": 0.11418778671224991, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 13093 + }, + { + "epoch": 0.1141965079974185, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 13094 + }, + { + "epoch": 0.11420522928258708, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 13095 + }, + { + "epoch": 0.11421395056775567, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 13096 + }, + { + "epoch": 0.11422267185292424, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 13097 + }, + { + "epoch": 0.11423139313809283, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 13098 + }, + { + "epoch": 0.11424011442326142, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 13099 + }, + { + "epoch": 0.11424883570842999, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 13100 + }, + { + "epoch": 0.11425755699359857, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 13101 + }, + { + "epoch": 0.11426627827876716, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 13102 + }, + { + "epoch": 0.11427499956393575, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 13103 + }, + { + "epoch": 0.11428372084910432, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 13104 + }, + { + "epoch": 0.1142924421342729, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 13105 + }, + { + "epoch": 0.11430116341944149, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 13106 + }, + { + "epoch": 0.11430988470461007, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 13107 + }, + { + "epoch": 0.11431860598977865, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 13108 + }, + { + "epoch": 0.11432732727494724, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 13109 + }, + { + "epoch": 0.11433604856011582, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 13110 + }, + { + "epoch": 0.1143447698452844, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 13111 + }, + { + "epoch": 0.11435349113045298, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 13112 + }, + { + "epoch": 0.11436221241562157, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 13113 + }, + { + "epoch": 0.11437093370079014, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 13114 + }, + { + "epoch": 0.11437965498595873, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 13115 + }, + { + "epoch": 0.11438837627112731, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 13116 + }, + { + "epoch": 0.1143970975562959, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 13117 + }, + { + "epoch": 0.11440581884146447, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 13118 + }, + { + "epoch": 0.11441454012663306, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 13119 + }, + { + "epoch": 0.11442326141180165, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 13120 + }, + { + "epoch": 0.11443198269697022, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 13121 + }, + { + "epoch": 0.1144407039821388, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 13122 + }, + { + "epoch": 0.11444942526730739, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 13123 + }, + { + "epoch": 0.11445814655247598, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 13124 + }, + { + "epoch": 0.11446686783764455, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 13125 + }, + { + "epoch": 0.11447558912281314, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 13126 + }, + { + "epoch": 0.11448431040798172, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 13127 + }, + { + "epoch": 0.1144930316931503, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 13128 + }, + { + "epoch": 0.11450175297831888, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 13129 + }, + { + "epoch": 0.11451047426348747, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 13130 + }, + { + "epoch": 0.11451919554865606, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 13131 + }, + { + "epoch": 0.11452791683382463, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 13132 + }, + { + "epoch": 0.11453663811899321, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 13133 + }, + { + "epoch": 0.1145453594041618, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 13134 + }, + { + "epoch": 0.11455408068933037, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 13135 + }, + { + "epoch": 0.11456280197449896, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 13136 + }, + { + "epoch": 0.11457152325966755, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 13137 + }, + { + "epoch": 0.11458024454483613, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 13138 + }, + { + "epoch": 0.1145889658300047, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 13139 + }, + { + "epoch": 0.11459768711517329, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 13140 + }, + { + "epoch": 0.11460640840034188, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 13141 + }, + { + "epoch": 0.11461512968551045, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 13142 + }, + { + "epoch": 0.11462385097067904, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 13143 + }, + { + "epoch": 0.11463257225584762, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 13144 + }, + { + "epoch": 0.11464129354101621, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 13145 + }, + { + "epoch": 0.11465001482618478, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 13146 + }, + { + "epoch": 0.11465873611135337, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 13147 + }, + { + "epoch": 0.11466745739652195, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 13148 + }, + { + "epoch": 0.11467617868169053, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 13149 + }, + { + "epoch": 0.11468489996685911, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 13150 + }, + { + "epoch": 0.1146936212520277, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 13151 + }, + { + "epoch": 0.11470234253719629, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 13152 + }, + { + "epoch": 0.11471106382236486, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 13153 + }, + { + "epoch": 0.11471978510753345, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 13154 + }, + { + "epoch": 0.11472850639270203, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 13155 + }, + { + "epoch": 0.11473722767787062, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 13156 + }, + { + "epoch": 0.11474594896303919, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 13157 + }, + { + "epoch": 0.11475467024820778, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 13158 + }, + { + "epoch": 0.11476339153337636, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 13159 + }, + { + "epoch": 0.11477211281854494, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 13160 + }, + { + "epoch": 0.11478083410371352, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 13161 + }, + { + "epoch": 0.11478955538888211, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 13162 + }, + { + "epoch": 0.1147982766740507, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 13163 + }, + { + "epoch": 0.11480699795921927, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 13164 + }, + { + "epoch": 0.11481571924438785, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 13165 + }, + { + "epoch": 0.11482444052955644, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 13166 + }, + { + "epoch": 0.11483316181472501, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 13167 + }, + { + "epoch": 0.1148418830998936, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 13168 + }, + { + "epoch": 0.11485060438506219, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 13169 + }, + { + "epoch": 0.11485932567023077, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 13170 + }, + { + "epoch": 0.11486804695539934, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 13171 + }, + { + "epoch": 0.11487676824056793, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 13172 + }, + { + "epoch": 0.11488548952573652, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 13173 + }, + { + "epoch": 0.11489421081090509, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 13174 + }, + { + "epoch": 0.11490293209607368, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 13175 + }, + { + "epoch": 0.11491165338124226, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 13176 + }, + { + "epoch": 0.11492037466641085, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 13177 + }, + { + "epoch": 0.11492909595157942, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 13178 + }, + { + "epoch": 0.11493781723674801, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 13179 + }, + { + "epoch": 0.1149465385219166, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 13180 + }, + { + "epoch": 0.11495525980708517, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 13181 + }, + { + "epoch": 0.11496398109225375, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 13182 + }, + { + "epoch": 0.11497270237742234, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 13183 + }, + { + "epoch": 0.11498142366259093, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 13184 + }, + { + "epoch": 0.1149901449477595, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 13185 + }, + { + "epoch": 0.11499886623292808, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 13186 + }, + { + "epoch": 0.11500758751809667, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 13187 + }, + { + "epoch": 0.11501630880326524, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 13188 + }, + { + "epoch": 0.11502503008843383, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 13189 + }, + { + "epoch": 0.11503375137360242, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 13190 + }, + { + "epoch": 0.115042472658771, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 13191 + }, + { + "epoch": 0.11505119394393958, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 13192 + }, + { + "epoch": 0.11505991522910816, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 13193 + }, + { + "epoch": 0.11506863651427675, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 13194 + }, + { + "epoch": 0.11507735779944532, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 13195 + }, + { + "epoch": 0.11508607908461391, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 13196 + }, + { + "epoch": 0.1150948003697825, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 13197 + }, + { + "epoch": 0.11510352165495108, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 13198 + }, + { + "epoch": 0.11511224294011965, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 13199 + }, + { + "epoch": 0.11512096422528824, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 13200 + }, + { + "epoch": 0.11512968551045683, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 13201 + }, + { + "epoch": 0.1151384067956254, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 13202 + }, + { + "epoch": 0.11514712808079398, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 13203 + }, + { + "epoch": 0.11515584936596257, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 13204 + }, + { + "epoch": 0.11516457065113116, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 13205 + }, + { + "epoch": 0.11517329193629973, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 13206 + }, + { + "epoch": 0.11518201322146832, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 13207 + }, + { + "epoch": 0.1151907345066369, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 13208 + }, + { + "epoch": 0.11519945579180547, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 13209 + }, + { + "epoch": 0.11520817707697406, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 13210 + }, + { + "epoch": 0.11521689836214265, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 13211 + }, + { + "epoch": 0.11522561964731123, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 13212 + }, + { + "epoch": 0.1152343409324798, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 13213 + }, + { + "epoch": 0.11524306221764839, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 13214 + }, + { + "epoch": 0.11525178350281698, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 13215 + }, + { + "epoch": 0.11526050478798555, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 13216 + }, + { + "epoch": 0.11526922607315414, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 13217 + }, + { + "epoch": 0.11527794735832272, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 13218 + }, + { + "epoch": 0.11528666864349131, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 13219 + }, + { + "epoch": 0.11529538992865988, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 13220 + }, + { + "epoch": 0.11530411121382847, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 13221 + }, + { + "epoch": 0.11531283249899706, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 13222 + }, + { + "epoch": 0.11532155378416563, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 13223 + }, + { + "epoch": 0.11533027506933421, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 13224 + }, + { + "epoch": 0.1153389963545028, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 13225 + }, + { + "epoch": 0.11534771763967139, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 13226 + }, + { + "epoch": 0.11535643892483996, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 13227 + }, + { + "epoch": 0.11536516021000855, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 13228 + }, + { + "epoch": 0.11537388149517713, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 13229 + }, + { + "epoch": 0.1153826027803457, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 13230 + }, + { + "epoch": 0.11539132406551429, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 13231 + }, + { + "epoch": 0.11540004535068288, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 13232 + }, + { + "epoch": 0.11540876663585146, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 13233 + }, + { + "epoch": 0.11541748792102004, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 13234 + }, + { + "epoch": 0.11542620920618862, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 13235 + }, + { + "epoch": 0.11543493049135721, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 13236 + }, + { + "epoch": 0.11544365177652578, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 13237 + }, + { + "epoch": 0.11545237306169437, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 13238 + }, + { + "epoch": 0.11546109434686296, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 13239 + }, + { + "epoch": 0.11546981563203154, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 13240 + }, + { + "epoch": 0.11547853691720011, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 13241 + }, + { + "epoch": 0.1154872582023687, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 13242 + }, + { + "epoch": 0.11549597948753729, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 13243 + }, + { + "epoch": 0.11550470077270586, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 13244 + }, + { + "epoch": 0.11551342205787445, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 13245 + }, + { + "epoch": 0.11552214334304303, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 13246 + }, + { + "epoch": 0.11553086462821162, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 13247 + }, + { + "epoch": 0.11553958591338019, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 13248 + }, + { + "epoch": 0.11554830719854878, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 13249 + }, + { + "epoch": 0.11555702848371736, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 13250 + }, + { + "epoch": 0.11556574976888594, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 13251 + }, + { + "epoch": 0.11557447105405452, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 13252 + }, + { + "epoch": 0.11558319233922311, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 13253 + }, + { + "epoch": 0.1155919136243917, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 13254 + }, + { + "epoch": 0.11560063490956027, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 13255 + }, + { + "epoch": 0.11560935619472885, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 13256 + }, + { + "epoch": 0.11561807747989744, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 13257 + }, + { + "epoch": 0.11562679876506601, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 13258 + }, + { + "epoch": 0.1156355200502346, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 13259 + }, + { + "epoch": 0.11564424133540319, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 13260 + }, + { + "epoch": 0.11565296262057177, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 13261 + }, + { + "epoch": 0.11566168390574035, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 13262 + }, + { + "epoch": 0.11567040519090893, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 13263 + }, + { + "epoch": 0.11567912647607752, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 13264 + }, + { + "epoch": 0.11568784776124609, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 13265 + }, + { + "epoch": 0.11569656904641468, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 13266 + }, + { + "epoch": 0.11570529033158326, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 13267 + }, + { + "epoch": 0.11571401161675185, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 13268 + }, + { + "epoch": 0.11572273290192042, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 13269 + }, + { + "epoch": 0.11573145418708901, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 13270 + }, + { + "epoch": 0.1157401754722576, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 13271 + }, + { + "epoch": 0.11574889675742618, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 13272 + }, + { + "epoch": 0.11575761804259475, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 13273 + }, + { + "epoch": 0.11576633932776334, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 13274 + }, + { + "epoch": 0.11577506061293193, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 13275 + }, + { + "epoch": 0.1157837818981005, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 13276 + }, + { + "epoch": 0.11579250318326909, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 13277 + }, + { + "epoch": 0.11580122446843767, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 13278 + }, + { + "epoch": 0.11580994575360626, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 13279 + }, + { + "epoch": 0.11581866703877483, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 13280 + }, + { + "epoch": 0.11582738832394342, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 13281 + }, + { + "epoch": 0.115836109609112, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 13282 + }, + { + "epoch": 0.11584483089428058, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 13283 + }, + { + "epoch": 0.11585355217944916, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 13284 + }, + { + "epoch": 0.11586227346461775, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 13285 + }, + { + "epoch": 0.11587099474978634, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 13286 + }, + { + "epoch": 0.11587971603495491, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 13287 + }, + { + "epoch": 0.1158884373201235, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 13288 + }, + { + "epoch": 0.11589715860529208, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 13289 + }, + { + "epoch": 0.11590587989046065, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 13290 + }, + { + "epoch": 0.11591460117562924, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 13291 + }, + { + "epoch": 0.11592332246079783, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 13292 + }, + { + "epoch": 0.11593204374596641, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 13293 + }, + { + "epoch": 0.11594076503113498, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 13294 + }, + { + "epoch": 0.11594948631630357, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 13295 + }, + { + "epoch": 0.11595820760147216, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 13296 + }, + { + "epoch": 0.11596692888664073, + "grad_norm": 0.35546875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 13297 + }, + { + "epoch": 0.11597565017180932, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 13298 + }, + { + "epoch": 0.1159843714569779, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 13299 + }, + { + "epoch": 0.11599309274214649, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 13300 + }, + { + "epoch": 0.11600181402731506, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 13301 + }, + { + "epoch": 0.11601053531248365, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 13302 + }, + { + "epoch": 0.11601925659765223, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 13303 + }, + { + "epoch": 0.1160279778828208, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 13304 + }, + { + "epoch": 0.1160366991679894, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 13305 + }, + { + "epoch": 0.11604542045315798, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 13306 + }, + { + "epoch": 0.11605414173832657, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 13307 + }, + { + "epoch": 0.11606286302349514, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 13308 + }, + { + "epoch": 0.11607158430866373, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 13309 + }, + { + "epoch": 0.11608030559383231, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 13310 + }, + { + "epoch": 0.11608902687900088, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 13311 + }, + { + "epoch": 0.11609774816416947, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 13312 + }, + { + "epoch": 0.11610646944933806, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 13313 + }, + { + "epoch": 0.11611519073450664, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 13314 + }, + { + "epoch": 0.11612391201967522, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 13315 + }, + { + "epoch": 0.1161326333048438, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 13316 + }, + { + "epoch": 0.11614135459001239, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 13317 + }, + { + "epoch": 0.11615007587518096, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 13318 + }, + { + "epoch": 0.11615879716034955, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 13319 + }, + { + "epoch": 0.11616751844551813, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 13320 + }, + { + "epoch": 0.11617623973068672, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 13321 + }, + { + "epoch": 0.11618496101585529, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 13322 + }, + { + "epoch": 0.11619368230102388, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 13323 + }, + { + "epoch": 0.11620240358619247, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 13324 + }, + { + "epoch": 0.11621112487136104, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 13325 + }, + { + "epoch": 0.11621984615652962, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 13326 + }, + { + "epoch": 0.11622856744169821, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 13327 + }, + { + "epoch": 0.1162372887268668, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 13328 + }, + { + "epoch": 0.11624601001203537, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 13329 + }, + { + "epoch": 0.11625473129720396, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 13330 + }, + { + "epoch": 0.11626345258237254, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 13331 + }, + { + "epoch": 0.11627217386754111, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 13332 + }, + { + "epoch": 0.1162808951527097, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 13333 + }, + { + "epoch": 0.11628961643787829, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 13334 + }, + { + "epoch": 0.11629833772304687, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 13335 + }, + { + "epoch": 0.11630705900821545, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 13336 + }, + { + "epoch": 0.11631578029338403, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 13337 + }, + { + "epoch": 0.11632450157855262, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 13338 + }, + { + "epoch": 0.11633322286372119, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 13339 + }, + { + "epoch": 0.11634194414888978, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 13340 + }, + { + "epoch": 0.11635066543405836, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 13341 + }, + { + "epoch": 0.11635938671922695, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 13342 + }, + { + "epoch": 0.11636810800439552, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 13343 + }, + { + "epoch": 0.11637682928956411, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 13344 + }, + { + "epoch": 0.1163855505747327, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 13345 + }, + { + "epoch": 0.11639427185990127, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 13346 + }, + { + "epoch": 0.11640299314506986, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 13347 + }, + { + "epoch": 0.11641171443023844, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 13348 + }, + { + "epoch": 0.11642043571540703, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 13349 + }, + { + "epoch": 0.1164291570005756, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 13350 + }, + { + "epoch": 0.11643787828574419, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 13351 + }, + { + "epoch": 0.11644659957091277, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 13352 + }, + { + "epoch": 0.11645532085608135, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 13353 + }, + { + "epoch": 0.11646404214124993, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 13354 + }, + { + "epoch": 0.11647276342641852, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 13355 + }, + { + "epoch": 0.1164814847115871, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 13356 + }, + { + "epoch": 0.11649020599675568, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 13357 + }, + { + "epoch": 0.11649892728192426, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 13358 + }, + { + "epoch": 0.11650764856709285, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 13359 + }, + { + "epoch": 0.11651636985226142, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 13360 + }, + { + "epoch": 0.11652509113743001, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 13361 + }, + { + "epoch": 0.1165338124225986, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 13362 + }, + { + "epoch": 0.11654253370776718, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 13363 + }, + { + "epoch": 0.11655125499293575, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 13364 + }, + { + "epoch": 0.11655997627810434, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 13365 + }, + { + "epoch": 0.11656869756327293, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 13366 + }, + { + "epoch": 0.1165774188484415, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 13367 + }, + { + "epoch": 0.11658614013361009, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 13368 + }, + { + "epoch": 0.11659486141877867, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 13369 + }, + { + "epoch": 0.11660358270394726, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 13370 + }, + { + "epoch": 0.11661230398911583, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 13371 + }, + { + "epoch": 0.11662102527428442, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 13372 + }, + { + "epoch": 0.116629746559453, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 13373 + }, + { + "epoch": 0.11663846784462158, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 13374 + }, + { + "epoch": 0.11664718912979016, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 13375 + }, + { + "epoch": 0.11665591041495875, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 13376 + }, + { + "epoch": 0.11666463170012734, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 13377 + }, + { + "epoch": 0.11667335298529591, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 13378 + }, + { + "epoch": 0.1166820742704645, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 13379 + }, + { + "epoch": 0.11669079555563308, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 13380 + }, + { + "epoch": 0.11669951684080165, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 13381 + }, + { + "epoch": 0.11670823812597024, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 13382 + }, + { + "epoch": 0.11671695941113883, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 13383 + }, + { + "epoch": 0.11672568069630741, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 13384 + }, + { + "epoch": 0.11673440198147599, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 13385 + }, + { + "epoch": 0.11674312326664457, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 13386 + }, + { + "epoch": 0.11675184455181316, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 13387 + }, + { + "epoch": 0.11676056583698174, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 13388 + }, + { + "epoch": 0.11676928712215032, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 13389 + }, + { + "epoch": 0.1167780084073189, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 13390 + }, + { + "epoch": 0.11678672969248749, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 13391 + }, + { + "epoch": 0.11679545097765606, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 13392 + }, + { + "epoch": 0.11680417226282465, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 13393 + }, + { + "epoch": 0.11681289354799324, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 13394 + }, + { + "epoch": 0.11682161483316182, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 13395 + }, + { + "epoch": 0.1168303361183304, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 13396 + }, + { + "epoch": 0.11683905740349898, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 13397 + }, + { + "epoch": 0.11684777868866757, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 13398 + }, + { + "epoch": 0.11685649997383614, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 13399 + }, + { + "epoch": 0.11686522125900473, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 13400 + }, + { + "epoch": 0.11687394254417331, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 13401 + }, + { + "epoch": 0.1168826638293419, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 13402 + }, + { + "epoch": 0.11689138511451047, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 13403 + }, + { + "epoch": 0.11690010639967906, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 13404 + }, + { + "epoch": 0.11690882768484764, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 13405 + }, + { + "epoch": 0.11691754897001622, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 13406 + }, + { + "epoch": 0.1169262702551848, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 13407 + }, + { + "epoch": 0.11693499154035339, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 13408 + }, + { + "epoch": 0.11694371282552198, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 13409 + }, + { + "epoch": 0.11695243411069055, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 13410 + }, + { + "epoch": 0.11696115539585913, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 13411 + }, + { + "epoch": 0.11696987668102772, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 13412 + }, + { + "epoch": 0.1169785979661963, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 13413 + }, + { + "epoch": 0.11698731925136488, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 13414 + }, + { + "epoch": 0.11699604053653347, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 13415 + }, + { + "epoch": 0.11700476182170205, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 13416 + }, + { + "epoch": 0.11701348310687062, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 13417 + }, + { + "epoch": 0.11702220439203921, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 13418 + }, + { + "epoch": 0.1170309256772078, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 13419 + }, + { + "epoch": 0.11703964696237637, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 13420 + }, + { + "epoch": 0.11704836824754496, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 13421 + }, + { + "epoch": 0.11705708953271354, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 13422 + }, + { + "epoch": 0.11706581081788213, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 13423 + }, + { + "epoch": 0.1170745321030507, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 13424 + }, + { + "epoch": 0.11708325338821929, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 13425 + }, + { + "epoch": 0.11709197467338787, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 13426 + }, + { + "epoch": 0.11710069595855645, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 13427 + }, + { + "epoch": 0.11710941724372503, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 13428 + }, + { + "epoch": 0.11711813852889362, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 13429 + }, + { + "epoch": 0.1171268598140622, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 13430 + }, + { + "epoch": 0.11713558109923078, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 13431 + }, + { + "epoch": 0.11714430238439937, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 13432 + }, + { + "epoch": 0.11715302366956795, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 13433 + }, + { + "epoch": 0.11716174495473652, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 13434 + }, + { + "epoch": 0.11717046623990511, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 13435 + }, + { + "epoch": 0.1171791875250737, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 13436 + }, + { + "epoch": 0.11718790881024228, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 13437 + }, + { + "epoch": 0.11719663009541086, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 13438 + }, + { + "epoch": 0.11720535138057944, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 13439 + }, + { + "epoch": 0.11721407266574803, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 13440 + }, + { + "epoch": 0.1172227939509166, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 13441 + }, + { + "epoch": 0.11723151523608519, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 13442 + }, + { + "epoch": 0.11724023652125377, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 13443 + }, + { + "epoch": 0.11724895780642236, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 13444 + }, + { + "epoch": 0.11725767909159093, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 13445 + }, + { + "epoch": 0.11726640037675952, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 13446 + }, + { + "epoch": 0.1172751216619281, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 13447 + }, + { + "epoch": 0.11728384294709668, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 13448 + }, + { + "epoch": 0.11729256423226526, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 13449 + }, + { + "epoch": 0.11730128551743385, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 13450 + }, + { + "epoch": 0.11731000680260244, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 13451 + }, + { + "epoch": 0.11731872808777101, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 13452 + }, + { + "epoch": 0.1173274493729396, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 13453 + }, + { + "epoch": 0.11733617065810818, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 13454 + }, + { + "epoch": 0.11734489194327676, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 13455 + }, + { + "epoch": 0.11735361322844534, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 13456 + }, + { + "epoch": 0.11736233451361393, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 13457 + }, + { + "epoch": 0.11737105579878251, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 13458 + }, + { + "epoch": 0.11737977708395109, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 13459 + }, + { + "epoch": 0.11738849836911967, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 13460 + }, + { + "epoch": 0.11739721965428826, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 13461 + }, + { + "epoch": 0.11740594093945683, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 13462 + }, + { + "epoch": 0.11741466222462542, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 13463 + }, + { + "epoch": 0.117423383509794, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 13464 + }, + { + "epoch": 0.11743210479496259, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 13465 + }, + { + "epoch": 0.11744082608013116, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 13466 + }, + { + "epoch": 0.11744954736529975, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 13467 + }, + { + "epoch": 0.11745826865046834, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 13468 + }, + { + "epoch": 0.11746698993563691, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 13469 + }, + { + "epoch": 0.1174757112208055, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 13470 + }, + { + "epoch": 0.11748443250597408, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 13471 + }, + { + "epoch": 0.11749315379114267, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 13472 + }, + { + "epoch": 0.11750187507631124, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 13473 + }, + { + "epoch": 0.11751059636147983, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 13474 + }, + { + "epoch": 0.11751931764664841, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 13475 + }, + { + "epoch": 0.11752803893181699, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 13476 + }, + { + "epoch": 0.11753676021698557, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 13477 + }, + { + "epoch": 0.11754548150215416, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 13478 + }, + { + "epoch": 0.11755420278732275, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 13479 + }, + { + "epoch": 0.11756292407249132, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 13480 + }, + { + "epoch": 0.1175716453576599, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 13481 + }, + { + "epoch": 0.11758036664282849, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 13482 + }, + { + "epoch": 0.11758908792799706, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 13483 + }, + { + "epoch": 0.11759780921316565, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 13484 + }, + { + "epoch": 0.11760653049833424, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 13485 + }, + { + "epoch": 0.11761525178350282, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 13486 + }, + { + "epoch": 0.1176239730686714, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 13487 + }, + { + "epoch": 0.11763269435383998, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 13488 + }, + { + "epoch": 0.11764141563900857, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 13489 + }, + { + "epoch": 0.11765013692417714, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 13490 + }, + { + "epoch": 0.11765885820934573, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 13491 + }, + { + "epoch": 0.11766757949451431, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 13492 + }, + { + "epoch": 0.1176763007796829, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 13493 + }, + { + "epoch": 0.11768502206485147, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 13494 + }, + { + "epoch": 0.11769374335002006, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 13495 + }, + { + "epoch": 0.11770246463518864, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 13496 + }, + { + "epoch": 0.11771118592035722, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 13497 + }, + { + "epoch": 0.1177199072055258, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 13498 + }, + { + "epoch": 0.11772862849069439, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 13499 + }, + { + "epoch": 0.11773734977586298, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 13500 + }, + { + "epoch": 0.11774607106103155, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 13501 + }, + { + "epoch": 0.11775479234620014, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 13502 + }, + { + "epoch": 0.11776351363136872, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 13503 + }, + { + "epoch": 0.11777223491653731, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 13504 + }, + { + "epoch": 0.11778095620170588, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 13505 + }, + { + "epoch": 0.11778967748687447, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 13506 + }, + { + "epoch": 0.11779839877204305, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 13507 + }, + { + "epoch": 0.11780712005721163, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 13508 + }, + { + "epoch": 0.11781584134238021, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 13509 + }, + { + "epoch": 0.1178245626275488, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 13510 + }, + { + "epoch": 0.11783328391271738, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 13511 + }, + { + "epoch": 0.11784200519788596, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 13512 + }, + { + "epoch": 0.11785072648305454, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 13513 + }, + { + "epoch": 0.11785944776822313, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 13514 + }, + { + "epoch": 0.1178681690533917, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 13515 + }, + { + "epoch": 0.11787689033856029, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 13516 + }, + { + "epoch": 0.11788561162372888, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 13517 + }, + { + "epoch": 0.11789433290889746, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 13518 + }, + { + "epoch": 0.11790305419406603, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 13519 + }, + { + "epoch": 0.11791177547923462, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 13520 + }, + { + "epoch": 0.11792049676440321, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 13521 + }, + { + "epoch": 0.11792921804957178, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 13522 + }, + { + "epoch": 0.11793793933474037, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 13523 + }, + { + "epoch": 0.11794666061990895, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 13524 + }, + { + "epoch": 0.11795538190507754, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 13525 + }, + { + "epoch": 0.11796410319024611, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 13526 + }, + { + "epoch": 0.1179728244754147, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 13527 + }, + { + "epoch": 0.11798154576058328, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 13528 + }, + { + "epoch": 0.11799026704575186, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 13529 + }, + { + "epoch": 0.11799898833092044, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 13530 + }, + { + "epoch": 0.11800770961608903, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 13531 + }, + { + "epoch": 0.11801643090125762, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 13532 + }, + { + "epoch": 0.11802515218642619, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 13533 + }, + { + "epoch": 0.11803387347159477, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 13534 + }, + { + "epoch": 0.11804259475676336, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 13535 + }, + { + "epoch": 0.11805131604193193, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 13536 + }, + { + "epoch": 0.11806003732710052, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 13537 + }, + { + "epoch": 0.1180687586122691, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 13538 + }, + { + "epoch": 0.11807747989743769, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 13539 + }, + { + "epoch": 0.11808620118260627, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 13540 + }, + { + "epoch": 0.11809492246777485, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 13541 + }, + { + "epoch": 0.11810364375294344, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 13542 + }, + { + "epoch": 0.11811236503811201, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 13543 + }, + { + "epoch": 0.1181210863232806, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 13544 + }, + { + "epoch": 0.11812980760844918, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 13545 + }, + { + "epoch": 0.11813852889361777, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 13546 + }, + { + "epoch": 0.11814725017878634, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 13547 + }, + { + "epoch": 0.11815597146395493, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 13548 + }, + { + "epoch": 0.11816469274912351, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 13549 + }, + { + "epoch": 0.11817341403429209, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 13550 + }, + { + "epoch": 0.11818213531946067, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 13551 + }, + { + "epoch": 0.11819085660462926, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 13552 + }, + { + "epoch": 0.11819957788979785, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 13553 + }, + { + "epoch": 0.11820829917496642, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 13554 + }, + { + "epoch": 0.118217020460135, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 13555 + }, + { + "epoch": 0.11822574174530359, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 13556 + }, + { + "epoch": 0.11823446303047216, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 13557 + }, + { + "epoch": 0.11824318431564075, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 13558 + }, + { + "epoch": 0.11825190560080934, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 13559 + }, + { + "epoch": 0.11826062688597792, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 13560 + }, + { + "epoch": 0.1182693481711465, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 13561 + }, + { + "epoch": 0.11827806945631508, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 13562 + }, + { + "epoch": 0.11828679074148367, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 13563 + }, + { + "epoch": 0.11829551202665224, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 13564 + }, + { + "epoch": 0.11830423331182083, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 13565 + }, + { + "epoch": 0.11831295459698941, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 13566 + }, + { + "epoch": 0.118321675882158, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 13567 + }, + { + "epoch": 0.11833039716732657, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 13568 + }, + { + "epoch": 0.11833911845249516, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 13569 + }, + { + "epoch": 0.11834783973766375, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 13570 + }, + { + "epoch": 0.11835656102283232, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 13571 + }, + { + "epoch": 0.1183652823080009, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 13572 + }, + { + "epoch": 0.11837400359316949, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 13573 + }, + { + "epoch": 0.11838272487833808, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 13574 + }, + { + "epoch": 0.11839144616350665, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 13575 + }, + { + "epoch": 0.11840016744867524, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 13576 + }, + { + "epoch": 0.11840888873384382, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 13577 + }, + { + "epoch": 0.1184176100190124, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 13578 + }, + { + "epoch": 0.11842633130418098, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 13579 + }, + { + "epoch": 0.11843505258934957, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 13580 + }, + { + "epoch": 0.11844377387451815, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 13581 + }, + { + "epoch": 0.11845249515968673, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 13582 + }, + { + "epoch": 0.11846121644485531, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 13583 + }, + { + "epoch": 0.1184699377300239, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 13584 + }, + { + "epoch": 0.11847865901519247, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 13585 + }, + { + "epoch": 0.11848738030036106, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 13586 + }, + { + "epoch": 0.11849610158552965, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 13587 + }, + { + "epoch": 0.11850482287069823, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 13588 + }, + { + "epoch": 0.1185135441558668, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 13589 + }, + { + "epoch": 0.11852226544103539, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 13590 + }, + { + "epoch": 0.11853098672620398, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 13591 + }, + { + "epoch": 0.11853970801137255, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 13592 + }, + { + "epoch": 0.11854842929654114, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 13593 + }, + { + "epoch": 0.11855715058170972, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 13594 + }, + { + "epoch": 0.11856587186687831, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 13595 + }, + { + "epoch": 0.11857459315204688, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 13596 + }, + { + "epoch": 0.11858331443721547, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 13597 + }, + { + "epoch": 0.11859203572238405, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 13598 + }, + { + "epoch": 0.11860075700755263, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 13599 + }, + { + "epoch": 0.11860947829272121, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 13600 + }, + { + "epoch": 0.1186181995778898, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 13601 + }, + { + "epoch": 0.11862692086305839, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 13602 + }, + { + "epoch": 0.11863564214822696, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 13603 + }, + { + "epoch": 0.11864436343339554, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 13604 + }, + { + "epoch": 0.11865308471856413, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 13605 + }, + { + "epoch": 0.1186618060037327, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 13606 + }, + { + "epoch": 0.11867052728890129, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 13607 + }, + { + "epoch": 0.11867924857406988, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 13608 + }, + { + "epoch": 0.11868796985923846, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 13609 + }, + { + "epoch": 0.11869669114440703, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 13610 + }, + { + "epoch": 0.11870541242957562, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 13611 + }, + { + "epoch": 0.11871413371474421, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 13612 + }, + { + "epoch": 0.11872285499991278, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 13613 + }, + { + "epoch": 0.11873157628508137, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 13614 + }, + { + "epoch": 0.11874029757024995, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 13615 + }, + { + "epoch": 0.11874901885541854, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 13616 + }, + { + "epoch": 0.11875774014058711, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 13617 + }, + { + "epoch": 0.1187664614257557, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 13618 + }, + { + "epoch": 0.11877518271092428, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 13619 + }, + { + "epoch": 0.11878390399609287, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 13620 + }, + { + "epoch": 0.11879262528126144, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 13621 + }, + { + "epoch": 0.11880134656643003, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 13622 + }, + { + "epoch": 0.11881006785159862, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 13623 + }, + { + "epoch": 0.11881878913676719, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 13624 + }, + { + "epoch": 0.11882751042193578, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 13625 + }, + { + "epoch": 0.11883623170710436, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 13626 + }, + { + "epoch": 0.11884495299227295, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 13627 + }, + { + "epoch": 0.11885367427744152, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 13628 + }, + { + "epoch": 0.11886239556261011, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 13629 + }, + { + "epoch": 0.1188711168477787, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 13630 + }, + { + "epoch": 0.11887983813294727, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 13631 + }, + { + "epoch": 0.11888855941811585, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 13632 + }, + { + "epoch": 0.11889728070328444, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 13633 + }, + { + "epoch": 0.11890600198845303, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 13634 + }, + { + "epoch": 0.1189147232736216, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 13635 + }, + { + "epoch": 0.11892344455879018, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 13636 + }, + { + "epoch": 0.11893216584395877, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 13637 + }, + { + "epoch": 0.11894088712912734, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 13638 + }, + { + "epoch": 0.11894960841429593, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 13639 + }, + { + "epoch": 0.11895832969946452, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 13640 + }, + { + "epoch": 0.1189670509846331, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 13641 + }, + { + "epoch": 0.11897577226980167, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 13642 + }, + { + "epoch": 0.11898449355497026, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 13643 + }, + { + "epoch": 0.11899321484013885, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 13644 + }, + { + "epoch": 0.11900193612530742, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 13645 + }, + { + "epoch": 0.119010657410476, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 13646 + }, + { + "epoch": 0.11901937869564459, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 13647 + }, + { + "epoch": 0.11902809998081318, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 13648 + }, + { + "epoch": 0.11903682126598175, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 13649 + }, + { + "epoch": 0.11904554255115034, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 13650 + }, + { + "epoch": 0.11905426383631892, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 13651 + }, + { + "epoch": 0.1190629851214875, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 13652 + }, + { + "epoch": 0.11907170640665608, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 13653 + }, + { + "epoch": 0.11908042769182467, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 13654 + }, + { + "epoch": 0.11908914897699326, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 13655 + }, + { + "epoch": 0.11909787026216183, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 13656 + }, + { + "epoch": 0.11910659154733041, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 13657 + }, + { + "epoch": 0.119115312832499, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 13658 + }, + { + "epoch": 0.11912403411766757, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 13659 + }, + { + "epoch": 0.11913275540283616, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 13660 + }, + { + "epoch": 0.11914147668800475, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 13661 + }, + { + "epoch": 0.11915019797317333, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 13662 + }, + { + "epoch": 0.1191589192583419, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 13663 + }, + { + "epoch": 0.11916764054351049, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 13664 + }, + { + "epoch": 0.11917636182867908, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 13665 + }, + { + "epoch": 0.11918508311384765, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 13666 + }, + { + "epoch": 0.11919380439901624, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 13667 + }, + { + "epoch": 0.11920252568418482, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 13668 + }, + { + "epoch": 0.11921124696935341, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 13669 + }, + { + "epoch": 0.11921996825452198, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 13670 + }, + { + "epoch": 0.11922868953969057, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 13671 + }, + { + "epoch": 0.11923741082485916, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 13672 + }, + { + "epoch": 0.11924613211002773, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 13673 + }, + { + "epoch": 0.11925485339519631, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 13674 + }, + { + "epoch": 0.1192635746803649, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 13675 + }, + { + "epoch": 0.11927229596553349, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 13676 + }, + { + "epoch": 0.11928101725070206, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 13677 + }, + { + "epoch": 0.11928973853587065, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 13678 + }, + { + "epoch": 0.11929845982103923, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 13679 + }, + { + "epoch": 0.1193071811062078, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 13680 + }, + { + "epoch": 0.11931590239137639, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 13681 + }, + { + "epoch": 0.11932462367654498, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 13682 + }, + { + "epoch": 0.11933334496171356, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 13683 + }, + { + "epoch": 0.11934206624688214, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 13684 + }, + { + "epoch": 0.11935078753205072, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 13685 + }, + { + "epoch": 0.11935950881721931, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 13686 + }, + { + "epoch": 0.11936823010238788, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 13687 + }, + { + "epoch": 0.11937695138755647, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 13688 + }, + { + "epoch": 0.11938567267272505, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 13689 + }, + { + "epoch": 0.11939439395789364, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 13690 + }, + { + "epoch": 0.11940311524306221, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 13691 + }, + { + "epoch": 0.1194118365282308, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 13692 + }, + { + "epoch": 0.11942055781339939, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 13693 + }, + { + "epoch": 0.11942927909856796, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 13694 + }, + { + "epoch": 0.11943800038373655, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 13695 + }, + { + "epoch": 0.11944672166890513, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 13696 + }, + { + "epoch": 0.11945544295407372, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 13697 + }, + { + "epoch": 0.11946416423924229, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 13698 + }, + { + "epoch": 0.11947288552441088, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 13699 + }, + { + "epoch": 0.11948160680957946, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 13700 + }, + { + "epoch": 0.11949032809474804, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 13701 + }, + { + "epoch": 0.11949904937991662, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 13702 + }, + { + "epoch": 0.11950777066508521, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 13703 + }, + { + "epoch": 0.1195164919502538, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 13704 + }, + { + "epoch": 0.11952521323542237, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 13705 + }, + { + "epoch": 0.11953393452059095, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 13706 + }, + { + "epoch": 0.11954265580575954, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 13707 + }, + { + "epoch": 0.11955137709092811, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 13708 + }, + { + "epoch": 0.1195600983760967, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 13709 + }, + { + "epoch": 0.11956881966126529, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 13710 + }, + { + "epoch": 0.11957754094643387, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 13711 + }, + { + "epoch": 0.11958626223160244, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 13712 + }, + { + "epoch": 0.11959498351677103, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 13713 + }, + { + "epoch": 0.11960370480193962, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 13714 + }, + { + "epoch": 0.11961242608710819, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 13715 + }, + { + "epoch": 0.11962114737227678, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 13716 + }, + { + "epoch": 0.11962986865744536, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 13717 + }, + { + "epoch": 0.11963858994261395, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 13718 + }, + { + "epoch": 0.11964731122778252, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 13719 + }, + { + "epoch": 0.11965603251295111, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 13720 + }, + { + "epoch": 0.1196647537981197, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 13721 + }, + { + "epoch": 0.11967347508328827, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 13722 + }, + { + "epoch": 0.11968219636845685, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 13723 + }, + { + "epoch": 0.11969091765362544, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 13724 + }, + { + "epoch": 0.11969963893879403, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 13725 + }, + { + "epoch": 0.1197083602239626, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 13726 + }, + { + "epoch": 0.11971708150913118, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 13727 + }, + { + "epoch": 0.11972580279429977, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 13728 + }, + { + "epoch": 0.11973452407946834, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 13729 + }, + { + "epoch": 0.11974324536463693, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 13730 + }, + { + "epoch": 0.11975196664980552, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 13731 + }, + { + "epoch": 0.1197606879349741, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 13732 + }, + { + "epoch": 0.11976940922014268, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 13733 + }, + { + "epoch": 0.11977813050531126, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 13734 + }, + { + "epoch": 0.11978685179047985, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 13735 + }, + { + "epoch": 0.11979557307564843, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 13736 + }, + { + "epoch": 0.119804294360817, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 13737 + }, + { + "epoch": 0.1198130156459856, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 13738 + }, + { + "epoch": 0.11982173693115418, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 13739 + }, + { + "epoch": 0.11983045821632275, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 13740 + }, + { + "epoch": 0.11983917950149134, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 13741 + }, + { + "epoch": 0.11984790078665992, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 13742 + }, + { + "epoch": 0.11985662207182851, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 13743 + }, + { + "epoch": 0.11986534335699708, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 13744 + }, + { + "epoch": 0.11987406464216567, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 13745 + }, + { + "epoch": 0.11988278592733426, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 13746 + }, + { + "epoch": 0.11989150721250283, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 13747 + }, + { + "epoch": 0.11990022849767142, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 13748 + }, + { + "epoch": 0.11990894978284, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 13749 + }, + { + "epoch": 0.11991767106800859, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 13750 + }, + { + "epoch": 0.11992639235317716, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 13751 + }, + { + "epoch": 0.11993511363834575, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 13752 + }, + { + "epoch": 0.11994383492351433, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 13753 + }, + { + "epoch": 0.1199525562086829, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 13754 + }, + { + "epoch": 0.11996127749385149, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 13755 + }, + { + "epoch": 0.11996999877902008, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 13756 + }, + { + "epoch": 0.11997872006418867, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 13757 + }, + { + "epoch": 0.11998744134935724, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 13758 + }, + { + "epoch": 0.11999616263452582, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 13759 + }, + { + "epoch": 0.12000488391969441, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 13760 + }, + { + "epoch": 0.12001360520486298, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 13761 + }, + { + "epoch": 0.12002232649003157, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 13762 + }, + { + "epoch": 0.12003104777520016, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 13763 + }, + { + "epoch": 0.12003976906036874, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 13764 + }, + { + "epoch": 0.12004849034553731, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 13765 + }, + { + "epoch": 0.1200572116307059, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 13766 + }, + { + "epoch": 0.12006593291587449, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 13767 + }, + { + "epoch": 0.12007465420104306, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 13768 + }, + { + "epoch": 0.12008337548621165, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 13769 + }, + { + "epoch": 0.12009209677138023, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 13770 + }, + { + "epoch": 0.12010081805654882, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 13771 + }, + { + "epoch": 0.12010953934171739, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 13772 + }, + { + "epoch": 0.12011826062688598, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 13773 + }, + { + "epoch": 0.12012698191205456, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 13774 + }, + { + "epoch": 0.12013570319722314, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 13775 + }, + { + "epoch": 0.12014442448239172, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 13776 + }, + { + "epoch": 0.12015314576756031, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 13777 + }, + { + "epoch": 0.1201618670527289, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 13778 + }, + { + "epoch": 0.12017058833789747, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 13779 + }, + { + "epoch": 0.12017930962306606, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 13780 + }, + { + "epoch": 0.12018803090823464, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 13781 + }, + { + "epoch": 0.12019675219340321, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 13782 + }, + { + "epoch": 0.1202054734785718, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 13783 + }, + { + "epoch": 0.12021419476374039, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 13784 + }, + { + "epoch": 0.12022291604890897, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 13785 + }, + { + "epoch": 0.12023163733407755, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 13786 + }, + { + "epoch": 0.12024035861924613, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 13787 + }, + { + "epoch": 0.12024907990441472, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 13788 + }, + { + "epoch": 0.12025780118958329, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 13789 + }, + { + "epoch": 0.12026652247475188, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 13790 + }, + { + "epoch": 0.12027524375992046, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 13791 + }, + { + "epoch": 0.12028396504508905, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 13792 + }, + { + "epoch": 0.12029268633025762, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 13793 + }, + { + "epoch": 0.12030140761542621, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 13794 + }, + { + "epoch": 0.1203101289005948, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 13795 + }, + { + "epoch": 0.12031885018576337, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 13796 + }, + { + "epoch": 0.12032757147093195, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 13797 + }, + { + "epoch": 0.12033629275610054, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 13798 + }, + { + "epoch": 0.12034501404126913, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 13799 + }, + { + "epoch": 0.1203537353264377, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 13800 + }, + { + "epoch": 0.12036245661160629, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 13801 + }, + { + "epoch": 0.12037117789677487, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 13802 + }, + { + "epoch": 0.12037989918194344, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 13803 + }, + { + "epoch": 0.12038862046711203, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 13804 + }, + { + "epoch": 0.12039734175228062, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 13805 + }, + { + "epoch": 0.1204060630374492, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 13806 + }, + { + "epoch": 0.12041478432261778, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 13807 + }, + { + "epoch": 0.12042350560778636, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 13808 + }, + { + "epoch": 0.12043222689295495, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 13809 + }, + { + "epoch": 0.12044094817812352, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 13810 + }, + { + "epoch": 0.12044966946329211, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 13811 + }, + { + "epoch": 0.1204583907484607, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 13812 + }, + { + "epoch": 0.12046711203362928, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 13813 + }, + { + "epoch": 0.12047583331879785, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 13814 + }, + { + "epoch": 0.12048455460396644, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 13815 + }, + { + "epoch": 0.12049327588913503, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 13816 + }, + { + "epoch": 0.1205019971743036, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 13817 + }, + { + "epoch": 0.12051071845947219, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 13818 + }, + { + "epoch": 0.12051943974464077, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 13819 + }, + { + "epoch": 0.12052816102980936, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 13820 + }, + { + "epoch": 0.12053688231497793, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 13821 + }, + { + "epoch": 0.12054560360014652, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 13822 + }, + { + "epoch": 0.1205543248853151, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 13823 + }, + { + "epoch": 0.12056304617048368, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 13824 + }, + { + "epoch": 0.12057176745565226, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 13825 + }, + { + "epoch": 0.12058048874082085, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 13826 + }, + { + "epoch": 0.12058921002598944, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 13827 + }, + { + "epoch": 0.12059793131115801, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 13828 + }, + { + "epoch": 0.1206066525963266, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 13829 + }, + { + "epoch": 0.12061537388149518, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 13830 + }, + { + "epoch": 0.12062409516666375, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 13831 + }, + { + "epoch": 0.12063281645183234, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 13832 + }, + { + "epoch": 0.12064153773700093, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 13833 + }, + { + "epoch": 0.12065025902216951, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 13834 + }, + { + "epoch": 0.12065898030733808, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 13835 + }, + { + "epoch": 0.12066770159250667, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 13836 + }, + { + "epoch": 0.12067642287767526, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 13837 + }, + { + "epoch": 0.12068514416284383, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 13838 + }, + { + "epoch": 0.12069386544801242, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 13839 + }, + { + "epoch": 0.120702586733181, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 13840 + }, + { + "epoch": 0.12071130801834959, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 13841 + }, + { + "epoch": 0.12072002930351816, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 13842 + }, + { + "epoch": 0.12072875058868675, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 13843 + }, + { + "epoch": 0.12073747187385533, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 13844 + }, + { + "epoch": 0.12074619315902392, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 13845 + }, + { + "epoch": 0.1207549144441925, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 13846 + }, + { + "epoch": 0.12076363572936108, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 13847 + }, + { + "epoch": 0.12077235701452967, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 13848 + }, + { + "epoch": 0.12078107829969824, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 13849 + }, + { + "epoch": 0.12078979958486682, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 13850 + }, + { + "epoch": 0.12079852087003541, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 13851 + }, + { + "epoch": 0.120807242155204, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 13852 + }, + { + "epoch": 0.12081596344037257, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 13853 + }, + { + "epoch": 0.12082468472554116, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 13854 + }, + { + "epoch": 0.12083340601070974, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 13855 + }, + { + "epoch": 0.12084212729587832, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 13856 + }, + { + "epoch": 0.1208508485810469, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 13857 + }, + { + "epoch": 0.12085956986621549, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 13858 + }, + { + "epoch": 0.12086829115138407, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 13859 + }, + { + "epoch": 0.12087701243655265, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 13860 + }, + { + "epoch": 0.12088573372172123, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 13861 + }, + { + "epoch": 0.12089445500688982, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 13862 + }, + { + "epoch": 0.12090317629205839, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 13863 + }, + { + "epoch": 0.12091189757722698, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 13864 + }, + { + "epoch": 0.12092061886239557, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 13865 + }, + { + "epoch": 0.12092934014756415, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 13866 + }, + { + "epoch": 0.12093806143273272, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 13867 + }, + { + "epoch": 0.12094678271790131, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 13868 + }, + { + "epoch": 0.1209555040030699, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 13869 + }, + { + "epoch": 0.12096422528823847, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 13870 + }, + { + "epoch": 0.12097294657340706, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 13871 + }, + { + "epoch": 0.12098166785857564, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 13872 + }, + { + "epoch": 0.12099038914374423, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 13873 + }, + { + "epoch": 0.1209991104289128, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 13874 + }, + { + "epoch": 0.12100783171408139, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 13875 + }, + { + "epoch": 0.12101655299924997, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 13876 + }, + { + "epoch": 0.12102527428441855, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 13877 + }, + { + "epoch": 0.12103399556958713, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 13878 + }, + { + "epoch": 0.12104271685475572, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 13879 + }, + { + "epoch": 0.1210514381399243, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 13880 + }, + { + "epoch": 0.12106015942509288, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 13881 + }, + { + "epoch": 0.12106888071026146, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 13882 + }, + { + "epoch": 0.12107760199543005, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 13883 + }, + { + "epoch": 0.12108632328059862, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 13884 + }, + { + "epoch": 0.12109504456576721, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 13885 + }, + { + "epoch": 0.1211037658509358, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 13886 + }, + { + "epoch": 0.12111248713610438, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 13887 + }, + { + "epoch": 0.12112120842127296, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 13888 + }, + { + "epoch": 0.12112992970644154, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 13889 + }, + { + "epoch": 0.12113865099161013, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 13890 + }, + { + "epoch": 0.1211473722767787, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 13891 + }, + { + "epoch": 0.12115609356194729, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 13892 + }, + { + "epoch": 0.12116481484711587, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 13893 + }, + { + "epoch": 0.12117353613228446, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 13894 + }, + { + "epoch": 0.12118225741745303, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 13895 + }, + { + "epoch": 0.12119097870262162, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 13896 + }, + { + "epoch": 0.1211996999877902, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 13897 + }, + { + "epoch": 0.12120842127295878, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 13898 + }, + { + "epoch": 0.12121714255812736, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 13899 + }, + { + "epoch": 0.12122586384329595, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 13900 + }, + { + "epoch": 0.12123458512846454, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 13901 + }, + { + "epoch": 0.12124330641363311, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 13902 + }, + { + "epoch": 0.1212520276988017, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 13903 + }, + { + "epoch": 0.12126074898397028, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 13904 + }, + { + "epoch": 0.12126947026913885, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 13905 + }, + { + "epoch": 0.12127819155430744, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 13906 + }, + { + "epoch": 0.12128691283947603, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 13907 + }, + { + "epoch": 0.12129563412464461, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 13908 + }, + { + "epoch": 0.12130435540981319, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 13909 + }, + { + "epoch": 0.12131307669498177, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 13910 + }, + { + "epoch": 0.12132179798015036, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 13911 + }, + { + "epoch": 0.12133051926531893, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 13912 + }, + { + "epoch": 0.12133924055048752, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 13913 + }, + { + "epoch": 0.1213479618356561, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 13914 + }, + { + "epoch": 0.12135668312082469, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 13915 + }, + { + "epoch": 0.12136540440599326, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 13916 + }, + { + "epoch": 0.12137412569116185, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 13917 + }, + { + "epoch": 0.12138284697633044, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 13918 + }, + { + "epoch": 0.12139156826149901, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 13919 + }, + { + "epoch": 0.1214002895466676, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 13920 + }, + { + "epoch": 0.12140901083183618, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 13921 + }, + { + "epoch": 0.12141773211700477, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 13922 + }, + { + "epoch": 0.12142645340217334, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 13923 + }, + { + "epoch": 0.12143517468734193, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 13924 + }, + { + "epoch": 0.12144389597251051, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 13925 + }, + { + "epoch": 0.12145261725767909, + "grad_norm": 0.35546875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 13926 + }, + { + "epoch": 0.12146133854284767, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 13927 + }, + { + "epoch": 0.12147005982801626, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 13928 + }, + { + "epoch": 0.12147878111318484, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 13929 + }, + { + "epoch": 0.12148750239835342, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 13930 + }, + { + "epoch": 0.121496223683522, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 13931 + }, + { + "epoch": 0.12150494496869059, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 13932 + }, + { + "epoch": 0.12151366625385916, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 13933 + }, + { + "epoch": 0.12152238753902775, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 13934 + }, + { + "epoch": 0.12153110882419633, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 13935 + }, + { + "epoch": 0.12153983010936492, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 13936 + }, + { + "epoch": 0.1215485513945335, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 13937 + }, + { + "epoch": 0.12155727267970208, + "grad_norm": 0.357421875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 13938 + }, + { + "epoch": 0.12156599396487067, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 13939 + }, + { + "epoch": 0.12157471525003924, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 13940 + }, + { + "epoch": 0.12158343653520783, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 13941 + }, + { + "epoch": 0.12159215782037641, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 13942 + }, + { + "epoch": 0.121600879105545, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 13943 + }, + { + "epoch": 0.12160960039071357, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 13944 + }, + { + "epoch": 0.12161832167588216, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 13945 + }, + { + "epoch": 0.12162704296105074, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 13946 + }, + { + "epoch": 0.12163576424621932, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 13947 + }, + { + "epoch": 0.1216444855313879, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 13948 + }, + { + "epoch": 0.12165320681655649, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 13949 + }, + { + "epoch": 0.12166192810172508, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 13950 + }, + { + "epoch": 0.12167064938689365, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 13951 + }, + { + "epoch": 0.12167937067206223, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 13952 + }, + { + "epoch": 0.12168809195723082, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 13953 + }, + { + "epoch": 0.1216968132423994, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 13954 + }, + { + "epoch": 0.12170553452756798, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 13955 + }, + { + "epoch": 0.12171425581273657, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 13956 + }, + { + "epoch": 0.12172297709790515, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 13957 + }, + { + "epoch": 0.12173169838307372, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 13958 + }, + { + "epoch": 0.12174041966824231, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 13959 + }, + { + "epoch": 0.1217491409534109, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 13960 + }, + { + "epoch": 0.12175786223857948, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 13961 + }, + { + "epoch": 0.12176658352374806, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 13962 + }, + { + "epoch": 0.12177530480891664, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 13963 + }, + { + "epoch": 0.12178402609408523, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 13964 + }, + { + "epoch": 0.1217927473792538, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 13965 + }, + { + "epoch": 0.12180146866442239, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 13966 + }, + { + "epoch": 0.12181018994959097, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 13967 + }, + { + "epoch": 0.12181891123475956, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 13968 + }, + { + "epoch": 0.12182763251992813, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 13969 + }, + { + "epoch": 0.12183635380509672, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 13970 + }, + { + "epoch": 0.1218450750902653, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 13971 + }, + { + "epoch": 0.12185379637543388, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 13972 + }, + { + "epoch": 0.12186251766060247, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 13973 + }, + { + "epoch": 0.12187123894577105, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 13974 + }, + { + "epoch": 0.12187996023093964, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 13975 + }, + { + "epoch": 0.12188868151610821, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 13976 + }, + { + "epoch": 0.1218974028012768, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 13977 + }, + { + "epoch": 0.12190612408644538, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 13978 + }, + { + "epoch": 0.12191484537161396, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 13979 + }, + { + "epoch": 0.12192356665678254, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 13980 + }, + { + "epoch": 0.12193228794195113, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 13981 + }, + { + "epoch": 0.12194100922711971, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 13982 + }, + { + "epoch": 0.12194973051228829, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 13983 + }, + { + "epoch": 0.12195845179745687, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 13984 + }, + { + "epoch": 0.12196717308262546, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 13985 + }, + { + "epoch": 0.12197589436779403, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 13986 + }, + { + "epoch": 0.12198461565296262, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 13987 + }, + { + "epoch": 0.1219933369381312, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 13988 + }, + { + "epoch": 0.12200205822329979, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 13989 + }, + { + "epoch": 0.12201077950846836, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 13990 + }, + { + "epoch": 0.12201950079363695, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 13991 + }, + { + "epoch": 0.12202822207880554, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 13992 + }, + { + "epoch": 0.12203694336397411, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 13993 + }, + { + "epoch": 0.1220456646491427, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 13994 + }, + { + "epoch": 0.12205438593431128, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 13995 + }, + { + "epoch": 0.12206310721947987, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 13996 + }, + { + "epoch": 0.12207182850464844, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 13997 + }, + { + "epoch": 0.12208054978981703, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 13998 + }, + { + "epoch": 0.12208927107498561, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 13999 + }, + { + "epoch": 0.12209799236015419, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 14000 + }, + { + "epoch": 0.12210671364532277, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 14001 + }, + { + "epoch": 0.12211543493049136, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 14002 + }, + { + "epoch": 0.12212415621565995, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 14003 + }, + { + "epoch": 0.12213287750082852, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 14004 + }, + { + "epoch": 0.1221415987859971, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 14005 + }, + { + "epoch": 0.12215032007116569, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 14006 + }, + { + "epoch": 0.12215904135633426, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 14007 + }, + { + "epoch": 0.12216776264150285, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 14008 + }, + { + "epoch": 0.12217648392667144, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 14009 + }, + { + "epoch": 0.12218520521184002, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 14010 + }, + { + "epoch": 0.1221939264970086, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 14011 + }, + { + "epoch": 0.12220264778217718, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 14012 + }, + { + "epoch": 0.12221136906734577, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 14013 + }, + { + "epoch": 0.12222009035251434, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 14014 + }, + { + "epoch": 0.12222881163768293, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 14015 + }, + { + "epoch": 0.12223753292285151, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 14016 + }, + { + "epoch": 0.1222462542080201, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 14017 + }, + { + "epoch": 0.12225497549318867, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 14018 + }, + { + "epoch": 0.12226369677835726, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 14019 + }, + { + "epoch": 0.12227241806352585, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 14020 + }, + { + "epoch": 0.12228113934869442, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 14021 + }, + { + "epoch": 0.122289860633863, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 14022 + }, + { + "epoch": 0.12229858191903159, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 14023 + }, + { + "epoch": 0.12230730320420018, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 14024 + }, + { + "epoch": 0.12231602448936875, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 14025 + }, + { + "epoch": 0.12232474577453734, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 14026 + }, + { + "epoch": 0.12233346705970592, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 14027 + }, + { + "epoch": 0.1223421883448745, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 14028 + }, + { + "epoch": 0.12235090963004308, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 14029 + }, + { + "epoch": 0.12235963091521167, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 14030 + }, + { + "epoch": 0.12236835220038025, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 14031 + }, + { + "epoch": 0.12237707348554883, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 14032 + }, + { + "epoch": 0.12238579477071741, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 14033 + }, + { + "epoch": 0.122394516055886, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 14034 + }, + { + "epoch": 0.12240323734105457, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 14035 + }, + { + "epoch": 0.12241195862622316, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 14036 + }, + { + "epoch": 0.12242067991139174, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 14037 + }, + { + "epoch": 0.12242940119656033, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 14038 + }, + { + "epoch": 0.1224381224817289, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 14039 + }, + { + "epoch": 0.12244684376689749, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 14040 + }, + { + "epoch": 0.12245556505206608, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 14041 + }, + { + "epoch": 0.12246428633723465, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 14042 + }, + { + "epoch": 0.12247300762240323, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 14043 + }, + { + "epoch": 0.12248172890757182, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 14044 + }, + { + "epoch": 0.12249045019274041, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 14045 + }, + { + "epoch": 0.12249917147790898, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 14046 + }, + { + "epoch": 0.12250789276307757, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 14047 + }, + { + "epoch": 0.12251661404824615, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 14048 + }, + { + "epoch": 0.12252533533341473, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 14049 + }, + { + "epoch": 0.12253405661858331, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 14050 + }, + { + "epoch": 0.1225427779037519, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 14051 + }, + { + "epoch": 0.12255149918892048, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 14052 + }, + { + "epoch": 0.12256022047408906, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 14053 + }, + { + "epoch": 0.12256894175925764, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 14054 + }, + { + "epoch": 0.12257766304442623, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 14055 + }, + { + "epoch": 0.1225863843295948, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 14056 + }, + { + "epoch": 0.12259510561476339, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 14057 + }, + { + "epoch": 0.12260382689993198, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 14058 + }, + { + "epoch": 0.12261254818510056, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 14059 + }, + { + "epoch": 0.12262126947026913, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 14060 + }, + { + "epoch": 0.12262999075543772, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 14061 + }, + { + "epoch": 0.1226387120406063, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 14062 + }, + { + "epoch": 0.12264743332577488, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 14063 + }, + { + "epoch": 0.12265615461094347, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 14064 + }, + { + "epoch": 0.12266487589611205, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 14065 + }, + { + "epoch": 0.12267359718128064, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 14066 + }, + { + "epoch": 0.12268231846644921, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 14067 + }, + { + "epoch": 0.1226910397516178, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 14068 + }, + { + "epoch": 0.12269976103678638, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 14069 + }, + { + "epoch": 0.12270848232195496, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 14070 + }, + { + "epoch": 0.12271720360712354, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 14071 + }, + { + "epoch": 0.12272592489229213, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 14072 + }, + { + "epoch": 0.12273464617746072, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 14073 + }, + { + "epoch": 0.12274336746262929, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 14074 + }, + { + "epoch": 0.12275208874779787, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 14075 + }, + { + "epoch": 0.12276081003296646, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 14076 + }, + { + "epoch": 0.12276953131813505, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 14077 + }, + { + "epoch": 0.12277825260330362, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 14078 + }, + { + "epoch": 0.1227869738884722, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 14079 + }, + { + "epoch": 0.12279569517364079, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 14080 + }, + { + "epoch": 0.12280441645880937, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 14081 + }, + { + "epoch": 0.12281313774397795, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 14082 + }, + { + "epoch": 0.12282185902914654, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 14083 + }, + { + "epoch": 0.12283058031431512, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 14084 + }, + { + "epoch": 0.1228393015994837, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 14085 + }, + { + "epoch": 0.12284802288465228, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 14086 + }, + { + "epoch": 0.12285674416982087, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 14087 + }, + { + "epoch": 0.12286546545498944, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 14088 + }, + { + "epoch": 0.12287418674015803, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 14089 + }, + { + "epoch": 0.12288290802532661, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 14090 + }, + { + "epoch": 0.1228916293104952, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 14091 + }, + { + "epoch": 0.12290035059566377, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 14092 + }, + { + "epoch": 0.12290907188083236, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 14093 + }, + { + "epoch": 0.12291779316600095, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 14094 + }, + { + "epoch": 0.12292651445116952, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 14095 + }, + { + "epoch": 0.1229352357363381, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 14096 + }, + { + "epoch": 0.12294395702150669, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 14097 + }, + { + "epoch": 0.12295267830667528, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 14098 + }, + { + "epoch": 0.12296139959184385, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 14099 + }, + { + "epoch": 0.12297012087701244, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 14100 + }, + { + "epoch": 0.12297884216218102, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 14101 + }, + { + "epoch": 0.1229875634473496, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 14102 + }, + { + "epoch": 0.12299628473251818, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 14103 + }, + { + "epoch": 0.12300500601768677, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 14104 + }, + { + "epoch": 0.12301372730285536, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 14105 + }, + { + "epoch": 0.12302244858802393, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 14106 + }, + { + "epoch": 0.12303116987319251, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 14107 + }, + { + "epoch": 0.1230398911583611, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 14108 + }, + { + "epoch": 0.12304861244352967, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 14109 + }, + { + "epoch": 0.12305733372869826, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 14110 + }, + { + "epoch": 0.12306605501386685, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 14111 + }, + { + "epoch": 0.12307477629903543, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 14112 + }, + { + "epoch": 0.123083497584204, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 14113 + }, + { + "epoch": 0.12309221886937259, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 14114 + }, + { + "epoch": 0.12310094015454118, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 14115 + }, + { + "epoch": 0.12310966143970975, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 14116 + }, + { + "epoch": 0.12311838272487834, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 14117 + }, + { + "epoch": 0.12312710401004692, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 14118 + }, + { + "epoch": 0.12313582529521551, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 14119 + }, + { + "epoch": 0.12314454658038408, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 14120 + }, + { + "epoch": 0.12315326786555267, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 14121 + }, + { + "epoch": 0.12316198915072125, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 14122 + }, + { + "epoch": 0.12317071043588983, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 14123 + }, + { + "epoch": 0.12317943172105841, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 14124 + }, + { + "epoch": 0.123188153006227, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 14125 + }, + { + "epoch": 0.12319687429139559, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 14126 + }, + { + "epoch": 0.12320559557656416, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 14127 + }, + { + "epoch": 0.12321431686173275, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 14128 + }, + { + "epoch": 0.12322303814690133, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 14129 + }, + { + "epoch": 0.1232317594320699, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 14130 + }, + { + "epoch": 0.12324048071723849, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 14131 + }, + { + "epoch": 0.12324920200240708, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 14132 + }, + { + "epoch": 0.12325792328757566, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 14133 + }, + { + "epoch": 0.12326664457274424, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 14134 + }, + { + "epoch": 0.12327536585791282, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 14135 + }, + { + "epoch": 0.12328408714308141, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 14136 + }, + { + "epoch": 0.12329280842824998, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 14137 + }, + { + "epoch": 0.12330152971341857, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 14138 + }, + { + "epoch": 0.12331025099858715, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 14139 + }, + { + "epoch": 0.12331897228375574, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 14140 + }, + { + "epoch": 0.12332769356892431, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 14141 + }, + { + "epoch": 0.1233364148540929, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 14142 + }, + { + "epoch": 0.12334513613926149, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 14143 + }, + { + "epoch": 0.12335385742443006, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 14144 + }, + { + "epoch": 0.12336257870959864, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 14145 + }, + { + "epoch": 0.12337129999476723, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 14146 + }, + { + "epoch": 0.12338002127993582, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 14147 + }, + { + "epoch": 0.12338874256510439, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 14148 + }, + { + "epoch": 0.12339746385027298, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 14149 + }, + { + "epoch": 0.12340618513544156, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 14150 + }, + { + "epoch": 0.12341490642061013, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 14151 + }, + { + "epoch": 0.12342362770577872, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 14152 + }, + { + "epoch": 0.12343234899094731, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 14153 + }, + { + "epoch": 0.1234410702761159, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 14154 + }, + { + "epoch": 0.12344979156128447, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 14155 + }, + { + "epoch": 0.12345851284645305, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 14156 + }, + { + "epoch": 0.12346723413162164, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 14157 + }, + { + "epoch": 0.12347595541679021, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 14158 + }, + { + "epoch": 0.1234846767019588, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 14159 + }, + { + "epoch": 0.12349339798712738, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 14160 + }, + { + "epoch": 0.12350211927229597, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 14161 + }, + { + "epoch": 0.12351084055746454, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 14162 + }, + { + "epoch": 0.12351956184263313, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 14163 + }, + { + "epoch": 0.12352828312780172, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 14164 + }, + { + "epoch": 0.12353700441297029, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 14165 + }, + { + "epoch": 0.12354572569813888, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 14166 + }, + { + "epoch": 0.12355444698330746, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 14167 + }, + { + "epoch": 0.12356316826847605, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 14168 + }, + { + "epoch": 0.12357188955364462, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 14169 + }, + { + "epoch": 0.1235806108388132, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 14170 + }, + { + "epoch": 0.1235893321239818, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 14171 + }, + { + "epoch": 0.12359805340915037, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 14172 + }, + { + "epoch": 0.12360677469431895, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 14173 + }, + { + "epoch": 0.12361549597948754, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 14174 + }, + { + "epoch": 0.12362421726465612, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 14175 + }, + { + "epoch": 0.1236329385498247, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 14176 + }, + { + "epoch": 0.12364165983499328, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 14177 + }, + { + "epoch": 0.12365038112016187, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 14178 + }, + { + "epoch": 0.12365910240533044, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 14179 + }, + { + "epoch": 0.12366782369049903, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 14180 + }, + { + "epoch": 0.12367654497566762, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 14181 + }, + { + "epoch": 0.1236852662608362, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 14182 + }, + { + "epoch": 0.12369398754600477, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 14183 + }, + { + "epoch": 0.12370270883117336, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 14184 + }, + { + "epoch": 0.12371143011634195, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 14185 + }, + { + "epoch": 0.12372015140151052, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 14186 + }, + { + "epoch": 0.1237288726866791, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 14187 + }, + { + "epoch": 0.12373759397184769, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 14188 + }, + { + "epoch": 0.12374631525701628, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 14189 + }, + { + "epoch": 0.12375503654218485, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 14190 + }, + { + "epoch": 0.12376375782735344, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 14191 + }, + { + "epoch": 0.12377247911252202, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 14192 + }, + { + "epoch": 0.12378120039769061, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 14193 + }, + { + "epoch": 0.12378992168285918, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 14194 + }, + { + "epoch": 0.12379864296802777, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 14195 + }, + { + "epoch": 0.12380736425319636, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 14196 + }, + { + "epoch": 0.12381608553836493, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 14197 + }, + { + "epoch": 0.12382480682353351, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 14198 + }, + { + "epoch": 0.1238335281087021, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 14199 + }, + { + "epoch": 0.12384224939387069, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 14200 + }, + { + "epoch": 0.12385097067903926, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 14201 + }, + { + "epoch": 0.12385969196420785, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 14202 + }, + { + "epoch": 0.12386841324937643, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 14203 + }, + { + "epoch": 0.123877134534545, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 14204 + }, + { + "epoch": 0.12388585581971359, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 14205 + }, + { + "epoch": 0.12389457710488218, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 14206 + }, + { + "epoch": 0.12390329839005076, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 14207 + }, + { + "epoch": 0.12391201967521934, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 14208 + }, + { + "epoch": 0.12392074096038792, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 14209 + }, + { + "epoch": 0.12392946224555651, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 14210 + }, + { + "epoch": 0.12393818353072508, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 14211 + }, + { + "epoch": 0.12394690481589367, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 14212 + }, + { + "epoch": 0.12395562610106226, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 14213 + }, + { + "epoch": 0.12396434738623084, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 14214 + }, + { + "epoch": 0.12397306867139941, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 14215 + }, + { + "epoch": 0.123981789956568, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 14216 + }, + { + "epoch": 0.12399051124173659, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 14217 + }, + { + "epoch": 0.12399923252690516, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 14218 + }, + { + "epoch": 0.12400795381207375, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 14219 + }, + { + "epoch": 0.12401667509724233, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 14220 + }, + { + "epoch": 0.12402539638241092, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 14221 + }, + { + "epoch": 0.12403411766757949, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 14222 + }, + { + "epoch": 0.12404283895274808, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 14223 + }, + { + "epoch": 0.12405156023791666, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 14224 + }, + { + "epoch": 0.12406028152308524, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 14225 + }, + { + "epoch": 0.12406900280825382, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 14226 + }, + { + "epoch": 0.12407772409342241, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 14227 + }, + { + "epoch": 0.124086445378591, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 14228 + }, + { + "epoch": 0.12409516666375957, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 14229 + }, + { + "epoch": 0.12410388794892815, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 14230 + }, + { + "epoch": 0.12411260923409674, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 14231 + }, + { + "epoch": 0.12412133051926531, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 14232 + }, + { + "epoch": 0.1241300518044339, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 14233 + }, + { + "epoch": 0.12413877308960249, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 14234 + }, + { + "epoch": 0.12414749437477107, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 14235 + }, + { + "epoch": 0.12415621565993964, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 14236 + }, + { + "epoch": 0.12416493694510823, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 14237 + }, + { + "epoch": 0.12417365823027682, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 14238 + }, + { + "epoch": 0.12418237951544539, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 14239 + }, + { + "epoch": 0.12419110080061398, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 14240 + }, + { + "epoch": 0.12419982208578256, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 14241 + }, + { + "epoch": 0.12420854337095115, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 14242 + }, + { + "epoch": 0.12421726465611972, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 14243 + }, + { + "epoch": 0.12422598594128831, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 14244 + }, + { + "epoch": 0.1242347072264569, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 14245 + }, + { + "epoch": 0.12424342851162547, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 14246 + }, + { + "epoch": 0.12425214979679405, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 14247 + }, + { + "epoch": 0.12426087108196264, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 14248 + }, + { + "epoch": 0.12426959236713123, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 14249 + }, + { + "epoch": 0.1242783136522998, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 14250 + }, + { + "epoch": 0.12428703493746839, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 14251 + }, + { + "epoch": 0.12429575622263697, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 14252 + }, + { + "epoch": 0.12430447750780554, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 14253 + }, + { + "epoch": 0.12431319879297413, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 14254 + }, + { + "epoch": 0.12432192007814272, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 14255 + }, + { + "epoch": 0.1243306413633113, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 14256 + }, + { + "epoch": 0.12433936264847988, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 14257 + }, + { + "epoch": 0.12434808393364846, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 14258 + }, + { + "epoch": 0.12435680521881705, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 14259 + }, + { + "epoch": 0.12436552650398562, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 14260 + }, + { + "epoch": 0.12437424778915421, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 14261 + }, + { + "epoch": 0.1243829690743228, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 14262 + }, + { + "epoch": 0.12439169035949138, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 14263 + }, + { + "epoch": 0.12440041164465995, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 14264 + }, + { + "epoch": 0.12440913292982854, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 14265 + }, + { + "epoch": 0.12441785421499713, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 14266 + }, + { + "epoch": 0.1244265755001657, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 14267 + }, + { + "epoch": 0.12443529678533428, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 14268 + }, + { + "epoch": 0.12444401807050287, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 14269 + }, + { + "epoch": 0.12445273935567146, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 14270 + }, + { + "epoch": 0.12446146064084003, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 14271 + }, + { + "epoch": 0.12447018192600862, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 14272 + }, + { + "epoch": 0.1244789032111772, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 14273 + }, + { + "epoch": 0.12448762449634578, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 14274 + }, + { + "epoch": 0.12449634578151436, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 14275 + }, + { + "epoch": 0.12450506706668295, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 14276 + }, + { + "epoch": 0.12451378835185153, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 14277 + }, + { + "epoch": 0.1245225096370201, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 14278 + }, + { + "epoch": 0.1245312309221887, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 14279 + }, + { + "epoch": 0.12453995220735728, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 14280 + }, + { + "epoch": 0.12454867349252585, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 14281 + }, + { + "epoch": 0.12455739477769444, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 14282 + }, + { + "epoch": 0.12456611606286302, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 14283 + }, + { + "epoch": 0.12457483734803161, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 14284 + }, + { + "epoch": 0.12458355863320018, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 14285 + }, + { + "epoch": 0.12459227991836877, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 14286 + }, + { + "epoch": 0.12460100120353736, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 14287 + }, + { + "epoch": 0.12460972248870593, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 14288 + }, + { + "epoch": 0.12461844377387452, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 14289 + }, + { + "epoch": 0.1246271650590431, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 14290 + }, + { + "epoch": 0.12463588634421169, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 14291 + }, + { + "epoch": 0.12464460762938026, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 14292 + }, + { + "epoch": 0.12465332891454885, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 14293 + }, + { + "epoch": 0.12466205019971743, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 14294 + }, + { + "epoch": 0.124670771484886, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 14295 + }, + { + "epoch": 0.12467949277005459, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 14296 + }, + { + "epoch": 0.12468821405522318, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 14297 + }, + { + "epoch": 0.12469693534039177, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 14298 + }, + { + "epoch": 0.12470565662556034, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 14299 + }, + { + "epoch": 0.12471437791072892, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 14300 + }, + { + "epoch": 0.12472309919589751, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 14301 + }, + { + "epoch": 0.12473182048106608, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 14302 + }, + { + "epoch": 0.12474054176623467, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 14303 + }, + { + "epoch": 0.12474926305140326, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 14304 + }, + { + "epoch": 0.12475798433657184, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 14305 + }, + { + "epoch": 0.12476670562174041, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 14306 + }, + { + "epoch": 0.124775426906909, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 14307 + }, + { + "epoch": 0.12478414819207759, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 14308 + }, + { + "epoch": 0.12479286947724617, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 14309 + }, + { + "epoch": 0.12480159076241475, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 14310 + }, + { + "epoch": 0.12481031204758333, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 14311 + }, + { + "epoch": 0.12481903333275192, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 14312 + }, + { + "epoch": 0.12482775461792049, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 14313 + }, + { + "epoch": 0.12483647590308908, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 14314 + }, + { + "epoch": 0.12484519718825766, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 14315 + }, + { + "epoch": 0.12485391847342625, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 14316 + }, + { + "epoch": 0.12486263975859482, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 14317 + }, + { + "epoch": 0.12487136104376341, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 14318 + }, + { + "epoch": 0.124880082328932, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 14319 + }, + { + "epoch": 0.12488880361410057, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 14320 + }, + { + "epoch": 0.12489752489926916, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 14321 + }, + { + "epoch": 0.12490624618443774, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 14322 + }, + { + "epoch": 0.12491496746960633, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 14323 + }, + { + "epoch": 0.1249236887547749, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 14324 + }, + { + "epoch": 0.12493241003994349, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 14325 + }, + { + "epoch": 0.12494113132511207, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 14326 + }, + { + "epoch": 0.12494985261028065, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 14327 + }, + { + "epoch": 0.12495857389544923, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 14328 + }, + { + "epoch": 0.12496729518061782, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 14329 + }, + { + "epoch": 0.1249760164657864, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 14330 + }, + { + "epoch": 0.12498473775095498, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 14331 + }, + { + "epoch": 0.12499345903612356, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 14332 + }, + { + "epoch": 0.12500218032129215, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 14333 + }, + { + "epoch": 0.12501090160646072, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 14334 + }, + { + "epoch": 0.12501962289162932, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 14335 + }, + { + "epoch": 0.1250283441767979, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 14336 + }, + { + "epoch": 0.12503706546196647, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 14337 + }, + { + "epoch": 0.12504578674713507, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 14338 + }, + { + "epoch": 0.12505450803230364, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 14339 + }, + { + "epoch": 0.1250632293174722, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 14340 + }, + { + "epoch": 0.1250719506026408, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 14341 + }, + { + "epoch": 0.12508067188780939, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 14342 + }, + { + "epoch": 0.12508939317297796, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 14343 + }, + { + "epoch": 0.12509811445814656, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 14344 + }, + { + "epoch": 0.12510683574331513, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 14345 + }, + { + "epoch": 0.1251155570284837, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 14346 + }, + { + "epoch": 0.1251242783136523, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 14347 + }, + { + "epoch": 0.12513299959882088, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 14348 + }, + { + "epoch": 0.12514172088398948, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 14349 + }, + { + "epoch": 0.12515044216915805, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 14350 + }, + { + "epoch": 0.12515916345432662, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 14351 + }, + { + "epoch": 0.12516788473949522, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 14352 + }, + { + "epoch": 0.1251766060246638, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 14353 + }, + { + "epoch": 0.12518532730983237, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 14354 + }, + { + "epoch": 0.12519404859500097, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 14355 + }, + { + "epoch": 0.12520276988016954, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 14356 + }, + { + "epoch": 0.1252114911653381, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 14357 + }, + { + "epoch": 0.1252202124505067, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 14358 + }, + { + "epoch": 0.12522893373567529, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 14359 + }, + { + "epoch": 0.12523765502084386, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 14360 + }, + { + "epoch": 0.12524637630601246, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 14361 + }, + { + "epoch": 0.12525509759118103, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 14362 + }, + { + "epoch": 0.12526381887634963, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 14363 + }, + { + "epoch": 0.1252725401615182, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 14364 + }, + { + "epoch": 0.12528126144668678, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 14365 + }, + { + "epoch": 0.12528998273185538, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 14366 + }, + { + "epoch": 0.12529870401702395, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 14367 + }, + { + "epoch": 0.12530742530219252, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 14368 + }, + { + "epoch": 0.12531614658736112, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 14369 + }, + { + "epoch": 0.1253248678725297, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 14370 + }, + { + "epoch": 0.12533358915769827, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 14371 + }, + { + "epoch": 0.12534231044286687, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 14372 + }, + { + "epoch": 0.12535103172803544, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 14373 + }, + { + "epoch": 0.12535975301320404, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 14374 + }, + { + "epoch": 0.1253684742983726, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 14375 + }, + { + "epoch": 0.12537719558354118, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 14376 + }, + { + "epoch": 0.12538591686870978, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 14377 + }, + { + "epoch": 0.12539463815387836, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 14378 + }, + { + "epoch": 0.12540335943904693, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 14379 + }, + { + "epoch": 0.12541208072421553, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 14380 + }, + { + "epoch": 0.1254208020093841, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 14381 + }, + { + "epoch": 0.12542952329455268, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 14382 + }, + { + "epoch": 0.12543824457972128, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 14383 + }, + { + "epoch": 0.12544696586488985, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 14384 + }, + { + "epoch": 0.12545568715005842, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 14385 + }, + { + "epoch": 0.12546440843522702, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 14386 + }, + { + "epoch": 0.1254731297203956, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 14387 + }, + { + "epoch": 0.1254818510055642, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 14388 + }, + { + "epoch": 0.12549057229073277, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 14389 + }, + { + "epoch": 0.12549929357590134, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 14390 + }, + { + "epoch": 0.12550801486106994, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 14391 + }, + { + "epoch": 0.1255167361462385, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 14392 + }, + { + "epoch": 0.12552545743140708, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 14393 + }, + { + "epoch": 0.12553417871657568, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 14394 + }, + { + "epoch": 0.12554290000174426, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 14395 + }, + { + "epoch": 0.12555162128691283, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 14396 + }, + { + "epoch": 0.12556034257208143, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 14397 + }, + { + "epoch": 0.12556906385725, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 14398 + }, + { + "epoch": 0.12557778514241857, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 14399 + }, + { + "epoch": 0.12558650642758717, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 14400 + }, + { + "epoch": 0.12559522771275575, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 14401 + }, + { + "epoch": 0.12560394899792435, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 14402 + }, + { + "epoch": 0.12561267028309292, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 14403 + }, + { + "epoch": 0.1256213915682615, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 14404 + }, + { + "epoch": 0.1256301128534301, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 14405 + }, + { + "epoch": 0.12563883413859867, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 14406 + }, + { + "epoch": 0.12564755542376724, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 14407 + }, + { + "epoch": 0.12565627670893584, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 14408 + }, + { + "epoch": 0.1256649979941044, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 14409 + }, + { + "epoch": 0.12567371927927298, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 14410 + }, + { + "epoch": 0.12568244056444158, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 14411 + }, + { + "epoch": 0.12569116184961016, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 14412 + }, + { + "epoch": 0.12569988313477873, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 14413 + }, + { + "epoch": 0.12570860441994733, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 14414 + }, + { + "epoch": 0.1257173257051159, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 14415 + }, + { + "epoch": 0.1257260469902845, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 14416 + }, + { + "epoch": 0.12573476827545307, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 14417 + }, + { + "epoch": 0.12574348956062165, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 14418 + }, + { + "epoch": 0.12575221084579025, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 14419 + }, + { + "epoch": 0.12576093213095882, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 14420 + }, + { + "epoch": 0.1257696534161274, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 14421 + }, + { + "epoch": 0.125778374701296, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 14422 + }, + { + "epoch": 0.12578709598646456, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 14423 + }, + { + "epoch": 0.12579581727163314, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 14424 + }, + { + "epoch": 0.12580453855680174, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 14425 + }, + { + "epoch": 0.1258132598419703, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 14426 + }, + { + "epoch": 0.12582198112713888, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 14427 + }, + { + "epoch": 0.12583070241230748, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 14428 + }, + { + "epoch": 0.12583942369747605, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 14429 + }, + { + "epoch": 0.12584814498264466, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 14430 + }, + { + "epoch": 0.12585686626781323, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 14431 + }, + { + "epoch": 0.1258655875529818, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 14432 + }, + { + "epoch": 0.1258743088381504, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 14433 + }, + { + "epoch": 0.12588303012331897, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 14434 + }, + { + "epoch": 0.12589175140848755, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 14435 + }, + { + "epoch": 0.12590047269365615, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 14436 + }, + { + "epoch": 0.12590919397882472, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 14437 + }, + { + "epoch": 0.1259179152639933, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 14438 + }, + { + "epoch": 0.1259266365491619, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 14439 + }, + { + "epoch": 0.12593535783433046, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 14440 + }, + { + "epoch": 0.12594407911949904, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 14441 + }, + { + "epoch": 0.12595280040466764, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 14442 + }, + { + "epoch": 0.1259615216898362, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 14443 + }, + { + "epoch": 0.1259702429750048, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 14444 + }, + { + "epoch": 0.12597896426017338, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 14445 + }, + { + "epoch": 0.12598768554534195, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 14446 + }, + { + "epoch": 0.12599640683051055, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 14447 + }, + { + "epoch": 0.12600512811567913, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 14448 + }, + { + "epoch": 0.1260138494008477, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 14449 + }, + { + "epoch": 0.1260225706860163, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 14450 + }, + { + "epoch": 0.12603129197118487, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 14451 + }, + { + "epoch": 0.12604001325635344, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 14452 + }, + { + "epoch": 0.12604873454152205, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 14453 + }, + { + "epoch": 0.12605745582669062, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 14454 + }, + { + "epoch": 0.1260661771118592, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 14455 + }, + { + "epoch": 0.1260748983970278, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 14456 + }, + { + "epoch": 0.12608361968219636, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 14457 + }, + { + "epoch": 0.12609234096736496, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 14458 + }, + { + "epoch": 0.12610106225253354, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 14459 + }, + { + "epoch": 0.1261097835377021, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 14460 + }, + { + "epoch": 0.1261185048228707, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 14461 + }, + { + "epoch": 0.12612722610803928, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 14462 + }, + { + "epoch": 0.12613594739320785, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 14463 + }, + { + "epoch": 0.12614466867837645, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 14464 + }, + { + "epoch": 0.12615338996354503, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 14465 + }, + { + "epoch": 0.1261621112487136, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 14466 + }, + { + "epoch": 0.1261708325338822, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 14467 + }, + { + "epoch": 0.12617955381905077, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 14468 + }, + { + "epoch": 0.12618827510421934, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 14469 + }, + { + "epoch": 0.12619699638938794, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 14470 + }, + { + "epoch": 0.12620571767455652, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 14471 + }, + { + "epoch": 0.12621443895972512, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 14472 + }, + { + "epoch": 0.1262231602448937, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 14473 + }, + { + "epoch": 0.12623188153006226, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 14474 + }, + { + "epoch": 0.12624060281523086, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 14475 + }, + { + "epoch": 0.12624932410039943, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 14476 + }, + { + "epoch": 0.126258045385568, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 14477 + }, + { + "epoch": 0.1262667666707366, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 14478 + }, + { + "epoch": 0.12627548795590518, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 14479 + }, + { + "epoch": 0.12628420924107375, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 14480 + }, + { + "epoch": 0.12629293052624235, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 14481 + }, + { + "epoch": 0.12630165181141093, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 14482 + }, + { + "epoch": 0.12631037309657953, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 14483 + }, + { + "epoch": 0.1263190943817481, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 14484 + }, + { + "epoch": 0.12632781566691667, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 14485 + }, + { + "epoch": 0.12633653695208527, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 14486 + }, + { + "epoch": 0.12634525823725384, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 14487 + }, + { + "epoch": 0.12635397952242242, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 14488 + }, + { + "epoch": 0.12636270080759102, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 14489 + }, + { + "epoch": 0.1263714220927596, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 14490 + }, + { + "epoch": 0.12638014337792816, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 14491 + }, + { + "epoch": 0.12638886466309676, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 14492 + }, + { + "epoch": 0.12639758594826533, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 14493 + }, + { + "epoch": 0.1264063072334339, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 14494 + }, + { + "epoch": 0.1264150285186025, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 14495 + }, + { + "epoch": 0.12642374980377108, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 14496 + }, + { + "epoch": 0.12643247108893968, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 14497 + }, + { + "epoch": 0.12644119237410825, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 14498 + }, + { + "epoch": 0.12644991365927682, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 14499 + }, + { + "epoch": 0.12645863494444542, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 14500 + }, + { + "epoch": 0.126467356229614, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 14501 + }, + { + "epoch": 0.12647607751478257, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 14502 + }, + { + "epoch": 0.12648479879995117, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 14503 + }, + { + "epoch": 0.12649352008511974, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 14504 + }, + { + "epoch": 0.12650224137028832, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 14505 + }, + { + "epoch": 0.12651096265545692, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 14506 + }, + { + "epoch": 0.1265196839406255, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 14507 + }, + { + "epoch": 0.12652840522579406, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 14508 + }, + { + "epoch": 0.12653712651096266, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 14509 + }, + { + "epoch": 0.12654584779613123, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 14510 + }, + { + "epoch": 0.12655456908129983, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 14511 + }, + { + "epoch": 0.1265632903664684, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 14512 + }, + { + "epoch": 0.12657201165163698, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 14513 + }, + { + "epoch": 0.12658073293680558, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 14514 + }, + { + "epoch": 0.12658945422197415, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 14515 + }, + { + "epoch": 0.12659817550714272, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 14516 + }, + { + "epoch": 0.12660689679231132, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 14517 + }, + { + "epoch": 0.1266156180774799, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 14518 + }, + { + "epoch": 0.12662433936264847, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 14519 + }, + { + "epoch": 0.12663306064781707, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 14520 + }, + { + "epoch": 0.12664178193298564, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 14521 + }, + { + "epoch": 0.12665050321815421, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 14522 + }, + { + "epoch": 0.12665922450332281, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 14523 + }, + { + "epoch": 0.1266679457884914, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 14524 + }, + { + "epoch": 0.12667666707366, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 14525 + }, + { + "epoch": 0.12668538835882856, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 14526 + }, + { + "epoch": 0.12669410964399713, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 14527 + }, + { + "epoch": 0.12670283092916573, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 14528 + }, + { + "epoch": 0.1267115522143343, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 14529 + }, + { + "epoch": 0.12672027349950288, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 14530 + }, + { + "epoch": 0.12672899478467148, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 14531 + }, + { + "epoch": 0.12673771606984005, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 14532 + }, + { + "epoch": 0.12674643735500862, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 14533 + }, + { + "epoch": 0.12675515864017722, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 14534 + }, + { + "epoch": 0.1267638799253458, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 14535 + }, + { + "epoch": 0.12677260121051437, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 14536 + }, + { + "epoch": 0.12678132249568297, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 14537 + }, + { + "epoch": 0.12679004378085154, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 14538 + }, + { + "epoch": 0.12679876506602014, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 14539 + }, + { + "epoch": 0.12680748635118871, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 14540 + }, + { + "epoch": 0.1268162076363573, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 14541 + }, + { + "epoch": 0.1268249289215259, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 14542 + }, + { + "epoch": 0.12683365020669446, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 14543 + }, + { + "epoch": 0.12684237149186303, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 14544 + }, + { + "epoch": 0.12685109277703163, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 14545 + }, + { + "epoch": 0.1268598140622002, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 14546 + }, + { + "epoch": 0.12686853534736878, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 14547 + }, + { + "epoch": 0.12687725663253738, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 14548 + }, + { + "epoch": 0.12688597791770595, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 14549 + }, + { + "epoch": 0.12689469920287452, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 14550 + }, + { + "epoch": 0.12690342048804312, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 14551 + }, + { + "epoch": 0.1269121417732117, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 14552 + }, + { + "epoch": 0.1269208630583803, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 14553 + }, + { + "epoch": 0.12692958434354887, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 14554 + }, + { + "epoch": 0.12693830562871744, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 14555 + }, + { + "epoch": 0.12694702691388604, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 14556 + }, + { + "epoch": 0.1269557481990546, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 14557 + }, + { + "epoch": 0.12696446948422319, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 14558 + }, + { + "epoch": 0.12697319076939179, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 14559 + }, + { + "epoch": 0.12698191205456036, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 14560 + }, + { + "epoch": 0.12699063333972893, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 14561 + }, + { + "epoch": 0.12699935462489753, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 14562 + }, + { + "epoch": 0.1270080759100661, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 14563 + }, + { + "epoch": 0.12701679719523468, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 14564 + }, + { + "epoch": 0.12702551848040328, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 14565 + }, + { + "epoch": 0.12703423976557185, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 14566 + }, + { + "epoch": 0.12704296105074045, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 14567 + }, + { + "epoch": 0.12705168233590902, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 14568 + }, + { + "epoch": 0.1270604036210776, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 14569 + }, + { + "epoch": 0.1270691249062462, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 14570 + }, + { + "epoch": 0.12707784619141477, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 14571 + }, + { + "epoch": 0.12708656747658334, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 14572 + }, + { + "epoch": 0.12709528876175194, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 14573 + }, + { + "epoch": 0.1271040100469205, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 14574 + }, + { + "epoch": 0.12711273133208909, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 14575 + }, + { + "epoch": 0.12712145261725769, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 14576 + }, + { + "epoch": 0.12713017390242626, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 14577 + }, + { + "epoch": 0.12713889518759483, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 14578 + }, + { + "epoch": 0.12714761647276343, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 14579 + }, + { + "epoch": 0.127156337757932, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 14580 + }, + { + "epoch": 0.1271650590431006, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 14581 + }, + { + "epoch": 0.12717378032826918, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 14582 + }, + { + "epoch": 0.12718250161343775, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 14583 + }, + { + "epoch": 0.12719122289860635, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 14584 + }, + { + "epoch": 0.12719994418377492, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 14585 + }, + { + "epoch": 0.1272086654689435, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 14586 + }, + { + "epoch": 0.1272173867541121, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 14587 + }, + { + "epoch": 0.12722610803928067, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 14588 + }, + { + "epoch": 0.12723482932444924, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 14589 + }, + { + "epoch": 0.12724355060961784, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 14590 + }, + { + "epoch": 0.1272522718947864, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 14591 + }, + { + "epoch": 0.127260993179955, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 14592 + }, + { + "epoch": 0.12726971446512358, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 14593 + }, + { + "epoch": 0.12727843575029216, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 14594 + }, + { + "epoch": 0.12728715703546076, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 14595 + }, + { + "epoch": 0.12729587832062933, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 14596 + }, + { + "epoch": 0.1273045996057979, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 14597 + }, + { + "epoch": 0.1273133208909665, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 14598 + }, + { + "epoch": 0.12732204217613508, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 14599 + }, + { + "epoch": 0.12733076346130365, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 14600 + }, + { + "epoch": 0.12733948474647225, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 14601 + }, + { + "epoch": 0.12734820603164082, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 14602 + }, + { + "epoch": 0.1273569273168094, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 14603 + }, + { + "epoch": 0.127365648601978, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 14604 + }, + { + "epoch": 0.12737436988714657, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 14605 + }, + { + "epoch": 0.12738309117231517, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 14606 + }, + { + "epoch": 0.12739181245748374, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 14607 + }, + { + "epoch": 0.1274005337426523, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 14608 + }, + { + "epoch": 0.1274092550278209, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 14609 + }, + { + "epoch": 0.12741797631298948, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 14610 + }, + { + "epoch": 0.12742669759815806, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 14611 + }, + { + "epoch": 0.12743541888332666, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 14612 + }, + { + "epoch": 0.12744414016849523, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 14613 + }, + { + "epoch": 0.1274528614536638, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 14614 + }, + { + "epoch": 0.1274615827388324, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 14615 + }, + { + "epoch": 0.12747030402400097, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 14616 + }, + { + "epoch": 0.12747902530916955, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 14617 + }, + { + "epoch": 0.12748774659433815, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 14618 + }, + { + "epoch": 0.12749646787950672, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 14619 + }, + { + "epoch": 0.12750518916467532, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 14620 + }, + { + "epoch": 0.1275139104498439, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 14621 + }, + { + "epoch": 0.12752263173501246, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 14622 + }, + { + "epoch": 0.12753135302018107, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 14623 + }, + { + "epoch": 0.12754007430534964, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 14624 + }, + { + "epoch": 0.1275487955905182, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 14625 + }, + { + "epoch": 0.1275575168756868, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 14626 + }, + { + "epoch": 0.12756623816085538, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 14627 + }, + { + "epoch": 0.12757495944602396, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 14628 + }, + { + "epoch": 0.12758368073119256, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 14629 + }, + { + "epoch": 0.12759240201636113, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 14630 + }, + { + "epoch": 0.1276011233015297, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 14631 + }, + { + "epoch": 0.1276098445866983, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 14632 + }, + { + "epoch": 0.12761856587186687, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 14633 + }, + { + "epoch": 0.12762728715703547, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 14634 + }, + { + "epoch": 0.12763600844220405, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 14635 + }, + { + "epoch": 0.12764472972737262, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 14636 + }, + { + "epoch": 0.12765345101254122, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 14637 + }, + { + "epoch": 0.1276621722977098, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 14638 + }, + { + "epoch": 0.12767089358287836, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 14639 + }, + { + "epoch": 0.12767961486804696, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 14640 + }, + { + "epoch": 0.12768833615321554, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 14641 + }, + { + "epoch": 0.1276970574383841, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 14642 + }, + { + "epoch": 0.1277057787235527, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 14643 + }, + { + "epoch": 0.12771450000872128, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 14644 + }, + { + "epoch": 0.12772322129388985, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 14645 + }, + { + "epoch": 0.12773194257905846, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 14646 + }, + { + "epoch": 0.12774066386422703, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 14647 + }, + { + "epoch": 0.12774938514939563, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 14648 + }, + { + "epoch": 0.1277581064345642, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 14649 + }, + { + "epoch": 0.12776682771973277, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 14650 + }, + { + "epoch": 0.12777554900490137, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 14651 + }, + { + "epoch": 0.12778427029006995, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 14652 + }, + { + "epoch": 0.12779299157523852, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 14653 + }, + { + "epoch": 0.12780171286040712, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 14654 + }, + { + "epoch": 0.1278104341455757, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 14655 + }, + { + "epoch": 0.12781915543074426, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 14656 + }, + { + "epoch": 0.12782787671591286, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 14657 + }, + { + "epoch": 0.12783659800108144, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 14658 + }, + { + "epoch": 0.12784531928625, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 14659 + }, + { + "epoch": 0.1278540405714186, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 14660 + }, + { + "epoch": 0.12786276185658718, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 14661 + }, + { + "epoch": 0.12787148314175578, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 14662 + }, + { + "epoch": 0.12788020442692435, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 14663 + }, + { + "epoch": 0.12788892571209293, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 14664 + }, + { + "epoch": 0.12789764699726153, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 14665 + }, + { + "epoch": 0.1279063682824301, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 14666 + }, + { + "epoch": 0.12791508956759867, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 14667 + }, + { + "epoch": 0.12792381085276727, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 14668 + }, + { + "epoch": 0.12793253213793584, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 14669 + }, + { + "epoch": 0.12794125342310442, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 14670 + }, + { + "epoch": 0.12794997470827302, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 14671 + }, + { + "epoch": 0.1279586959934416, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 14672 + }, + { + "epoch": 0.12796741727861016, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 14673 + }, + { + "epoch": 0.12797613856377876, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 14674 + }, + { + "epoch": 0.12798485984894734, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 14675 + }, + { + "epoch": 0.12799358113411594, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 14676 + }, + { + "epoch": 0.1280023024192845, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 14677 + }, + { + "epoch": 0.12801102370445308, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 14678 + }, + { + "epoch": 0.12801974498962168, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 14679 + }, + { + "epoch": 0.12802846627479025, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 14680 + }, + { + "epoch": 0.12803718755995883, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 14681 + }, + { + "epoch": 0.12804590884512743, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 14682 + }, + { + "epoch": 0.128054630130296, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 14683 + }, + { + "epoch": 0.12806335141546457, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 14684 + }, + { + "epoch": 0.12807207270063317, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 14685 + }, + { + "epoch": 0.12808079398580174, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 14686 + }, + { + "epoch": 0.12808951527097032, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 14687 + }, + { + "epoch": 0.12809823655613892, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 14688 + }, + { + "epoch": 0.1281069578413075, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 14689 + }, + { + "epoch": 0.1281156791264761, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 14690 + }, + { + "epoch": 0.12812440041164466, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 14691 + }, + { + "epoch": 0.12813312169681323, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 14692 + }, + { + "epoch": 0.12814184298198183, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 14693 + }, + { + "epoch": 0.1281505642671504, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 14694 + }, + { + "epoch": 0.12815928555231898, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 14695 + }, + { + "epoch": 0.12816800683748758, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 14696 + }, + { + "epoch": 0.12817672812265615, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 14697 + }, + { + "epoch": 0.12818544940782473, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 14698 + }, + { + "epoch": 0.12819417069299333, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 14699 + }, + { + "epoch": 0.1282028919781619, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 14700 + }, + { + "epoch": 0.12821161326333047, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 14701 + }, + { + "epoch": 0.12822033454849907, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 14702 + }, + { + "epoch": 0.12822905583366764, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 14703 + }, + { + "epoch": 0.12823777711883624, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 14704 + }, + { + "epoch": 0.12824649840400482, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 14705 + }, + { + "epoch": 0.1282552196891734, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 14706 + }, + { + "epoch": 0.128263940974342, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 14707 + }, + { + "epoch": 0.12827266225951056, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 14708 + }, + { + "epoch": 0.12828138354467913, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 14709 + }, + { + "epoch": 0.12829010482984773, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 14710 + }, + { + "epoch": 0.1282988261150163, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 14711 + }, + { + "epoch": 0.12830754740018488, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 14712 + }, + { + "epoch": 0.12831626868535348, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 14713 + }, + { + "epoch": 0.12832498997052205, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 14714 + }, + { + "epoch": 0.12833371125569065, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 14715 + }, + { + "epoch": 0.12834243254085922, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 14716 + }, + { + "epoch": 0.1283511538260278, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 14717 + }, + { + "epoch": 0.1283598751111964, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 14718 + }, + { + "epoch": 0.12836859639636497, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 14719 + }, + { + "epoch": 0.12837731768153354, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 14720 + }, + { + "epoch": 0.12838603896670214, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 14721 + }, + { + "epoch": 0.12839476025187072, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 14722 + }, + { + "epoch": 0.1284034815370393, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 14723 + }, + { + "epoch": 0.1284122028222079, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 14724 + }, + { + "epoch": 0.12842092410737646, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 14725 + }, + { + "epoch": 0.12842964539254503, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 14726 + }, + { + "epoch": 0.12843836667771363, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 14727 + }, + { + "epoch": 0.1284470879628822, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 14728 + }, + { + "epoch": 0.1284558092480508, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 14729 + }, + { + "epoch": 0.12846453053321938, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 14730 + }, + { + "epoch": 0.12847325181838795, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 14731 + }, + { + "epoch": 0.12848197310355655, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 14732 + }, + { + "epoch": 0.12849069438872512, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 14733 + }, + { + "epoch": 0.1284994156738937, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 14734 + }, + { + "epoch": 0.1285081369590623, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 14735 + }, + { + "epoch": 0.12851685824423087, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 14736 + }, + { + "epoch": 0.12852557952939944, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 14737 + }, + { + "epoch": 0.12853430081456804, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 14738 + }, + { + "epoch": 0.12854302209973661, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 14739 + }, + { + "epoch": 0.1285517433849052, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 14740 + }, + { + "epoch": 0.1285604646700738, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 14741 + }, + { + "epoch": 0.12856918595524236, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 14742 + }, + { + "epoch": 0.12857790724041096, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 14743 + }, + { + "epoch": 0.12858662852557953, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 14744 + }, + { + "epoch": 0.1285953498107481, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 14745 + }, + { + "epoch": 0.1286040710959167, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 14746 + }, + { + "epoch": 0.12861279238108528, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 14747 + }, + { + "epoch": 0.12862151366625385, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 14748 + }, + { + "epoch": 0.12863023495142245, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 14749 + }, + { + "epoch": 0.12863895623659102, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 14750 + }, + { + "epoch": 0.1286476775217596, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 14751 + }, + { + "epoch": 0.1286563988069282, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 14752 + }, + { + "epoch": 0.12866512009209677, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 14753 + }, + { + "epoch": 0.12867384137726534, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 14754 + }, + { + "epoch": 0.12868256266243394, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 14755 + }, + { + "epoch": 0.12869128394760251, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 14756 + }, + { + "epoch": 0.12870000523277111, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 14757 + }, + { + "epoch": 0.1287087265179397, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 14758 + }, + { + "epoch": 0.12871744780310826, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 14759 + }, + { + "epoch": 0.12872616908827686, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 14760 + }, + { + "epoch": 0.12873489037344543, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 14761 + }, + { + "epoch": 0.128743611658614, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 14762 + }, + { + "epoch": 0.1287523329437826, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 14763 + }, + { + "epoch": 0.12876105422895118, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 14764 + }, + { + "epoch": 0.12876977551411975, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 14765 + }, + { + "epoch": 0.12877849679928835, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 14766 + }, + { + "epoch": 0.12878721808445692, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 14767 + }, + { + "epoch": 0.1287959393696255, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 14768 + }, + { + "epoch": 0.1288046606547941, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 14769 + }, + { + "epoch": 0.12881338193996267, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 14770 + }, + { + "epoch": 0.12882210322513127, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 14771 + }, + { + "epoch": 0.12883082451029984, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 14772 + }, + { + "epoch": 0.1288395457954684, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 14773 + }, + { + "epoch": 0.128848267080637, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 14774 + }, + { + "epoch": 0.12885698836580559, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 14775 + }, + { + "epoch": 0.12886570965097416, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 14776 + }, + { + "epoch": 0.12887443093614276, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 14777 + }, + { + "epoch": 0.12888315222131133, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 14778 + }, + { + "epoch": 0.1288918735064799, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 14779 + }, + { + "epoch": 0.1289005947916485, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 14780 + }, + { + "epoch": 0.12890931607681708, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 14781 + }, + { + "epoch": 0.12891803736198565, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 14782 + }, + { + "epoch": 0.12892675864715425, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 14783 + }, + { + "epoch": 0.12893547993232282, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 14784 + }, + { + "epoch": 0.12894420121749142, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 14785 + }, + { + "epoch": 0.12895292250266, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 14786 + }, + { + "epoch": 0.12896164378782857, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 14787 + }, + { + "epoch": 0.12897036507299717, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 14788 + }, + { + "epoch": 0.12897908635816574, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 14789 + }, + { + "epoch": 0.1289878076433343, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 14790 + }, + { + "epoch": 0.1289965289285029, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 14791 + }, + { + "epoch": 0.12900525021367149, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 14792 + }, + { + "epoch": 0.12901397149884006, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 14793 + }, + { + "epoch": 0.12902269278400866, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 14794 + }, + { + "epoch": 0.12903141406917723, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 14795 + }, + { + "epoch": 0.1290401353543458, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 14796 + }, + { + "epoch": 0.1290488566395144, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 14797 + }, + { + "epoch": 0.12905757792468298, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 14798 + }, + { + "epoch": 0.12906629920985158, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 14799 + }, + { + "epoch": 0.12907502049502015, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 14800 + }, + { + "epoch": 0.12908374178018872, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 14801 + }, + { + "epoch": 0.12909246306535732, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 14802 + }, + { + "epoch": 0.1291011843505259, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 14803 + }, + { + "epoch": 0.12910990563569447, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 14804 + }, + { + "epoch": 0.12911862692086307, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 14805 + }, + { + "epoch": 0.12912734820603164, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 14806 + }, + { + "epoch": 0.1291360694912002, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 14807 + }, + { + "epoch": 0.1291447907763688, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 14808 + }, + { + "epoch": 0.12915351206153738, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 14809 + }, + { + "epoch": 0.12916223334670596, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 14810 + }, + { + "epoch": 0.12917095463187456, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 14811 + }, + { + "epoch": 0.12917967591704313, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 14812 + }, + { + "epoch": 0.12918839720221173, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 14813 + }, + { + "epoch": 0.1291971184873803, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 14814 + }, + { + "epoch": 0.12920583977254887, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 14815 + }, + { + "epoch": 0.12921456105771748, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 14816 + }, + { + "epoch": 0.12922328234288605, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 14817 + }, + { + "epoch": 0.12923200362805462, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 14818 + }, + { + "epoch": 0.12924072491322322, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 14819 + }, + { + "epoch": 0.1292494461983918, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 14820 + }, + { + "epoch": 0.12925816748356037, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 14821 + }, + { + "epoch": 0.12926688876872897, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 14822 + }, + { + "epoch": 0.12927561005389754, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 14823 + }, + { + "epoch": 0.12928433133906614, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 14824 + }, + { + "epoch": 0.1292930526242347, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 14825 + }, + { + "epoch": 0.12930177390940328, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 14826 + }, + { + "epoch": 0.12931049519457188, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 14827 + }, + { + "epoch": 0.12931921647974046, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 14828 + }, + { + "epoch": 0.12932793776490903, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 14829 + }, + { + "epoch": 0.12933665905007763, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 14830 + }, + { + "epoch": 0.1293453803352462, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 14831 + }, + { + "epoch": 0.12935410162041477, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 14832 + }, + { + "epoch": 0.12936282290558337, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 14833 + }, + { + "epoch": 0.12937154419075195, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 14834 + }, + { + "epoch": 0.12938026547592052, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 14835 + }, + { + "epoch": 0.12938898676108912, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 14836 + }, + { + "epoch": 0.1293977080462577, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 14837 + }, + { + "epoch": 0.1294064293314263, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 14838 + }, + { + "epoch": 0.12941515061659487, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 14839 + }, + { + "epoch": 0.12942387190176344, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 14840 + }, + { + "epoch": 0.12943259318693204, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 14841 + }, + { + "epoch": 0.1294413144721006, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 14842 + }, + { + "epoch": 0.12945003575726918, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 14843 + }, + { + "epoch": 0.12945875704243778, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 14844 + }, + { + "epoch": 0.12946747832760636, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 14845 + }, + { + "epoch": 0.12947619961277493, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 14846 + }, + { + "epoch": 0.12948492089794353, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 14847 + }, + { + "epoch": 0.1294936421831121, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 14848 + }, + { + "epoch": 0.12950236346828067, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 14849 + }, + { + "epoch": 0.12951108475344927, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 14850 + }, + { + "epoch": 0.12951980603861785, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 14851 + }, + { + "epoch": 0.12952852732378645, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 14852 + }, + { + "epoch": 0.12953724860895502, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 14853 + }, + { + "epoch": 0.1295459698941236, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 14854 + }, + { + "epoch": 0.1295546911792922, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 14855 + }, + { + "epoch": 0.12956341246446076, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 14856 + }, + { + "epoch": 0.12957213374962934, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 14857 + }, + { + "epoch": 0.12958085503479794, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 14858 + }, + { + "epoch": 0.1295895763199665, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 14859 + }, + { + "epoch": 0.12959829760513508, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 14860 + }, + { + "epoch": 0.12960701889030368, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 14861 + }, + { + "epoch": 0.12961574017547225, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 14862 + }, + { + "epoch": 0.12962446146064083, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 14863 + }, + { + "epoch": 0.12963318274580943, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 14864 + }, + { + "epoch": 0.129641904030978, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 14865 + }, + { + "epoch": 0.1296506253161466, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 14866 + }, + { + "epoch": 0.12965934660131517, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 14867 + }, + { + "epoch": 0.12966806788648375, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 14868 + }, + { + "epoch": 0.12967678917165235, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 14869 + }, + { + "epoch": 0.12968551045682092, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 14870 + }, + { + "epoch": 0.1296942317419895, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 14871 + }, + { + "epoch": 0.1297029530271581, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 14872 + }, + { + "epoch": 0.12971167431232666, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 14873 + }, + { + "epoch": 0.12972039559749524, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 14874 + }, + { + "epoch": 0.12972911688266384, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 14875 + }, + { + "epoch": 0.1297378381678324, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 14876 + }, + { + "epoch": 0.12974655945300098, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 14877 + }, + { + "epoch": 0.12975528073816958, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 14878 + }, + { + "epoch": 0.12976400202333815, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 14879 + }, + { + "epoch": 0.12977272330850675, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 14880 + }, + { + "epoch": 0.12978144459367533, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 14881 + }, + { + "epoch": 0.1297901658788439, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 14882 + }, + { + "epoch": 0.1297988871640125, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 14883 + }, + { + "epoch": 0.12980760844918107, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 14884 + }, + { + "epoch": 0.12981632973434964, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 14885 + }, + { + "epoch": 0.12982505101951824, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 14886 + }, + { + "epoch": 0.12983377230468682, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 14887 + }, + { + "epoch": 0.1298424935898554, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 14888 + }, + { + "epoch": 0.129851214875024, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 14889 + }, + { + "epoch": 0.12985993616019256, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 14890 + }, + { + "epoch": 0.12986865744536114, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 14891 + }, + { + "epoch": 0.12987737873052974, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 14892 + }, + { + "epoch": 0.1298861000156983, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 14893 + }, + { + "epoch": 0.1298948213008669, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 14894 + }, + { + "epoch": 0.12990354258603548, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 14895 + }, + { + "epoch": 0.12991226387120405, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 14896 + }, + { + "epoch": 0.12992098515637265, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 14897 + }, + { + "epoch": 0.12992970644154123, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 14898 + }, + { + "epoch": 0.1299384277267098, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 14899 + }, + { + "epoch": 0.1299471490118784, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 14900 + }, + { + "epoch": 0.12995587029704697, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 14901 + }, + { + "epoch": 0.12996459158221554, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 14902 + }, + { + "epoch": 0.12997331286738414, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 14903 + }, + { + "epoch": 0.12998203415255272, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 14904 + }, + { + "epoch": 0.1299907554377213, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 14905 + }, + { + "epoch": 0.1299994767228899, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 14906 + }, + { + "epoch": 0.13000819800805846, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 14907 + }, + { + "epoch": 0.13001691929322706, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 14908 + }, + { + "epoch": 0.13002564057839563, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 14909 + }, + { + "epoch": 0.1300343618635642, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 14910 + }, + { + "epoch": 0.1300430831487328, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 14911 + }, + { + "epoch": 0.13005180443390138, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 14912 + }, + { + "epoch": 0.13006052571906995, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 14913 + }, + { + "epoch": 0.13006924700423855, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 14914 + }, + { + "epoch": 0.13007796828940713, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 14915 + }, + { + "epoch": 0.1300866895745757, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 14916 + }, + { + "epoch": 0.1300954108597443, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 14917 + }, + { + "epoch": 0.13010413214491287, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 14918 + }, + { + "epoch": 0.13011285343008144, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 14919 + }, + { + "epoch": 0.13012157471525004, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 14920 + }, + { + "epoch": 0.13013029600041862, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 14921 + }, + { + "epoch": 0.13013901728558722, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 14922 + }, + { + "epoch": 0.1301477385707558, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 14923 + }, + { + "epoch": 0.13015645985592436, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 14924 + }, + { + "epoch": 0.13016518114109296, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 14925 + }, + { + "epoch": 0.13017390242626153, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 14926 + }, + { + "epoch": 0.1301826237114301, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 14927 + }, + { + "epoch": 0.1301913449965987, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 14928 + }, + { + "epoch": 0.13020006628176728, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 14929 + }, + { + "epoch": 0.13020878756693585, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 14930 + }, + { + "epoch": 0.13021750885210445, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 14931 + }, + { + "epoch": 0.13022623013727302, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 14932 + }, + { + "epoch": 0.1302349514224416, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 14933 + }, + { + "epoch": 0.1302436727076102, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 14934 + }, + { + "epoch": 0.13025239399277877, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 14935 + }, + { + "epoch": 0.13026111527794737, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 14936 + }, + { + "epoch": 0.13026983656311594, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 14937 + }, + { + "epoch": 0.13027855784828452, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 14938 + }, + { + "epoch": 0.13028727913345312, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 14939 + }, + { + "epoch": 0.1302960004186217, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 14940 + }, + { + "epoch": 0.13030472170379026, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 14941 + }, + { + "epoch": 0.13031344298895886, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 14942 + }, + { + "epoch": 0.13032216427412743, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 14943 + }, + { + "epoch": 0.130330885559296, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 14944 + }, + { + "epoch": 0.1303396068444646, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 14945 + }, + { + "epoch": 0.13034832812963318, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 14946 + }, + { + "epoch": 0.13035704941480178, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 14947 + }, + { + "epoch": 0.13036577069997035, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 14948 + }, + { + "epoch": 0.13037449198513892, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 14949 + }, + { + "epoch": 0.13038321327030752, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 14950 + }, + { + "epoch": 0.1303919345554761, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 14951 + }, + { + "epoch": 0.13040065584064467, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 14952 + }, + { + "epoch": 0.13040937712581327, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 14953 + }, + { + "epoch": 0.13041809841098184, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 14954 + }, + { + "epoch": 0.13042681969615041, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 14955 + }, + { + "epoch": 0.13043554098131901, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 14956 + }, + { + "epoch": 0.1304442622664876, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 14957 + }, + { + "epoch": 0.13045298355165616, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 14958 + }, + { + "epoch": 0.13046170483682476, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 14959 + }, + { + "epoch": 0.13047042612199333, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 14960 + }, + { + "epoch": 0.13047914740716193, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 14961 + }, + { + "epoch": 0.1304878686923305, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 14962 + }, + { + "epoch": 0.13049658997749908, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 14963 + }, + { + "epoch": 0.13050531126266768, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 14964 + }, + { + "epoch": 0.13051403254783625, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 14965 + }, + { + "epoch": 0.13052275383300482, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 14966 + }, + { + "epoch": 0.13053147511817342, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 14967 + }, + { + "epoch": 0.130540196403342, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 14968 + }, + { + "epoch": 0.13054891768851057, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 14969 + }, + { + "epoch": 0.13055763897367917, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 14970 + }, + { + "epoch": 0.13056636025884774, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 14971 + }, + { + "epoch": 0.1305750815440163, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 14972 + }, + { + "epoch": 0.13058380282918491, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 14973 + }, + { + "epoch": 0.1305925241143535, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 14974 + }, + { + "epoch": 0.1306012453995221, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 14975 + }, + { + "epoch": 0.13060996668469066, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 14976 + }, + { + "epoch": 0.13061868796985923, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 14977 + }, + { + "epoch": 0.13062740925502783, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 14978 + }, + { + "epoch": 0.1306361305401964, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 14979 + }, + { + "epoch": 0.13064485182536498, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 14980 + }, + { + "epoch": 0.13065357311053358, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 14981 + }, + { + "epoch": 0.13066229439570215, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 14982 + }, + { + "epoch": 0.13067101568087072, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 14983 + }, + { + "epoch": 0.13067973696603932, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 14984 + }, + { + "epoch": 0.1306884582512079, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 14985 + }, + { + "epoch": 0.13069717953637647, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 14986 + }, + { + "epoch": 0.13070590082154507, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 14987 + }, + { + "epoch": 0.13071462210671364, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 14988 + }, + { + "epoch": 0.13072334339188224, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 14989 + }, + { + "epoch": 0.1307320646770508, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 14990 + }, + { + "epoch": 0.13074078596221939, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 14991 + }, + { + "epoch": 0.13074950724738799, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 14992 + }, + { + "epoch": 0.13075822853255656, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 14993 + }, + { + "epoch": 0.13076694981772513, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 14994 + }, + { + "epoch": 0.13077567110289373, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 14995 + }, + { + "epoch": 0.1307843923880623, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 14996 + }, + { + "epoch": 0.13079311367323088, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 14997 + }, + { + "epoch": 0.13080183495839948, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 14998 + }, + { + "epoch": 0.13081055624356805, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 14999 + }, + { + "epoch": 0.13081927752873662, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 15000 + }, + { + "epoch": 0.13082799881390522, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 15001 + }, + { + "epoch": 0.1308367200990738, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 15002 + }, + { + "epoch": 0.1308454413842424, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 15003 + }, + { + "epoch": 0.13085416266941097, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 15004 + }, + { + "epoch": 0.13086288395457954, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 15005 + }, + { + "epoch": 0.13087160523974814, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 15006 + }, + { + "epoch": 0.1308803265249167, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 15007 + }, + { + "epoch": 0.13088904781008528, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 15008 + }, + { + "epoch": 0.13089776909525389, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 15009 + }, + { + "epoch": 0.13090649038042246, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 15010 + }, + { + "epoch": 0.13091521166559103, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 15011 + }, + { + "epoch": 0.13092393295075963, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 15012 + }, + { + "epoch": 0.1309326542359282, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 15013 + }, + { + "epoch": 0.13094137552109678, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 15014 + }, + { + "epoch": 0.13095009680626538, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 15015 + }, + { + "epoch": 0.13095881809143395, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 15016 + }, + { + "epoch": 0.13096753937660255, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 15017 + }, + { + "epoch": 0.13097626066177112, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 15018 + }, + { + "epoch": 0.1309849819469397, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 15019 + }, + { + "epoch": 0.1309937032321083, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 15020 + }, + { + "epoch": 0.13100242451727687, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 15021 + }, + { + "epoch": 0.13101114580244544, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 15022 + }, + { + "epoch": 0.13101986708761404, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 15023 + }, + { + "epoch": 0.1310285883727826, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 15024 + }, + { + "epoch": 0.13103730965795118, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 15025 + }, + { + "epoch": 0.13104603094311978, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 15026 + }, + { + "epoch": 0.13105475222828836, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 15027 + }, + { + "epoch": 0.13106347351345693, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 15028 + }, + { + "epoch": 0.13107219479862553, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 15029 + }, + { + "epoch": 0.1310809160837941, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 15030 + }, + { + "epoch": 0.1310896373689627, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 15031 + }, + { + "epoch": 0.13109835865413128, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 15032 + }, + { + "epoch": 0.13110707993929985, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 15033 + }, + { + "epoch": 0.13111580122446845, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 15034 + }, + { + "epoch": 0.13112452250963702, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 15035 + }, + { + "epoch": 0.1311332437948056, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 15036 + }, + { + "epoch": 0.1311419650799742, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 15037 + }, + { + "epoch": 0.13115068636514277, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 15038 + }, + { + "epoch": 0.13115940765031134, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 15039 + }, + { + "epoch": 0.13116812893547994, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 15040 + }, + { + "epoch": 0.1311768502206485, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 15041 + }, + { + "epoch": 0.13118557150581708, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 15042 + }, + { + "epoch": 0.13119429279098568, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 15043 + }, + { + "epoch": 0.13120301407615426, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 15044 + }, + { + "epoch": 0.13121173536132286, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 15045 + }, + { + "epoch": 0.13122045664649143, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 15046 + }, + { + "epoch": 0.13122917793166, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 15047 + }, + { + "epoch": 0.1312378992168286, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 15048 + }, + { + "epoch": 0.13124662050199717, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 15049 + }, + { + "epoch": 0.13125534178716575, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 15050 + }, + { + "epoch": 0.13126406307233435, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 15051 + }, + { + "epoch": 0.13127278435750292, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 15052 + }, + { + "epoch": 0.1312815056426715, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 15053 + }, + { + "epoch": 0.1312902269278401, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 15054 + }, + { + "epoch": 0.13129894821300866, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 15055 + }, + { + "epoch": 0.13130766949817727, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 15056 + }, + { + "epoch": 0.13131639078334584, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 15057 + }, + { + "epoch": 0.1313251120685144, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 15058 + }, + { + "epoch": 0.131333833353683, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 15059 + }, + { + "epoch": 0.13134255463885158, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 15060 + }, + { + "epoch": 0.13135127592402016, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 15061 + }, + { + "epoch": 0.13135999720918876, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 15062 + }, + { + "epoch": 0.13136871849435733, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 15063 + }, + { + "epoch": 0.1313774397795259, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 15064 + }, + { + "epoch": 0.1313861610646945, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 15065 + }, + { + "epoch": 0.13139488234986307, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 15066 + }, + { + "epoch": 0.13140360363503165, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 15067 + }, + { + "epoch": 0.13141232492020025, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 15068 + }, + { + "epoch": 0.13142104620536882, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 15069 + }, + { + "epoch": 0.13142976749053742, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 15070 + }, + { + "epoch": 0.131438488775706, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 15071 + }, + { + "epoch": 0.13144721006087456, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 15072 + }, + { + "epoch": 0.13145593134604316, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 15073 + }, + { + "epoch": 0.13146465263121174, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 15074 + }, + { + "epoch": 0.1314733739163803, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 15075 + }, + { + "epoch": 0.1314820952015489, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 15076 + }, + { + "epoch": 0.13149081648671748, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 15077 + }, + { + "epoch": 0.13149953777188605, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 15078 + }, + { + "epoch": 0.13150825905705466, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 15079 + }, + { + "epoch": 0.13151698034222323, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 15080 + }, + { + "epoch": 0.1315257016273918, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 15081 + }, + { + "epoch": 0.1315344229125604, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 15082 + }, + { + "epoch": 0.13154314419772897, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 15083 + }, + { + "epoch": 0.13155186548289757, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 15084 + }, + { + "epoch": 0.13156058676806615, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 15085 + }, + { + "epoch": 0.13156930805323472, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 15086 + }, + { + "epoch": 0.13157802933840332, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 15087 + }, + { + "epoch": 0.1315867506235719, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 15088 + }, + { + "epoch": 0.13159547190874046, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 15089 + }, + { + "epoch": 0.13160419319390906, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 15090 + }, + { + "epoch": 0.13161291447907764, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 15091 + }, + { + "epoch": 0.1316216357642462, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 15092 + }, + { + "epoch": 0.1316303570494148, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 15093 + }, + { + "epoch": 0.13163907833458338, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 15094 + }, + { + "epoch": 0.13164779961975195, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 15095 + }, + { + "epoch": 0.13165652090492055, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 15096 + }, + { + "epoch": 0.13166524219008913, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 15097 + }, + { + "epoch": 0.13167396347525773, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 15098 + }, + { + "epoch": 0.1316826847604263, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 15099 + }, + { + "epoch": 0.13169140604559487, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 15100 + }, + { + "epoch": 0.13170012733076347, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 15101 + }, + { + "epoch": 0.13170884861593204, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 15102 + }, + { + "epoch": 0.13171756990110062, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 15103 + }, + { + "epoch": 0.13172629118626922, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 15104 + }, + { + "epoch": 0.1317350124714378, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 15105 + }, + { + "epoch": 0.13174373375660636, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 15106 + }, + { + "epoch": 0.13175245504177496, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 15107 + }, + { + "epoch": 0.13176117632694354, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 15108 + }, + { + "epoch": 0.1317698976121121, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 15109 + }, + { + "epoch": 0.1317786188972807, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 15110 + }, + { + "epoch": 0.13178734018244928, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 15111 + }, + { + "epoch": 0.13179606146761788, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 15112 + }, + { + "epoch": 0.13180478275278645, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 15113 + }, + { + "epoch": 0.13181350403795503, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 15114 + }, + { + "epoch": 0.13182222532312363, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 15115 + }, + { + "epoch": 0.1318309466082922, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 15116 + }, + { + "epoch": 0.13183966789346077, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 15117 + }, + { + "epoch": 0.13184838917862937, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 15118 + }, + { + "epoch": 0.13185711046379794, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 15119 + }, + { + "epoch": 0.13186583174896652, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 15120 + }, + { + "epoch": 0.13187455303413512, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 15121 + }, + { + "epoch": 0.1318832743193037, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 15122 + }, + { + "epoch": 0.13189199560447226, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 15123 + }, + { + "epoch": 0.13190071688964086, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 15124 + }, + { + "epoch": 0.13190943817480943, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 15125 + }, + { + "epoch": 0.13191815945997803, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 15126 + }, + { + "epoch": 0.1319268807451466, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 15127 + }, + { + "epoch": 0.13193560203031518, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 15128 + }, + { + "epoch": 0.13194432331548378, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 15129 + }, + { + "epoch": 0.13195304460065235, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 15130 + }, + { + "epoch": 0.13196176588582093, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 15131 + }, + { + "epoch": 0.13197048717098953, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 15132 + }, + { + "epoch": 0.1319792084561581, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 15133 + }, + { + "epoch": 0.13198792974132667, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 15134 + }, + { + "epoch": 0.13199665102649527, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 15135 + }, + { + "epoch": 0.13200537231166384, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 15136 + }, + { + "epoch": 0.13201409359683242, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 15137 + }, + { + "epoch": 0.13202281488200102, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 15138 + }, + { + "epoch": 0.1320315361671696, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 15139 + }, + { + "epoch": 0.1320402574523382, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 15140 + }, + { + "epoch": 0.13204897873750676, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 15141 + }, + { + "epoch": 0.13205770002267533, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 15142 + }, + { + "epoch": 0.13206642130784393, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 15143 + }, + { + "epoch": 0.1320751425930125, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 15144 + }, + { + "epoch": 0.13208386387818108, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 15145 + }, + { + "epoch": 0.13209258516334968, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 15146 + }, + { + "epoch": 0.13210130644851825, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 15147 + }, + { + "epoch": 0.13211002773368682, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 15148 + }, + { + "epoch": 0.13211874901885542, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 15149 + }, + { + "epoch": 0.132127470304024, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 15150 + }, + { + "epoch": 0.13213619158919257, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 15151 + }, + { + "epoch": 0.13214491287436117, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 15152 + }, + { + "epoch": 0.13215363415952974, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 15153 + }, + { + "epoch": 0.13216235544469834, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 15154 + }, + { + "epoch": 0.13217107672986692, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 15155 + }, + { + "epoch": 0.1321797980150355, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 15156 + }, + { + "epoch": 0.1321885193002041, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 15157 + }, + { + "epoch": 0.13219724058537266, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 15158 + }, + { + "epoch": 0.13220596187054123, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 15159 + }, + { + "epoch": 0.13221468315570983, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 15160 + }, + { + "epoch": 0.1322234044408784, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 15161 + }, + { + "epoch": 0.13223212572604698, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 15162 + }, + { + "epoch": 0.13224084701121558, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 15163 + }, + { + "epoch": 0.13224956829638415, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 15164 + }, + { + "epoch": 0.13225828958155272, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 15165 + }, + { + "epoch": 0.13226701086672132, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 15166 + }, + { + "epoch": 0.1322757321518899, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 15167 + }, + { + "epoch": 0.1322844534370585, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 15168 + }, + { + "epoch": 0.13229317472222707, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 15169 + }, + { + "epoch": 0.13230189600739564, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 15170 + }, + { + "epoch": 0.13231061729256424, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 15171 + }, + { + "epoch": 0.13231933857773281, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 15172 + }, + { + "epoch": 0.1323280598629014, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 15173 + }, + { + "epoch": 0.13233678114807, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 15174 + }, + { + "epoch": 0.13234550243323856, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 15175 + }, + { + "epoch": 0.13235422371840713, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 15176 + }, + { + "epoch": 0.13236294500357573, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 15177 + }, + { + "epoch": 0.1323716662887443, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 15178 + }, + { + "epoch": 0.1323803875739129, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 15179 + }, + { + "epoch": 0.13238910885908148, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 15180 + }, + { + "epoch": 0.13239783014425005, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 15181 + }, + { + "epoch": 0.13240655142941865, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 15182 + }, + { + "epoch": 0.13241527271458722, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 15183 + }, + { + "epoch": 0.1324239939997558, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 15184 + }, + { + "epoch": 0.1324327152849244, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 15185 + }, + { + "epoch": 0.13244143657009297, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 15186 + }, + { + "epoch": 0.13245015785526154, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 15187 + }, + { + "epoch": 0.13245887914043014, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 15188 + }, + { + "epoch": 0.1324676004255987, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 15189 + }, + { + "epoch": 0.1324763217107673, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 15190 + }, + { + "epoch": 0.1324850429959359, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 15191 + }, + { + "epoch": 0.13249376428110446, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 15192 + }, + { + "epoch": 0.13250248556627306, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 15193 + }, + { + "epoch": 0.13251120685144163, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 15194 + }, + { + "epoch": 0.1325199281366102, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 15195 + }, + { + "epoch": 0.1325286494217788, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 15196 + }, + { + "epoch": 0.13253737070694738, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 15197 + }, + { + "epoch": 0.13254609199211595, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 15198 + }, + { + "epoch": 0.13255481327728455, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 15199 + }, + { + "epoch": 0.13256353456245312, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 15200 + }, + { + "epoch": 0.1325722558476217, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 15201 + }, + { + "epoch": 0.1325809771327903, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 15202 + }, + { + "epoch": 0.13258969841795887, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 15203 + }, + { + "epoch": 0.13259841970312744, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 15204 + }, + { + "epoch": 0.13260714098829604, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 15205 + }, + { + "epoch": 0.1326158622734646, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 15206 + }, + { + "epoch": 0.1326245835586332, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 15207 + }, + { + "epoch": 0.13263330484380179, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 15208 + }, + { + "epoch": 0.13264202612897036, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 15209 + }, + { + "epoch": 0.13265074741413896, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 15210 + }, + { + "epoch": 0.13265946869930753, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 15211 + }, + { + "epoch": 0.1326681899844761, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 15212 + }, + { + "epoch": 0.1326769112696447, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 15213 + }, + { + "epoch": 0.13268563255481328, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 15214 + }, + { + "epoch": 0.13269435383998185, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 15215 + }, + { + "epoch": 0.13270307512515045, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 15216 + }, + { + "epoch": 0.13271179641031902, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 15217 + }, + { + "epoch": 0.1327205176954876, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 15218 + }, + { + "epoch": 0.1327292389806562, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 15219 + }, + { + "epoch": 0.13273796026582477, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 15220 + }, + { + "epoch": 0.13274668155099337, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 15221 + }, + { + "epoch": 0.13275540283616194, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 15222 + }, + { + "epoch": 0.1327641241213305, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 15223 + }, + { + "epoch": 0.1327728454064991, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 15224 + }, + { + "epoch": 0.13278156669166769, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 15225 + }, + { + "epoch": 0.13279028797683626, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 15226 + }, + { + "epoch": 0.13279900926200486, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 15227 + }, + { + "epoch": 0.13280773054717343, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 15228 + }, + { + "epoch": 0.132816451832342, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 15229 + }, + { + "epoch": 0.1328251731175106, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 15230 + }, + { + "epoch": 0.13283389440267918, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 15231 + }, + { + "epoch": 0.13284261568784775, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 15232 + }, + { + "epoch": 0.13285133697301635, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 15233 + }, + { + "epoch": 0.13286005825818492, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 15234 + }, + { + "epoch": 0.13286877954335352, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 15235 + }, + { + "epoch": 0.1328775008285221, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 15236 + }, + { + "epoch": 0.13288622211369067, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 15237 + }, + { + "epoch": 0.13289494339885927, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 15238 + }, + { + "epoch": 0.13290366468402784, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 15239 + }, + { + "epoch": 0.1329123859691964, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 15240 + }, + { + "epoch": 0.132921107254365, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 15241 + }, + { + "epoch": 0.13292982853953358, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 15242 + }, + { + "epoch": 0.13293854982470216, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 15243 + }, + { + "epoch": 0.13294727110987076, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 15244 + }, + { + "epoch": 0.13295599239503933, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 15245 + }, + { + "epoch": 0.1329647136802079, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 15246 + }, + { + "epoch": 0.1329734349653765, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 15247 + }, + { + "epoch": 0.13298215625054507, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 15248 + }, + { + "epoch": 0.13299087753571368, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 15249 + }, + { + "epoch": 0.13299959882088225, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 15250 + }, + { + "epoch": 0.13300832010605082, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 15251 + }, + { + "epoch": 0.13301704139121942, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 15252 + }, + { + "epoch": 0.133025762676388, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 15253 + }, + { + "epoch": 0.13303448396155657, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 15254 + }, + { + "epoch": 0.13304320524672517, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 15255 + }, + { + "epoch": 0.13305192653189374, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 15256 + }, + { + "epoch": 0.1330606478170623, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 15257 + }, + { + "epoch": 0.1330693691022309, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 15258 + }, + { + "epoch": 0.13307809038739948, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 15259 + }, + { + "epoch": 0.13308681167256806, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 15260 + }, + { + "epoch": 0.13309553295773666, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 15261 + }, + { + "epoch": 0.13310425424290523, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 15262 + }, + { + "epoch": 0.13311297552807383, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 15263 + }, + { + "epoch": 0.1331216968132424, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 15264 + }, + { + "epoch": 0.13313041809841097, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 15265 + }, + { + "epoch": 0.13313913938357957, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 15266 + }, + { + "epoch": 0.13314786066874815, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 15267 + }, + { + "epoch": 0.13315658195391672, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 15268 + }, + { + "epoch": 0.13316530323908532, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 15269 + }, + { + "epoch": 0.1331740245242539, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 15270 + }, + { + "epoch": 0.13318274580942246, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 15271 + }, + { + "epoch": 0.13319146709459107, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 15272 + }, + { + "epoch": 0.13320018837975964, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 15273 + }, + { + "epoch": 0.1332089096649282, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 15274 + }, + { + "epoch": 0.1332176309500968, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 15275 + }, + { + "epoch": 0.13322635223526538, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 15276 + }, + { + "epoch": 0.13323507352043398, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 15277 + }, + { + "epoch": 0.13324379480560256, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 15278 + }, + { + "epoch": 0.13325251609077113, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 15279 + }, + { + "epoch": 0.13326123737593973, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 15280 + }, + { + "epoch": 0.1332699586611083, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 15281 + }, + { + "epoch": 0.13327867994627687, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 15282 + }, + { + "epoch": 0.13328740123144547, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 15283 + }, + { + "epoch": 0.13329612251661405, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 15284 + }, + { + "epoch": 0.13330484380178262, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 15285 + }, + { + "epoch": 0.13331356508695122, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 15286 + }, + { + "epoch": 0.1333222863721198, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 15287 + }, + { + "epoch": 0.1333310076572884, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 15288 + }, + { + "epoch": 0.13333972894245696, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 15289 + }, + { + "epoch": 0.13334845022762554, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 15290 + }, + { + "epoch": 0.13335717151279414, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 15291 + }, + { + "epoch": 0.1333658927979627, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 15292 + }, + { + "epoch": 0.13337461408313128, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 15293 + }, + { + "epoch": 0.13338333536829988, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 15294 + }, + { + "epoch": 0.13339205665346845, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 15295 + }, + { + "epoch": 0.13340077793863703, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 15296 + }, + { + "epoch": 0.13340949922380563, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 15297 + }, + { + "epoch": 0.1334182205089742, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 15298 + }, + { + "epoch": 0.13342694179414277, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 15299 + }, + { + "epoch": 0.13343566307931137, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 15300 + }, + { + "epoch": 0.13344438436447995, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 15301 + }, + { + "epoch": 0.13345310564964855, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 15302 + }, + { + "epoch": 0.13346182693481712, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 15303 + }, + { + "epoch": 0.1334705482199857, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 15304 + }, + { + "epoch": 0.1334792695051543, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 15305 + }, + { + "epoch": 0.13348799079032286, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 15306 + }, + { + "epoch": 0.13349671207549144, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 15307 + }, + { + "epoch": 0.13350543336066004, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 15308 + }, + { + "epoch": 0.1335141546458286, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 15309 + }, + { + "epoch": 0.13352287593099718, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 15310 + }, + { + "epoch": 0.13353159721616578, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 15311 + }, + { + "epoch": 0.13354031850133435, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 15312 + }, + { + "epoch": 0.13354903978650293, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 15313 + }, + { + "epoch": 0.13355776107167153, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 15314 + }, + { + "epoch": 0.1335664823568401, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 15315 + }, + { + "epoch": 0.1335752036420087, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 15316 + }, + { + "epoch": 0.13358392492717727, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 15317 + }, + { + "epoch": 0.13359264621234584, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 15318 + }, + { + "epoch": 0.13360136749751444, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 15319 + }, + { + "epoch": 0.13361008878268302, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 15320 + }, + { + "epoch": 0.1336188100678516, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 15321 + }, + { + "epoch": 0.1336275313530202, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 15322 + }, + { + "epoch": 0.13363625263818876, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 15323 + }, + { + "epoch": 0.13364497392335734, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 15324 + }, + { + "epoch": 0.13365369520852594, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 15325 + }, + { + "epoch": 0.1336624164936945, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 15326 + }, + { + "epoch": 0.13367113777886308, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 15327 + }, + { + "epoch": 0.13367985906403168, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 15328 + }, + { + "epoch": 0.13368858034920025, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 15329 + }, + { + "epoch": 0.13369730163436885, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 15330 + }, + { + "epoch": 0.13370602291953743, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 15331 + }, + { + "epoch": 0.133714744204706, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 15332 + }, + { + "epoch": 0.1337234654898746, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 15333 + }, + { + "epoch": 0.13373218677504317, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 15334 + }, + { + "epoch": 0.13374090806021174, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 15335 + }, + { + "epoch": 0.13374962934538034, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 15336 + }, + { + "epoch": 0.13375835063054892, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 15337 + }, + { + "epoch": 0.1337670719157175, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 15338 + }, + { + "epoch": 0.1337757932008861, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 15339 + }, + { + "epoch": 0.13378451448605466, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 15340 + }, + { + "epoch": 0.13379323577122323, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 15341 + }, + { + "epoch": 0.13380195705639183, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 15342 + }, + { + "epoch": 0.1338106783415604, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 15343 + }, + { + "epoch": 0.133819399626729, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 15344 + }, + { + "epoch": 0.13382812091189758, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 15345 + }, + { + "epoch": 0.13383684219706615, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 15346 + }, + { + "epoch": 0.13384556348223475, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 15347 + }, + { + "epoch": 0.13385428476740333, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 15348 + }, + { + "epoch": 0.1338630060525719, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 15349 + }, + { + "epoch": 0.1338717273377405, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 15350 + }, + { + "epoch": 0.13388044862290907, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 15351 + }, + { + "epoch": 0.13388916990807764, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 15352 + }, + { + "epoch": 0.13389789119324624, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 15353 + }, + { + "epoch": 0.13390661247841482, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 15354 + }, + { + "epoch": 0.1339153337635834, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 15355 + }, + { + "epoch": 0.133924055048752, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 15356 + }, + { + "epoch": 0.13393277633392056, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 15357 + }, + { + "epoch": 0.13394149761908916, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 15358 + }, + { + "epoch": 0.13395021890425773, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 15359 + }, + { + "epoch": 0.1339589401894263, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 15360 + }, + { + "epoch": 0.1339676614745949, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 15361 + }, + { + "epoch": 0.13397638275976348, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 15362 + }, + { + "epoch": 0.13398510404493205, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 15363 + }, + { + "epoch": 0.13399382533010065, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 15364 + }, + { + "epoch": 0.13400254661526922, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 15365 + }, + { + "epoch": 0.1340112679004378, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 15366 + }, + { + "epoch": 0.1340199891856064, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 15367 + }, + { + "epoch": 0.13402871047077497, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 15368 + }, + { + "epoch": 0.13403743175594354, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 15369 + }, + { + "epoch": 0.13404615304111214, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 15370 + }, + { + "epoch": 0.13405487432628072, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 15371 + }, + { + "epoch": 0.13406359561144932, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 15372 + }, + { + "epoch": 0.1340723168966179, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 15373 + }, + { + "epoch": 0.13408103818178646, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 15374 + }, + { + "epoch": 0.13408975946695506, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 15375 + }, + { + "epoch": 0.13409848075212363, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 15376 + }, + { + "epoch": 0.1341072020372922, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 15377 + }, + { + "epoch": 0.1341159233224608, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 15378 + }, + { + "epoch": 0.13412464460762938, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 15379 + }, + { + "epoch": 0.13413336589279795, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 15380 + }, + { + "epoch": 0.13414208717796655, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 15381 + }, + { + "epoch": 0.13415080846313512, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 15382 + }, + { + "epoch": 0.1341595297483037, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 15383 + }, + { + "epoch": 0.1341682510334723, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 15384 + }, + { + "epoch": 0.13417697231864087, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 15385 + }, + { + "epoch": 0.13418569360380947, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 15386 + }, + { + "epoch": 0.13419441488897804, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 15387 + }, + { + "epoch": 0.13420313617414661, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 15388 + }, + { + "epoch": 0.13421185745931521, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 15389 + }, + { + "epoch": 0.1342205787444838, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 15390 + }, + { + "epoch": 0.13422930002965236, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 15391 + }, + { + "epoch": 0.13423802131482096, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 15392 + }, + { + "epoch": 0.13424674259998953, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 15393 + }, + { + "epoch": 0.1342554638851581, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 15394 + }, + { + "epoch": 0.1342641851703267, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 15395 + }, + { + "epoch": 0.13427290645549528, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 15396 + }, + { + "epoch": 0.13428162774066388, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 15397 + }, + { + "epoch": 0.13429034902583245, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 15398 + }, + { + "epoch": 0.13429907031100102, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 15399 + }, + { + "epoch": 0.13430779159616962, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 15400 + }, + { + "epoch": 0.1343165128813382, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 15401 + }, + { + "epoch": 0.13432523416650677, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 15402 + }, + { + "epoch": 0.13433395545167537, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 15403 + }, + { + "epoch": 0.13434267673684394, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 15404 + }, + { + "epoch": 0.1343513980220125, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 15405 + }, + { + "epoch": 0.13436011930718111, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 15406 + }, + { + "epoch": 0.1343688405923497, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 15407 + }, + { + "epoch": 0.13437756187751826, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 15408 + }, + { + "epoch": 0.13438628316268686, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 15409 + }, + { + "epoch": 0.13439500444785543, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 15410 + }, + { + "epoch": 0.13440372573302403, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 15411 + }, + { + "epoch": 0.1344124470181926, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 15412 + }, + { + "epoch": 0.13442116830336118, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 15413 + }, + { + "epoch": 0.13442988958852978, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 15414 + }, + { + "epoch": 0.13443861087369835, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 15415 + }, + { + "epoch": 0.13444733215886692, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 15416 + }, + { + "epoch": 0.13445605344403552, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 15417 + }, + { + "epoch": 0.1344647747292041, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 15418 + }, + { + "epoch": 0.13447349601437267, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 15419 + }, + { + "epoch": 0.13448221729954127, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 15420 + }, + { + "epoch": 0.13449093858470984, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 15421 + }, + { + "epoch": 0.1344996598698784, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 15422 + }, + { + "epoch": 0.134508381155047, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 15423 + }, + { + "epoch": 0.13451710244021559, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 15424 + }, + { + "epoch": 0.13452582372538419, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 15425 + }, + { + "epoch": 0.13453454501055276, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 15426 + }, + { + "epoch": 0.13454326629572133, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 15427 + }, + { + "epoch": 0.13455198758088993, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 15428 + }, + { + "epoch": 0.1345607088660585, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 15429 + }, + { + "epoch": 0.13456943015122708, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 15430 + }, + { + "epoch": 0.13457815143639568, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 15431 + }, + { + "epoch": 0.13458687272156425, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 15432 + }, + { + "epoch": 0.13459559400673282, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 15433 + }, + { + "epoch": 0.13460431529190142, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 15434 + }, + { + "epoch": 0.13461303657707, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 15435 + }, + { + "epoch": 0.13462175786223857, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 15436 + }, + { + "epoch": 0.13463047914740717, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 15437 + }, + { + "epoch": 0.13463920043257574, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 15438 + }, + { + "epoch": 0.13464792171774434, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 15439 + }, + { + "epoch": 0.1346566430029129, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 15440 + }, + { + "epoch": 0.13466536428808148, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 15441 + }, + { + "epoch": 0.13467408557325009, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 15442 + }, + { + "epoch": 0.13468280685841866, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 15443 + }, + { + "epoch": 0.13469152814358723, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 15444 + }, + { + "epoch": 0.13470024942875583, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 15445 + }, + { + "epoch": 0.1347089707139244, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 15446 + }, + { + "epoch": 0.13471769199909298, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 15447 + }, + { + "epoch": 0.13472641328426158, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 15448 + }, + { + "epoch": 0.13473513456943015, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 15449 + }, + { + "epoch": 0.13474385585459872, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 15450 + }, + { + "epoch": 0.13475257713976732, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 15451 + }, + { + "epoch": 0.1347612984249359, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 15452 + }, + { + "epoch": 0.1347700197101045, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 15453 + }, + { + "epoch": 0.13477874099527307, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 15454 + }, + { + "epoch": 0.13478746228044164, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 15455 + }, + { + "epoch": 0.13479618356561024, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 15456 + }, + { + "epoch": 0.1348049048507788, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 15457 + }, + { + "epoch": 0.13481362613594738, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 15458 + }, + { + "epoch": 0.13482234742111598, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 15459 + }, + { + "epoch": 0.13483106870628456, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 15460 + }, + { + "epoch": 0.13483978999145313, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 15461 + }, + { + "epoch": 0.13484851127662173, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 15462 + }, + { + "epoch": 0.1348572325617903, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 15463 + }, + { + "epoch": 0.13486595384695887, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 15464 + }, + { + "epoch": 0.13487467513212748, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 15465 + }, + { + "epoch": 0.13488339641729605, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 15466 + }, + { + "epoch": 0.13489211770246465, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 15467 + }, + { + "epoch": 0.13490083898763322, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 15468 + }, + { + "epoch": 0.1349095602728018, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 15469 + }, + { + "epoch": 0.1349182815579704, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 15470 + }, + { + "epoch": 0.13492700284313897, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 15471 + }, + { + "epoch": 0.13493572412830754, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 15472 + }, + { + "epoch": 0.13494444541347614, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 15473 + }, + { + "epoch": 0.1349531666986447, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 15474 + }, + { + "epoch": 0.13496188798381328, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 15475 + }, + { + "epoch": 0.13497060926898188, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 15476 + }, + { + "epoch": 0.13497933055415046, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 15477 + }, + { + "epoch": 0.13498805183931903, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 15478 + }, + { + "epoch": 0.13499677312448763, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 15479 + }, + { + "epoch": 0.1350054944096562, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 15480 + }, + { + "epoch": 0.1350142156948248, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 15481 + }, + { + "epoch": 0.13502293697999337, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 15482 + }, + { + "epoch": 0.13503165826516195, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 15483 + }, + { + "epoch": 0.13504037955033055, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 15484 + }, + { + "epoch": 0.13504910083549912, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 15485 + }, + { + "epoch": 0.1350578221206677, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 15486 + }, + { + "epoch": 0.1350665434058363, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 15487 + }, + { + "epoch": 0.13507526469100486, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 15488 + }, + { + "epoch": 0.13508398597617344, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 15489 + }, + { + "epoch": 0.13509270726134204, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 15490 + }, + { + "epoch": 0.1351014285465106, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 15491 + }, + { + "epoch": 0.13511014983167918, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 15492 + }, + { + "epoch": 0.13511887111684778, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 15493 + }, + { + "epoch": 0.13512759240201636, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 15494 + }, + { + "epoch": 0.13513631368718496, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 15495 + }, + { + "epoch": 0.13514503497235353, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 15496 + }, + { + "epoch": 0.1351537562575221, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 15497 + }, + { + "epoch": 0.1351624775426907, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 15498 + }, + { + "epoch": 0.13517119882785927, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 15499 + }, + { + "epoch": 0.13517992011302785, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 15500 + }, + { + "epoch": 0.13518864139819645, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 15501 + }, + { + "epoch": 0.13519736268336502, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 15502 + }, + { + "epoch": 0.1352060839685336, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 15503 + }, + { + "epoch": 0.1352148052537022, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 15504 + }, + { + "epoch": 0.13522352653887076, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 15505 + }, + { + "epoch": 0.13523224782403934, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 15506 + }, + { + "epoch": 0.13524096910920794, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 15507 + }, + { + "epoch": 0.1352496903943765, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 15508 + }, + { + "epoch": 0.1352584116795451, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 15509 + }, + { + "epoch": 0.13526713296471368, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 15510 + }, + { + "epoch": 0.13527585424988225, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 15511 + }, + { + "epoch": 0.13528457553505085, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 15512 + }, + { + "epoch": 0.13529329682021943, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 15513 + }, + { + "epoch": 0.135302018105388, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 15514 + }, + { + "epoch": 0.1353107393905566, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 15515 + }, + { + "epoch": 0.13531946067572517, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 15516 + }, + { + "epoch": 0.13532818196089375, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 15517 + }, + { + "epoch": 0.13533690324606235, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 15518 + }, + { + "epoch": 0.13534562453123092, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 15519 + }, + { + "epoch": 0.13535434581639952, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 15520 + }, + { + "epoch": 0.1353630671015681, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 15521 + }, + { + "epoch": 0.13537178838673666, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 15522 + }, + { + "epoch": 0.13538050967190526, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 15523 + }, + { + "epoch": 0.13538923095707384, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 15524 + }, + { + "epoch": 0.1353979522422424, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 15525 + }, + { + "epoch": 0.135406673527411, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 15526 + }, + { + "epoch": 0.13541539481257958, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 15527 + }, + { + "epoch": 0.13542411609774815, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 15528 + }, + { + "epoch": 0.13543283738291675, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 15529 + }, + { + "epoch": 0.13544155866808533, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 15530 + }, + { + "epoch": 0.1354502799532539, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 15531 + }, + { + "epoch": 0.1354590012384225, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 15532 + }, + { + "epoch": 0.13546772252359107, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 15533 + }, + { + "epoch": 0.13547644380875967, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 15534 + }, + { + "epoch": 0.13548516509392824, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 15535 + }, + { + "epoch": 0.13549388637909682, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 15536 + }, + { + "epoch": 0.13550260766426542, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 15537 + }, + { + "epoch": 0.135511328949434, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 15538 + }, + { + "epoch": 0.13552005023460256, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 15539 + }, + { + "epoch": 0.13552877151977116, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 15540 + }, + { + "epoch": 0.13553749280493974, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 15541 + }, + { + "epoch": 0.1355462140901083, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 15542 + }, + { + "epoch": 0.1355549353752769, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 15543 + }, + { + "epoch": 0.13556365666044548, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 15544 + }, + { + "epoch": 0.13557237794561405, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 15545 + }, + { + "epoch": 0.13558109923078265, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 15546 + }, + { + "epoch": 0.13558982051595123, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 15547 + }, + { + "epoch": 0.13559854180111983, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 15548 + }, + { + "epoch": 0.1356072630862884, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 15549 + }, + { + "epoch": 0.13561598437145697, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 15550 + }, + { + "epoch": 0.13562470565662557, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 15551 + }, + { + "epoch": 0.13563342694179414, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 15552 + }, + { + "epoch": 0.13564214822696272, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 15553 + }, + { + "epoch": 0.13565086951213132, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 15554 + }, + { + "epoch": 0.1356595907972999, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 15555 + }, + { + "epoch": 0.13566831208246846, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 15556 + }, + { + "epoch": 0.13567703336763706, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 15557 + }, + { + "epoch": 0.13568575465280563, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 15558 + }, + { + "epoch": 0.1356944759379742, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 15559 + }, + { + "epoch": 0.1357031972231428, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 15560 + }, + { + "epoch": 0.13571191850831138, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 15561 + }, + { + "epoch": 0.13572063979347998, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 15562 + }, + { + "epoch": 0.13572936107864855, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 15563 + }, + { + "epoch": 0.13573808236381713, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 15564 + }, + { + "epoch": 0.13574680364898573, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 15565 + }, + { + "epoch": 0.1357555249341543, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 15566 + }, + { + "epoch": 0.13576424621932287, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 15567 + }, + { + "epoch": 0.13577296750449147, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 15568 + }, + { + "epoch": 0.13578168878966004, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 15569 + }, + { + "epoch": 0.13579041007482862, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 15570 + }, + { + "epoch": 0.13579913135999722, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 15571 + }, + { + "epoch": 0.1358078526451658, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 15572 + }, + { + "epoch": 0.13581657393033436, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 15573 + }, + { + "epoch": 0.13582529521550296, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 15574 + }, + { + "epoch": 0.13583401650067153, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 15575 + }, + { + "epoch": 0.13584273778584013, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 15576 + }, + { + "epoch": 0.1358514590710087, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 15577 + }, + { + "epoch": 0.13586018035617728, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 15578 + }, + { + "epoch": 0.13586890164134588, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 15579 + }, + { + "epoch": 0.13587762292651445, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 15580 + }, + { + "epoch": 0.13588634421168302, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 15581 + }, + { + "epoch": 0.13589506549685162, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 15582 + }, + { + "epoch": 0.1359037867820202, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 15583 + }, + { + "epoch": 0.13591250806718877, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 15584 + }, + { + "epoch": 0.13592122935235737, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 15585 + }, + { + "epoch": 0.13592995063752594, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 15586 + }, + { + "epoch": 0.13593867192269452, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 15587 + }, + { + "epoch": 0.13594739320786312, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 15588 + }, + { + "epoch": 0.1359561144930317, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 15589 + }, + { + "epoch": 0.1359648357782003, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 15590 + }, + { + "epoch": 0.13597355706336886, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 15591 + }, + { + "epoch": 0.13598227834853743, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 15592 + }, + { + "epoch": 0.13599099963370603, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 15593 + }, + { + "epoch": 0.1359997209188746, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 15594 + }, + { + "epoch": 0.13600844220404318, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 15595 + }, + { + "epoch": 0.13601716348921178, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 15596 + }, + { + "epoch": 0.13602588477438035, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 15597 + }, + { + "epoch": 0.13603460605954892, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 15598 + }, + { + "epoch": 0.13604332734471752, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 15599 + }, + { + "epoch": 0.1360520486298861, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 15600 + }, + { + "epoch": 0.13606076991505467, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 15601 + }, + { + "epoch": 0.13606949120022327, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 15602 + }, + { + "epoch": 0.13607821248539184, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 15603 + }, + { + "epoch": 0.13608693377056044, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 15604 + }, + { + "epoch": 0.13609565505572901, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 15605 + }, + { + "epoch": 0.1361043763408976, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 15606 + }, + { + "epoch": 0.1361130976260662, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 15607 + }, + { + "epoch": 0.13612181891123476, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 15608 + }, + { + "epoch": 0.13613054019640333, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 15609 + }, + { + "epoch": 0.13613926148157193, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 15610 + }, + { + "epoch": 0.1361479827667405, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 15611 + }, + { + "epoch": 0.13615670405190908, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 15612 + }, + { + "epoch": 0.13616542533707768, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 15613 + }, + { + "epoch": 0.13617414662224625, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 15614 + }, + { + "epoch": 0.13618286790741482, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 15615 + }, + { + "epoch": 0.13619158919258342, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 15616 + }, + { + "epoch": 0.136200310477752, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 15617 + }, + { + "epoch": 0.1362090317629206, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 15618 + }, + { + "epoch": 0.13621775304808917, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 15619 + }, + { + "epoch": 0.13622647433325774, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 15620 + }, + { + "epoch": 0.13623519561842634, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 15621 + }, + { + "epoch": 0.1362439169035949, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 15622 + }, + { + "epoch": 0.1362526381887635, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 15623 + }, + { + "epoch": 0.1362613594739321, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 15624 + }, + { + "epoch": 0.13627008075910066, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 15625 + }, + { + "epoch": 0.13627880204426923, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 15626 + }, + { + "epoch": 0.13628752332943783, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 15627 + }, + { + "epoch": 0.1362962446146064, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 15628 + }, + { + "epoch": 0.136304965899775, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 15629 + }, + { + "epoch": 0.13631368718494358, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 15630 + }, + { + "epoch": 0.13632240847011215, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 15631 + }, + { + "epoch": 0.13633112975528075, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 15632 + }, + { + "epoch": 0.13633985104044932, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 15633 + }, + { + "epoch": 0.1363485723256179, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 15634 + }, + { + "epoch": 0.1363572936107865, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 15635 + }, + { + "epoch": 0.13636601489595507, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 15636 + }, + { + "epoch": 0.13637473618112364, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 15637 + }, + { + "epoch": 0.13638345746629224, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 15638 + }, + { + "epoch": 0.1363921787514608, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 15639 + }, + { + "epoch": 0.13640090003662939, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 15640 + }, + { + "epoch": 0.13640962132179799, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 15641 + }, + { + "epoch": 0.13641834260696656, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 15642 + }, + { + "epoch": 0.13642706389213516, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 15643 + }, + { + "epoch": 0.13643578517730373, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 15644 + }, + { + "epoch": 0.1364445064624723, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 15645 + }, + { + "epoch": 0.1364532277476409, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 15646 + }, + { + "epoch": 0.13646194903280948, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 15647 + }, + { + "epoch": 0.13647067031797805, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 15648 + }, + { + "epoch": 0.13647939160314665, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 15649 + }, + { + "epoch": 0.13648811288831522, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 15650 + }, + { + "epoch": 0.1364968341734838, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 15651 + }, + { + "epoch": 0.1365055554586524, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 15652 + }, + { + "epoch": 0.13651427674382097, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 15653 + }, + { + "epoch": 0.13652299802898954, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 15654 + }, + { + "epoch": 0.13653171931415814, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 15655 + }, + { + "epoch": 0.1365404405993267, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 15656 + }, + { + "epoch": 0.1365491618844953, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 15657 + }, + { + "epoch": 0.13655788316966389, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 15658 + }, + { + "epoch": 0.13656660445483246, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 15659 + }, + { + "epoch": 0.13657532574000106, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 15660 + }, + { + "epoch": 0.13658404702516963, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 15661 + }, + { + "epoch": 0.1365927683103382, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 15662 + }, + { + "epoch": 0.1366014895955068, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 15663 + }, + { + "epoch": 0.13661021088067538, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 15664 + }, + { + "epoch": 0.13661893216584395, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 15665 + }, + { + "epoch": 0.13662765345101255, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 15666 + }, + { + "epoch": 0.13663637473618112, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 15667 + }, + { + "epoch": 0.1366450960213497, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 15668 + }, + { + "epoch": 0.1366538173065183, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 15669 + }, + { + "epoch": 0.13666253859168687, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 15670 + }, + { + "epoch": 0.13667125987685547, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 15671 + }, + { + "epoch": 0.13667998116202404, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 15672 + }, + { + "epoch": 0.1366887024471926, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 15673 + }, + { + "epoch": 0.1366974237323612, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 15674 + }, + { + "epoch": 0.13670614501752978, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 15675 + }, + { + "epoch": 0.13671486630269836, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 15676 + }, + { + "epoch": 0.13672358758786696, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 15677 + }, + { + "epoch": 0.13673230887303553, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 15678 + }, + { + "epoch": 0.1367410301582041, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 15679 + }, + { + "epoch": 0.1367497514433727, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 15680 + }, + { + "epoch": 0.13675847272854127, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 15681 + }, + { + "epoch": 0.13676719401370985, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 15682 + }, + { + "epoch": 0.13677591529887845, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 15683 + }, + { + "epoch": 0.13678463658404702, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 15684 + }, + { + "epoch": 0.13679335786921562, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 15685 + }, + { + "epoch": 0.1368020791543842, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 15686 + }, + { + "epoch": 0.13681080043955277, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 15687 + }, + { + "epoch": 0.13681952172472137, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 15688 + }, + { + "epoch": 0.13682824300988994, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 15689 + }, + { + "epoch": 0.1368369642950585, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 15690 + }, + { + "epoch": 0.1368456855802271, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 15691 + }, + { + "epoch": 0.13685440686539568, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 15692 + }, + { + "epoch": 0.13686312815056426, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 15693 + }, + { + "epoch": 0.13687184943573286, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 15694 + }, + { + "epoch": 0.13688057072090143, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 15695 + }, + { + "epoch": 0.13688929200607, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 15696 + }, + { + "epoch": 0.1368980132912386, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 15697 + }, + { + "epoch": 0.13690673457640717, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 15698 + }, + { + "epoch": 0.13691545586157577, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 15699 + }, + { + "epoch": 0.13692417714674435, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 15700 + }, + { + "epoch": 0.13693289843191292, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 15701 + }, + { + "epoch": 0.13694161971708152, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 15702 + }, + { + "epoch": 0.1369503410022501, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 15703 + }, + { + "epoch": 0.13695906228741866, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 15704 + }, + { + "epoch": 0.13696778357258726, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 15705 + }, + { + "epoch": 0.13697650485775584, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 15706 + }, + { + "epoch": 0.1369852261429244, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 15707 + }, + { + "epoch": 0.136993947428093, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 15708 + }, + { + "epoch": 0.13700266871326158, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 15709 + }, + { + "epoch": 0.13701138999843016, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 15710 + }, + { + "epoch": 0.13702011128359876, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 15711 + }, + { + "epoch": 0.13702883256876733, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 15712 + }, + { + "epoch": 0.13703755385393593, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 15713 + }, + { + "epoch": 0.1370462751391045, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 15714 + }, + { + "epoch": 0.13705499642427307, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 15715 + }, + { + "epoch": 0.13706371770944167, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 15716 + }, + { + "epoch": 0.13707243899461025, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 15717 + }, + { + "epoch": 0.13708116027977882, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 15718 + }, + { + "epoch": 0.13708988156494742, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 15719 + }, + { + "epoch": 0.137098602850116, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 15720 + }, + { + "epoch": 0.13710732413528456, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 15721 + }, + { + "epoch": 0.13711604542045316, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 15722 + }, + { + "epoch": 0.13712476670562174, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 15723 + }, + { + "epoch": 0.1371334879907903, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 15724 + }, + { + "epoch": 0.1371422092759589, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 15725 + }, + { + "epoch": 0.13715093056112748, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 15726 + }, + { + "epoch": 0.13715965184629608, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 15727 + }, + { + "epoch": 0.13716837313146465, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 15728 + }, + { + "epoch": 0.13717709441663323, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 15729 + }, + { + "epoch": 0.13718581570180183, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 15730 + }, + { + "epoch": 0.1371945369869704, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 15731 + }, + { + "epoch": 0.13720325827213897, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 15732 + }, + { + "epoch": 0.13721197955730757, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 15733 + }, + { + "epoch": 0.13722070084247615, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 15734 + }, + { + "epoch": 0.13722942212764472, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 15735 + }, + { + "epoch": 0.13723814341281332, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 15736 + }, + { + "epoch": 0.1372468646979819, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 15737 + }, + { + "epoch": 0.13725558598315046, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 15738 + }, + { + "epoch": 0.13726430726831906, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 15739 + }, + { + "epoch": 0.13727302855348764, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 15740 + }, + { + "epoch": 0.13728174983865624, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 15741 + }, + { + "epoch": 0.1372904711238248, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 15742 + }, + { + "epoch": 0.13729919240899338, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 15743 + }, + { + "epoch": 0.13730791369416198, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 15744 + }, + { + "epoch": 0.13731663497933055, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 15745 + }, + { + "epoch": 0.13732535626449913, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 15746 + }, + { + "epoch": 0.13733407754966773, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 15747 + }, + { + "epoch": 0.1373427988348363, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 15748 + }, + { + "epoch": 0.13735152012000487, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 15749 + }, + { + "epoch": 0.13736024140517347, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 15750 + }, + { + "epoch": 0.13736896269034204, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 15751 + }, + { + "epoch": 0.13737768397551064, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 15752 + }, + { + "epoch": 0.13738640526067922, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 15753 + }, + { + "epoch": 0.1373951265458478, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 15754 + }, + { + "epoch": 0.1374038478310164, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 15755 + }, + { + "epoch": 0.13741256911618496, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 15756 + }, + { + "epoch": 0.13742129040135354, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 15757 + }, + { + "epoch": 0.13743001168652214, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 15758 + }, + { + "epoch": 0.1374387329716907, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 15759 + }, + { + "epoch": 0.13744745425685928, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 15760 + }, + { + "epoch": 0.13745617554202788, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 15761 + }, + { + "epoch": 0.13746489682719645, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 15762 + }, + { + "epoch": 0.13747361811236503, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 15763 + }, + { + "epoch": 0.13748233939753363, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 15764 + }, + { + "epoch": 0.1374910606827022, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 15765 + }, + { + "epoch": 0.1374997819678708, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 15766 + }, + { + "epoch": 0.13750850325303937, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 15767 + }, + { + "epoch": 0.13751722453820794, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 15768 + }, + { + "epoch": 0.13752594582337654, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 15769 + }, + { + "epoch": 0.13753466710854512, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 15770 + }, + { + "epoch": 0.1375433883937137, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 15771 + }, + { + "epoch": 0.1375521096788823, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 15772 + }, + { + "epoch": 0.13756083096405086, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 15773 + }, + { + "epoch": 0.13756955224921943, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 15774 + }, + { + "epoch": 0.13757827353438803, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 15775 + }, + { + "epoch": 0.1375869948195566, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 15776 + }, + { + "epoch": 0.13759571610472518, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 15777 + }, + { + "epoch": 0.13760443738989378, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 15778 + }, + { + "epoch": 0.13761315867506235, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 15779 + }, + { + "epoch": 0.13762187996023095, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 15780 + }, + { + "epoch": 0.13763060124539953, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 15781 + }, + { + "epoch": 0.1376393225305681, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 15782 + }, + { + "epoch": 0.1376480438157367, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 15783 + }, + { + "epoch": 0.13765676510090527, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 15784 + }, + { + "epoch": 0.13766548638607384, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 15785 + }, + { + "epoch": 0.13767420767124244, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 15786 + }, + { + "epoch": 0.13768292895641102, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 15787 + }, + { + "epoch": 0.1376916502415796, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 15788 + }, + { + "epoch": 0.1377003715267482, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 15789 + }, + { + "epoch": 0.13770909281191676, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 15790 + }, + { + "epoch": 0.13771781409708533, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 15791 + }, + { + "epoch": 0.13772653538225393, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 15792 + }, + { + "epoch": 0.1377352566674225, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 15793 + }, + { + "epoch": 0.1377439779525911, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 15794 + }, + { + "epoch": 0.13775269923775968, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 15795 + }, + { + "epoch": 0.13776142052292825, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 15796 + }, + { + "epoch": 0.13777014180809685, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 15797 + }, + { + "epoch": 0.13777886309326542, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 15798 + }, + { + "epoch": 0.137787584378434, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 15799 + }, + { + "epoch": 0.1377963056636026, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 15800 + }, + { + "epoch": 0.13780502694877117, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 15801 + }, + { + "epoch": 0.13781374823393974, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 15802 + }, + { + "epoch": 0.13782246951910834, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 15803 + }, + { + "epoch": 0.13783119080427692, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 15804 + }, + { + "epoch": 0.1378399120894455, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 15805 + }, + { + "epoch": 0.1378486333746141, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 15806 + }, + { + "epoch": 0.13785735465978266, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 15807 + }, + { + "epoch": 0.13786607594495126, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 15808 + }, + { + "epoch": 0.13787479723011983, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 15809 + }, + { + "epoch": 0.1378835185152884, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 15810 + }, + { + "epoch": 0.137892239800457, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 15811 + }, + { + "epoch": 0.13790096108562558, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 15812 + }, + { + "epoch": 0.13790968237079415, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 15813 + }, + { + "epoch": 0.13791840365596275, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 15814 + }, + { + "epoch": 0.13792712494113132, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 15815 + }, + { + "epoch": 0.1379358462262999, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 15816 + }, + { + "epoch": 0.1379445675114685, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 15817 + }, + { + "epoch": 0.13795328879663707, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 15818 + }, + { + "epoch": 0.13796201008180564, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 15819 + }, + { + "epoch": 0.13797073136697424, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 15820 + }, + { + "epoch": 0.13797945265214281, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 15821 + }, + { + "epoch": 0.13798817393731141, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 15822 + }, + { + "epoch": 0.13799689522248, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 15823 + }, + { + "epoch": 0.13800561650764856, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 15824 + }, + { + "epoch": 0.13801433779281716, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 15825 + }, + { + "epoch": 0.13802305907798573, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 15826 + }, + { + "epoch": 0.1380317803631543, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 15827 + }, + { + "epoch": 0.1380405016483229, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 15828 + }, + { + "epoch": 0.13804922293349148, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 15829 + }, + { + "epoch": 0.13805794421866005, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 15830 + }, + { + "epoch": 0.13806666550382865, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 15831 + }, + { + "epoch": 0.13807538678899722, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 15832 + }, + { + "epoch": 0.1380841080741658, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 15833 + }, + { + "epoch": 0.1380928293593344, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 15834 + }, + { + "epoch": 0.13810155064450297, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 15835 + }, + { + "epoch": 0.13811027192967157, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 15836 + }, + { + "epoch": 0.13811899321484014, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 15837 + }, + { + "epoch": 0.1381277145000087, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 15838 + }, + { + "epoch": 0.13813643578517731, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 15839 + }, + { + "epoch": 0.1381451570703459, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 15840 + }, + { + "epoch": 0.13815387835551446, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 15841 + }, + { + "epoch": 0.13816259964068306, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 15842 + }, + { + "epoch": 0.13817132092585163, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 15843 + }, + { + "epoch": 0.1381800422110202, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 15844 + }, + { + "epoch": 0.1381887634961888, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 15845 + }, + { + "epoch": 0.13819748478135738, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 15846 + }, + { + "epoch": 0.13820620606652595, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 15847 + }, + { + "epoch": 0.13821492735169455, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 15848 + }, + { + "epoch": 0.13822364863686312, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 15849 + }, + { + "epoch": 0.13823236992203172, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 15850 + }, + { + "epoch": 0.1382410912072003, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 15851 + }, + { + "epoch": 0.13824981249236887, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 15852 + }, + { + "epoch": 0.13825853377753747, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 15853 + }, + { + "epoch": 0.13826725506270604, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 15854 + }, + { + "epoch": 0.1382759763478746, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 15855 + }, + { + "epoch": 0.1382846976330432, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 15856 + }, + { + "epoch": 0.13829341891821179, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 15857 + }, + { + "epoch": 0.13830214020338036, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 15858 + }, + { + "epoch": 0.13831086148854896, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 15859 + }, + { + "epoch": 0.13831958277371753, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 15860 + }, + { + "epoch": 0.13832830405888613, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 15861 + }, + { + "epoch": 0.1383370253440547, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 15862 + }, + { + "epoch": 0.13834574662922328, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 15863 + }, + { + "epoch": 0.13835446791439188, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 15864 + }, + { + "epoch": 0.13836318919956045, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 15865 + }, + { + "epoch": 0.13837191048472902, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 15866 + }, + { + "epoch": 0.13838063176989762, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 15867 + }, + { + "epoch": 0.1383893530550662, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 15868 + }, + { + "epoch": 0.13839807434023477, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 15869 + }, + { + "epoch": 0.13840679562540337, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 15870 + }, + { + "epoch": 0.13841551691057194, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 15871 + }, + { + "epoch": 0.1384242381957405, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 15872 + }, + { + "epoch": 0.1384329594809091, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 15873 + }, + { + "epoch": 0.13844168076607768, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 15874 + }, + { + "epoch": 0.13845040205124629, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 15875 + }, + { + "epoch": 0.13845912333641486, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 15876 + }, + { + "epoch": 0.13846784462158343, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 15877 + }, + { + "epoch": 0.13847656590675203, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 15878 + }, + { + "epoch": 0.1384852871919206, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 15879 + }, + { + "epoch": 0.13849400847708918, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 15880 + }, + { + "epoch": 0.13850272976225778, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 15881 + }, + { + "epoch": 0.13851145104742635, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 15882 + }, + { + "epoch": 0.13852017233259492, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 15883 + }, + { + "epoch": 0.13852889361776352, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 15884 + }, + { + "epoch": 0.1385376149029321, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 15885 + }, + { + "epoch": 0.13854633618810067, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 15886 + }, + { + "epoch": 0.13855505747326927, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 15887 + }, + { + "epoch": 0.13856377875843784, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 15888 + }, + { + "epoch": 0.13857250004360644, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 15889 + }, + { + "epoch": 0.138581221328775, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 15890 + }, + { + "epoch": 0.13858994261394358, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 15891 + }, + { + "epoch": 0.13859866389911218, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 15892 + }, + { + "epoch": 0.13860738518428076, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 15893 + }, + { + "epoch": 0.13861610646944933, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 15894 + }, + { + "epoch": 0.13862482775461793, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 15895 + }, + { + "epoch": 0.1386335490397865, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 15896 + }, + { + "epoch": 0.13864227032495507, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 15897 + }, + { + "epoch": 0.13865099161012368, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 15898 + }, + { + "epoch": 0.13865971289529225, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 15899 + }, + { + "epoch": 0.13866843418046082, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 15900 + }, + { + "epoch": 0.13867715546562942, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 15901 + }, + { + "epoch": 0.138685876750798, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 15902 + }, + { + "epoch": 0.1386945980359666, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 15903 + }, + { + "epoch": 0.13870331932113517, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 15904 + }, + { + "epoch": 0.13871204060630374, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 15905 + }, + { + "epoch": 0.13872076189147234, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 15906 + }, + { + "epoch": 0.1387294831766409, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 15907 + }, + { + "epoch": 0.13873820446180948, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 15908 + }, + { + "epoch": 0.13874692574697808, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 15909 + }, + { + "epoch": 0.13875564703214666, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 15910 + }, + { + "epoch": 0.13876436831731523, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 15911 + }, + { + "epoch": 0.13877308960248383, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 15912 + }, + { + "epoch": 0.1387818108876524, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 15913 + }, + { + "epoch": 0.13879053217282097, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 15914 + }, + { + "epoch": 0.13879925345798957, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 15915 + }, + { + "epoch": 0.13880797474315815, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 15916 + }, + { + "epoch": 0.13881669602832675, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 15917 + }, + { + "epoch": 0.13882541731349532, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 15918 + }, + { + "epoch": 0.1388341385986639, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 15919 + }, + { + "epoch": 0.1388428598838325, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 15920 + }, + { + "epoch": 0.13885158116900106, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 15921 + }, + { + "epoch": 0.13886030245416964, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 15922 + }, + { + "epoch": 0.13886902373933824, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 15923 + }, + { + "epoch": 0.1388777450245068, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 15924 + }, + { + "epoch": 0.13888646630967538, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 15925 + }, + { + "epoch": 0.13889518759484398, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 15926 + }, + { + "epoch": 0.13890390888001256, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 15927 + }, + { + "epoch": 0.13891263016518113, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 15928 + }, + { + "epoch": 0.13892135145034973, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 15929 + }, + { + "epoch": 0.1389300727355183, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 15930 + }, + { + "epoch": 0.1389387940206869, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 15931 + }, + { + "epoch": 0.13894751530585547, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 15932 + }, + { + "epoch": 0.13895623659102405, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 15933 + }, + { + "epoch": 0.13896495787619265, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 15934 + }, + { + "epoch": 0.13897367916136122, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 15935 + }, + { + "epoch": 0.1389824004465298, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 15936 + }, + { + "epoch": 0.1389911217316984, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 15937 + }, + { + "epoch": 0.13899984301686696, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 15938 + }, + { + "epoch": 0.13900856430203554, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 15939 + }, + { + "epoch": 0.13901728558720414, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 15940 + }, + { + "epoch": 0.1390260068723727, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 15941 + }, + { + "epoch": 0.13903472815754128, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 15942 + }, + { + "epoch": 0.13904344944270988, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 15943 + }, + { + "epoch": 0.13905217072787845, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 15944 + }, + { + "epoch": 0.13906089201304705, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 15945 + }, + { + "epoch": 0.13906961329821563, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 15946 + }, + { + "epoch": 0.1390783345833842, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 15947 + }, + { + "epoch": 0.1390870558685528, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 15948 + }, + { + "epoch": 0.13909577715372137, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 15949 + }, + { + "epoch": 0.13910449843888995, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 15950 + }, + { + "epoch": 0.13911321972405855, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 15951 + }, + { + "epoch": 0.13912194100922712, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 15952 + }, + { + "epoch": 0.1391306622943957, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 15953 + }, + { + "epoch": 0.1391393835795643, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 15954 + }, + { + "epoch": 0.13914810486473286, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 15955 + }, + { + "epoch": 0.13915682614990144, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 15956 + }, + { + "epoch": 0.13916554743507004, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 15957 + }, + { + "epoch": 0.1391742687202386, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 15958 + }, + { + "epoch": 0.1391829900054072, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 15959 + }, + { + "epoch": 0.13919171129057578, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 15960 + }, + { + "epoch": 0.13920043257574435, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 15961 + }, + { + "epoch": 0.13920915386091295, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 15962 + }, + { + "epoch": 0.13921787514608153, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 15963 + }, + { + "epoch": 0.1392265964312501, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 15964 + }, + { + "epoch": 0.1392353177164187, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 15965 + }, + { + "epoch": 0.13924403900158727, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 15966 + }, + { + "epoch": 0.13925276028675584, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 15967 + }, + { + "epoch": 0.13926148157192444, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 15968 + }, + { + "epoch": 0.13927020285709302, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 15969 + }, + { + "epoch": 0.13927892414226162, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 15970 + }, + { + "epoch": 0.1392876454274302, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 15971 + }, + { + "epoch": 0.13929636671259876, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 15972 + }, + { + "epoch": 0.13930508799776736, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 15973 + }, + { + "epoch": 0.13931380928293594, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 15974 + }, + { + "epoch": 0.1393225305681045, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 15975 + }, + { + "epoch": 0.1393312518532731, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 15976 + }, + { + "epoch": 0.13933997313844168, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 15977 + }, + { + "epoch": 0.13934869442361025, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 15978 + }, + { + "epoch": 0.13935741570877885, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 15979 + }, + { + "epoch": 0.13936613699394743, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 15980 + }, + { + "epoch": 0.139374858279116, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 15981 + }, + { + "epoch": 0.1393835795642846, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 15982 + }, + { + "epoch": 0.13939230084945317, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 15983 + }, + { + "epoch": 0.13940102213462177, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 15984 + }, + { + "epoch": 0.13940974341979034, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 15985 + }, + { + "epoch": 0.13941846470495892, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 15986 + }, + { + "epoch": 0.13942718599012752, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 15987 + }, + { + "epoch": 0.1394359072752961, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 15988 + }, + { + "epoch": 0.13944462856046466, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 15989 + }, + { + "epoch": 0.13945334984563326, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 15990 + }, + { + "epoch": 0.13946207113080183, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 15991 + }, + { + "epoch": 0.1394707924159704, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 15992 + }, + { + "epoch": 0.139479513701139, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 15993 + }, + { + "epoch": 0.13948823498630758, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 15994 + }, + { + "epoch": 0.13949695627147615, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 15995 + }, + { + "epoch": 0.13950567755664475, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 15996 + }, + { + "epoch": 0.13951439884181333, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 15997 + }, + { + "epoch": 0.13952312012698193, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 15998 + }, + { + "epoch": 0.1395318414121505, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 15999 + }, + { + "epoch": 0.13954056269731907, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 16000 + }, + { + "epoch": 0.13954928398248767, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 16001 + }, + { + "epoch": 0.13955800526765624, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 16002 + }, + { + "epoch": 0.13956672655282482, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 16003 + }, + { + "epoch": 0.13957544783799342, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 16004 + }, + { + "epoch": 0.139584169123162, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 16005 + }, + { + "epoch": 0.13959289040833056, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 16006 + }, + { + "epoch": 0.13960161169349916, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 16007 + }, + { + "epoch": 0.13961033297866773, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 16008 + }, + { + "epoch": 0.1396190542638363, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 16009 + }, + { + "epoch": 0.1396277755490049, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 16010 + }, + { + "epoch": 0.13963649683417348, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 16011 + }, + { + "epoch": 0.13964521811934208, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 16012 + }, + { + "epoch": 0.13965393940451065, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 16013 + }, + { + "epoch": 0.13966266068967922, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 16014 + }, + { + "epoch": 0.13967138197484782, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 16015 + }, + { + "epoch": 0.1396801032600164, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 16016 + }, + { + "epoch": 0.13968882454518497, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 16017 + }, + { + "epoch": 0.13969754583035357, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 16018 + }, + { + "epoch": 0.13970626711552214, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 16019 + }, + { + "epoch": 0.13971498840069072, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 16020 + }, + { + "epoch": 0.13972370968585932, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 16021 + }, + { + "epoch": 0.1397324309710279, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 16022 + }, + { + "epoch": 0.13974115225619646, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 16023 + }, + { + "epoch": 0.13974987354136506, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 16024 + }, + { + "epoch": 0.13975859482653363, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 16025 + }, + { + "epoch": 0.13976731611170223, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 16026 + }, + { + "epoch": 0.1397760373968708, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 16027 + }, + { + "epoch": 0.13978475868203938, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 16028 + }, + { + "epoch": 0.13979347996720798, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 16029 + }, + { + "epoch": 0.13980220125237655, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 16030 + }, + { + "epoch": 0.13981092253754512, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 16031 + }, + { + "epoch": 0.13981964382271372, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 16032 + }, + { + "epoch": 0.1398283651078823, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 16033 + }, + { + "epoch": 0.13983708639305087, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 16034 + }, + { + "epoch": 0.13984580767821947, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 16035 + }, + { + "epoch": 0.13985452896338804, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 16036 + }, + { + "epoch": 0.13986325024855661, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 16037 + }, + { + "epoch": 0.13987197153372521, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 16038 + }, + { + "epoch": 0.1398806928188938, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 16039 + }, + { + "epoch": 0.1398894141040624, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 16040 + }, + { + "epoch": 0.13989813538923096, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 16041 + }, + { + "epoch": 0.13990685667439953, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 16042 + }, + { + "epoch": 0.13991557795956813, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 16043 + }, + { + "epoch": 0.1399242992447367, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 16044 + }, + { + "epoch": 0.13993302052990528, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 16045 + }, + { + "epoch": 0.13994174181507388, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 16046 + }, + { + "epoch": 0.13995046310024245, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 16047 + }, + { + "epoch": 0.13995918438541102, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 16048 + }, + { + "epoch": 0.13996790567057962, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 16049 + }, + { + "epoch": 0.1399766269557482, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 16050 + }, + { + "epoch": 0.13998534824091677, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 16051 + }, + { + "epoch": 0.13999406952608537, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 16052 + }, + { + "epoch": 0.14000279081125394, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 16053 + }, + { + "epoch": 0.14001151209642254, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 16054 + }, + { + "epoch": 0.1400202333815911, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 16055 + }, + { + "epoch": 0.1400289546667597, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 16056 + }, + { + "epoch": 0.1400376759519283, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 16057 + }, + { + "epoch": 0.14004639723709686, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 16058 + }, + { + "epoch": 0.14005511852226543, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 16059 + }, + { + "epoch": 0.14006383980743403, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 16060 + }, + { + "epoch": 0.1400725610926026, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 16061 + }, + { + "epoch": 0.14008128237777118, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 16062 + }, + { + "epoch": 0.14009000366293978, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 16063 + }, + { + "epoch": 0.14009872494810835, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 16064 + }, + { + "epoch": 0.14010744623327692, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 16065 + }, + { + "epoch": 0.14011616751844552, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 16066 + }, + { + "epoch": 0.1401248888036141, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 16067 + }, + { + "epoch": 0.1401336100887827, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 16068 + }, + { + "epoch": 0.14014233137395127, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 16069 + }, + { + "epoch": 0.14015105265911984, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 16070 + }, + { + "epoch": 0.14015977394428844, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 16071 + }, + { + "epoch": 0.140168495229457, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 16072 + }, + { + "epoch": 0.14017721651462559, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 16073 + }, + { + "epoch": 0.14018593779979419, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 16074 + }, + { + "epoch": 0.14019465908496276, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 16075 + }, + { + "epoch": 0.14020338037013133, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 16076 + }, + { + "epoch": 0.14021210165529993, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 16077 + }, + { + "epoch": 0.1402208229404685, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 16078 + }, + { + "epoch": 0.14022954422563708, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 16079 + }, + { + "epoch": 0.14023826551080568, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 16080 + }, + { + "epoch": 0.14024698679597425, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 16081 + }, + { + "epoch": 0.14025570808114285, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 16082 + }, + { + "epoch": 0.14026442936631142, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 16083 + }, + { + "epoch": 0.14027315065148, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 16084 + }, + { + "epoch": 0.1402818719366486, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 16085 + }, + { + "epoch": 0.14029059322181717, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 16086 + }, + { + "epoch": 0.14029931450698574, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 16087 + }, + { + "epoch": 0.14030803579215434, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 16088 + }, + { + "epoch": 0.1403167570773229, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 16089 + }, + { + "epoch": 0.14032547836249148, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 16090 + }, + { + "epoch": 0.14033419964766009, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 16091 + }, + { + "epoch": 0.14034292093282866, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 16092 + }, + { + "epoch": 0.14035164221799726, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 16093 + }, + { + "epoch": 0.14036036350316583, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 16094 + }, + { + "epoch": 0.1403690847883344, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 16095 + }, + { + "epoch": 0.140377806073503, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 16096 + }, + { + "epoch": 0.14038652735867158, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 16097 + }, + { + "epoch": 0.14039524864384015, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 16098 + }, + { + "epoch": 0.14040396992900875, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 16099 + }, + { + "epoch": 0.14041269121417732, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 16100 + }, + { + "epoch": 0.1404214124993459, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 16101 + }, + { + "epoch": 0.1404301337845145, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 16102 + }, + { + "epoch": 0.14043885506968307, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 16103 + }, + { + "epoch": 0.14044757635485164, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 16104 + }, + { + "epoch": 0.14045629764002024, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 16105 + }, + { + "epoch": 0.1404650189251888, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 16106 + }, + { + "epoch": 0.1404737402103574, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 16107 + }, + { + "epoch": 0.14048246149552598, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 16108 + }, + { + "epoch": 0.14049118278069456, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 16109 + }, + { + "epoch": 0.14049990406586316, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 16110 + }, + { + "epoch": 0.14050862535103173, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 16111 + }, + { + "epoch": 0.1405173466362003, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 16112 + }, + { + "epoch": 0.1405260679213689, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 16113 + }, + { + "epoch": 0.14053478920653747, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 16114 + }, + { + "epoch": 0.14054351049170605, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 16115 + }, + { + "epoch": 0.14055223177687465, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 16116 + }, + { + "epoch": 0.14056095306204322, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 16117 + }, + { + "epoch": 0.1405696743472118, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 16118 + }, + { + "epoch": 0.1405783956323804, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 16119 + }, + { + "epoch": 0.14058711691754897, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 16120 + }, + { + "epoch": 0.14059583820271757, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 16121 + }, + { + "epoch": 0.14060455948788614, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 16122 + }, + { + "epoch": 0.1406132807730547, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 16123 + }, + { + "epoch": 0.1406220020582233, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 16124 + }, + { + "epoch": 0.14063072334339188, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 16125 + }, + { + "epoch": 0.14063944462856046, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 16126 + }, + { + "epoch": 0.14064816591372906, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 16127 + }, + { + "epoch": 0.14065688719889763, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 16128 + }, + { + "epoch": 0.1406656084840662, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 16129 + }, + { + "epoch": 0.1406743297692348, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 16130 + }, + { + "epoch": 0.14068305105440337, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 16131 + }, + { + "epoch": 0.14069177233957195, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 16132 + }, + { + "epoch": 0.14070049362474055, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 16133 + }, + { + "epoch": 0.14070921490990912, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 16134 + }, + { + "epoch": 0.14071793619507772, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 16135 + }, + { + "epoch": 0.1407266574802463, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 16136 + }, + { + "epoch": 0.14073537876541486, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 16137 + }, + { + "epoch": 0.14074410005058346, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 16138 + }, + { + "epoch": 0.14075282133575204, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 16139 + }, + { + "epoch": 0.1407615426209206, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 16140 + }, + { + "epoch": 0.1407702639060892, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 16141 + }, + { + "epoch": 0.14077898519125778, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 16142 + }, + { + "epoch": 0.14078770647642636, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 16143 + }, + { + "epoch": 0.14079642776159496, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 16144 + }, + { + "epoch": 0.14080514904676353, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 16145 + }, + { + "epoch": 0.1408138703319321, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 16146 + }, + { + "epoch": 0.1408225916171007, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 16147 + }, + { + "epoch": 0.14083131290226927, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 16148 + }, + { + "epoch": 0.14084003418743787, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 16149 + }, + { + "epoch": 0.14084875547260645, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 16150 + }, + { + "epoch": 0.14085747675777502, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 16151 + }, + { + "epoch": 0.14086619804294362, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 16152 + }, + { + "epoch": 0.1408749193281122, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 16153 + }, + { + "epoch": 0.14088364061328076, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 16154 + }, + { + "epoch": 0.14089236189844936, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 16155 + }, + { + "epoch": 0.14090108318361794, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 16156 + }, + { + "epoch": 0.1409098044687865, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 16157 + }, + { + "epoch": 0.1409185257539551, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 16158 + }, + { + "epoch": 0.14092724703912368, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 16159 + }, + { + "epoch": 0.14093596832429225, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 16160 + }, + { + "epoch": 0.14094468960946085, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 16161 + }, + { + "epoch": 0.14095341089462943, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 16162 + }, + { + "epoch": 0.14096213217979803, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 16163 + }, + { + "epoch": 0.1409708534649666, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 16164 + }, + { + "epoch": 0.14097957475013517, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 16165 + }, + { + "epoch": 0.14098829603530377, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 16166 + }, + { + "epoch": 0.14099701732047235, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 16167 + }, + { + "epoch": 0.14100573860564092, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 16168 + }, + { + "epoch": 0.14101445989080952, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 16169 + }, + { + "epoch": 0.1410231811759781, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 16170 + }, + { + "epoch": 0.14103190246114666, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 16171 + }, + { + "epoch": 0.14104062374631526, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 16172 + }, + { + "epoch": 0.14104934503148384, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 16173 + }, + { + "epoch": 0.1410580663166524, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 16174 + }, + { + "epoch": 0.141066787601821, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 16175 + }, + { + "epoch": 0.14107550888698958, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 16176 + }, + { + "epoch": 0.14108423017215818, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 16177 + }, + { + "epoch": 0.14109295145732675, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 16178 + }, + { + "epoch": 0.14110167274249533, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 16179 + }, + { + "epoch": 0.14111039402766393, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 16180 + }, + { + "epoch": 0.1411191153128325, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 16181 + }, + { + "epoch": 0.14112783659800107, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 16182 + }, + { + "epoch": 0.14113655788316967, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 16183 + }, + { + "epoch": 0.14114527916833824, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 16184 + }, + { + "epoch": 0.14115400045350682, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 16185 + }, + { + "epoch": 0.14116272173867542, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 16186 + }, + { + "epoch": 0.141171443023844, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 16187 + }, + { + "epoch": 0.14118016430901256, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 16188 + }, + { + "epoch": 0.14118888559418116, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 16189 + }, + { + "epoch": 0.14119760687934974, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 16190 + }, + { + "epoch": 0.14120632816451834, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 16191 + }, + { + "epoch": 0.1412150494496869, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 16192 + }, + { + "epoch": 0.14122377073485548, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 16193 + }, + { + "epoch": 0.14123249202002408, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 16194 + }, + { + "epoch": 0.14124121330519265, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 16195 + }, + { + "epoch": 0.14124993459036123, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 16196 + }, + { + "epoch": 0.14125865587552983, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 16197 + }, + { + "epoch": 0.1412673771606984, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 16198 + }, + { + "epoch": 0.14127609844586697, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 16199 + }, + { + "epoch": 0.14128481973103557, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 16200 + }, + { + "epoch": 0.14129354101620414, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 16201 + }, + { + "epoch": 0.14130226230137274, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 16202 + }, + { + "epoch": 0.14131098358654132, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 16203 + }, + { + "epoch": 0.1413197048717099, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 16204 + }, + { + "epoch": 0.1413284261568785, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 16205 + }, + { + "epoch": 0.14133714744204706, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 16206 + }, + { + "epoch": 0.14134586872721563, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 16207 + }, + { + "epoch": 0.14135459001238423, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 16208 + }, + { + "epoch": 0.1413633112975528, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 16209 + }, + { + "epoch": 0.14137203258272138, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 16210 + }, + { + "epoch": 0.14138075386788998, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 16211 + }, + { + "epoch": 0.14138947515305855, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 16212 + }, + { + "epoch": 0.14139819643822713, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 16213 + }, + { + "epoch": 0.14140691772339573, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 16214 + }, + { + "epoch": 0.1414156390085643, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 16215 + }, + { + "epoch": 0.1414243602937329, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 16216 + }, + { + "epoch": 0.14143308157890147, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 16217 + }, + { + "epoch": 0.14144180286407004, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 16218 + }, + { + "epoch": 0.14145052414923864, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 16219 + }, + { + "epoch": 0.14145924543440722, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 16220 + }, + { + "epoch": 0.1414679667195758, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 16221 + }, + { + "epoch": 0.1414766880047444, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 16222 + }, + { + "epoch": 0.14148540928991296, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 16223 + }, + { + "epoch": 0.14149413057508153, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 16224 + }, + { + "epoch": 0.14150285186025013, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 16225 + }, + { + "epoch": 0.1415115731454187, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 16226 + }, + { + "epoch": 0.14152029443058728, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 16227 + }, + { + "epoch": 0.14152901571575588, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 16228 + }, + { + "epoch": 0.14153773700092445, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 16229 + }, + { + "epoch": 0.14154645828609305, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 16230 + }, + { + "epoch": 0.14155517957126162, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 16231 + }, + { + "epoch": 0.1415639008564302, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 16232 + }, + { + "epoch": 0.1415726221415988, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 16233 + }, + { + "epoch": 0.14158134342676737, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 16234 + }, + { + "epoch": 0.14159006471193594, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 16235 + }, + { + "epoch": 0.14159878599710454, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 16236 + }, + { + "epoch": 0.14160750728227312, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 16237 + }, + { + "epoch": 0.1416162285674417, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 16238 + }, + { + "epoch": 0.1416249498526103, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 16239 + }, + { + "epoch": 0.14163367113777886, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 16240 + }, + { + "epoch": 0.14164239242294743, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 16241 + }, + { + "epoch": 0.14165111370811603, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 16242 + }, + { + "epoch": 0.1416598349932846, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 16243 + }, + { + "epoch": 0.1416685562784532, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 16244 + }, + { + "epoch": 0.14167727756362178, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 16245 + }, + { + "epoch": 0.14168599884879035, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 16246 + }, + { + "epoch": 0.14169472013395895, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 16247 + }, + { + "epoch": 0.14170344141912752, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 16248 + }, + { + "epoch": 0.1417121627042961, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 16249 + }, + { + "epoch": 0.1417208839894647, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 16250 + }, + { + "epoch": 0.14172960527463327, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 16251 + }, + { + "epoch": 0.14173832655980184, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 16252 + }, + { + "epoch": 0.14174704784497044, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 16253 + }, + { + "epoch": 0.14175576913013901, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 16254 + }, + { + "epoch": 0.1417644904153076, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 16255 + }, + { + "epoch": 0.1417732117004762, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 16256 + }, + { + "epoch": 0.14178193298564476, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 16257 + }, + { + "epoch": 0.14179065427081336, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 16258 + }, + { + "epoch": 0.14179937555598193, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 16259 + }, + { + "epoch": 0.1418080968411505, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 16260 + }, + { + "epoch": 0.1418168181263191, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 16261 + }, + { + "epoch": 0.14182553941148768, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 16262 + }, + { + "epoch": 0.14183426069665625, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 16263 + }, + { + "epoch": 0.14184298198182485, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 16264 + }, + { + "epoch": 0.14185170326699342, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 16265 + }, + { + "epoch": 0.141860424552162, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 16266 + }, + { + "epoch": 0.1418691458373306, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 16267 + }, + { + "epoch": 0.14187786712249917, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 16268 + }, + { + "epoch": 0.14188658840766774, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 16269 + }, + { + "epoch": 0.14189530969283634, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 16270 + }, + { + "epoch": 0.1419040309780049, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 16271 + }, + { + "epoch": 0.14191275226317351, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 16272 + }, + { + "epoch": 0.1419214735483421, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 16273 + }, + { + "epoch": 0.14193019483351066, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 16274 + }, + { + "epoch": 0.14193891611867926, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 16275 + }, + { + "epoch": 0.14194763740384783, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 16276 + }, + { + "epoch": 0.1419563586890164, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 16277 + }, + { + "epoch": 0.141965079974185, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 16278 + }, + { + "epoch": 0.14197380125935358, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 16279 + }, + { + "epoch": 0.14198252254452215, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 16280 + }, + { + "epoch": 0.14199124382969075, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 16281 + }, + { + "epoch": 0.14199996511485932, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 16282 + }, + { + "epoch": 0.1420086864000279, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 16283 + }, + { + "epoch": 0.1420174076851965, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 16284 + }, + { + "epoch": 0.14202612897036507, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 16285 + }, + { + "epoch": 0.14203485025553367, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 16286 + }, + { + "epoch": 0.14204357154070224, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 16287 + }, + { + "epoch": 0.1420522928258708, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 16288 + }, + { + "epoch": 0.1420610141110394, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 16289 + }, + { + "epoch": 0.14206973539620799, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 16290 + }, + { + "epoch": 0.14207845668137656, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 16291 + }, + { + "epoch": 0.14208717796654516, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 16292 + }, + { + "epoch": 0.14209589925171373, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 16293 + }, + { + "epoch": 0.1421046205368823, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 16294 + }, + { + "epoch": 0.1421133418220509, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 16295 + }, + { + "epoch": 0.14212206310721948, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 16296 + }, + { + "epoch": 0.14213078439238805, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 16297 + }, + { + "epoch": 0.14213950567755665, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 16298 + }, + { + "epoch": 0.14214822696272522, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 16299 + }, + { + "epoch": 0.14215694824789382, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 16300 + }, + { + "epoch": 0.1421656695330624, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 16301 + }, + { + "epoch": 0.14217439081823097, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 16302 + }, + { + "epoch": 0.14218311210339957, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 16303 + }, + { + "epoch": 0.14219183338856814, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 16304 + }, + { + "epoch": 0.1422005546737367, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 16305 + }, + { + "epoch": 0.1422092759589053, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 16306 + }, + { + "epoch": 0.14221799724407388, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 16307 + }, + { + "epoch": 0.14222671852924246, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 16308 + }, + { + "epoch": 0.14223543981441106, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 16309 + }, + { + "epoch": 0.14224416109957963, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 16310 + }, + { + "epoch": 0.1422528823847482, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 16311 + }, + { + "epoch": 0.1422616036699168, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 16312 + }, + { + "epoch": 0.14227032495508538, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 16313 + }, + { + "epoch": 0.14227904624025398, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 16314 + }, + { + "epoch": 0.14228776752542255, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 16315 + }, + { + "epoch": 0.14229648881059112, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 16316 + }, + { + "epoch": 0.14230521009575972, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 16317 + }, + { + "epoch": 0.1423139313809283, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 16318 + }, + { + "epoch": 0.14232265266609687, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 16319 + }, + { + "epoch": 0.14233137395126547, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 16320 + }, + { + "epoch": 0.14234009523643404, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 16321 + }, + { + "epoch": 0.1423488165216026, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 16322 + }, + { + "epoch": 0.1423575378067712, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 16323 + }, + { + "epoch": 0.14236625909193978, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 16324 + }, + { + "epoch": 0.14237498037710838, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 16325 + }, + { + "epoch": 0.14238370166227696, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 16326 + }, + { + "epoch": 0.14239242294744553, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 16327 + }, + { + "epoch": 0.14240114423261413, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 16328 + }, + { + "epoch": 0.1424098655177827, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 16329 + }, + { + "epoch": 0.14241858680295127, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 16330 + }, + { + "epoch": 0.14242730808811987, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 16331 + }, + { + "epoch": 0.14243602937328845, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 16332 + }, + { + "epoch": 0.14244475065845702, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 16333 + }, + { + "epoch": 0.14245347194362562, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 16334 + }, + { + "epoch": 0.1424621932287942, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 16335 + }, + { + "epoch": 0.14247091451396277, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 16336 + }, + { + "epoch": 0.14247963579913137, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 16337 + }, + { + "epoch": 0.14248835708429994, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 16338 + }, + { + "epoch": 0.14249707836946854, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 16339 + }, + { + "epoch": 0.1425057996546371, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 16340 + }, + { + "epoch": 0.14251452093980568, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 16341 + }, + { + "epoch": 0.14252324222497428, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 16342 + }, + { + "epoch": 0.14253196351014286, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 16343 + }, + { + "epoch": 0.14254068479531143, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 16344 + }, + { + "epoch": 0.14254940608048003, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 16345 + }, + { + "epoch": 0.1425581273656486, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 16346 + }, + { + "epoch": 0.14256684865081717, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 16347 + }, + { + "epoch": 0.14257556993598577, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 16348 + }, + { + "epoch": 0.14258429122115435, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 16349 + }, + { + "epoch": 0.14259301250632292, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 16350 + }, + { + "epoch": 0.14260173379149152, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 16351 + }, + { + "epoch": 0.1426104550766601, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 16352 + }, + { + "epoch": 0.1426191763618287, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 16353 + }, + { + "epoch": 0.14262789764699726, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 16354 + }, + { + "epoch": 0.14263661893216584, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 16355 + }, + { + "epoch": 0.14264534021733444, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 16356 + }, + { + "epoch": 0.142654061502503, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 16357 + }, + { + "epoch": 0.14266278278767158, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 16358 + }, + { + "epoch": 0.14267150407284018, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 16359 + }, + { + "epoch": 0.14268022535800876, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 16360 + }, + { + "epoch": 0.14268894664317733, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 16361 + }, + { + "epoch": 0.14269766792834593, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 16362 + }, + { + "epoch": 0.1427063892135145, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 16363 + }, + { + "epoch": 0.14271511049868307, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 16364 + }, + { + "epoch": 0.14272383178385167, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 16365 + }, + { + "epoch": 0.14273255306902025, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 16366 + }, + { + "epoch": 0.14274127435418885, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 16367 + }, + { + "epoch": 0.14274999563935742, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 16368 + }, + { + "epoch": 0.142758716924526, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 16369 + }, + { + "epoch": 0.1427674382096946, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 16370 + }, + { + "epoch": 0.14277615949486316, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 16371 + }, + { + "epoch": 0.14278488078003174, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 16372 + }, + { + "epoch": 0.14279360206520034, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 16373 + }, + { + "epoch": 0.1428023233503689, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 16374 + }, + { + "epoch": 0.14281104463553748, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 16375 + }, + { + "epoch": 0.14281976592070608, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 16376 + }, + { + "epoch": 0.14282848720587465, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 16377 + }, + { + "epoch": 0.14283720849104323, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 16378 + }, + { + "epoch": 0.14284592977621183, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 16379 + }, + { + "epoch": 0.1428546510613804, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 16380 + }, + { + "epoch": 0.142863372346549, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 16381 + }, + { + "epoch": 0.14287209363171757, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 16382 + }, + { + "epoch": 0.14288081491688615, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 16383 + }, + { + "epoch": 0.14288953620205475, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 16384 + }, + { + "epoch": 0.14289825748722332, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 16385 + }, + { + "epoch": 0.1429069787723919, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 16386 + }, + { + "epoch": 0.1429157000575605, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 16387 + }, + { + "epoch": 0.14292442134272906, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 16388 + }, + { + "epoch": 0.14293314262789764, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 16389 + }, + { + "epoch": 0.14294186391306624, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 16390 + }, + { + "epoch": 0.1429505851982348, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 16391 + }, + { + "epoch": 0.14295930648340338, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 16392 + }, + { + "epoch": 0.14296802776857198, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 16393 + }, + { + "epoch": 0.14297674905374055, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 16394 + }, + { + "epoch": 0.14298547033890915, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 16395 + }, + { + "epoch": 0.14299419162407773, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 16396 + }, + { + "epoch": 0.1430029129092463, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 16397 + }, + { + "epoch": 0.1430116341944149, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 16398 + }, + { + "epoch": 0.14302035547958347, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 16399 + }, + { + "epoch": 0.14302907676475204, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 16400 + }, + { + "epoch": 0.14303779804992064, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 16401 + }, + { + "epoch": 0.14304651933508922, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 16402 + }, + { + "epoch": 0.1430552406202578, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 16403 + }, + { + "epoch": 0.1430639619054264, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 16404 + }, + { + "epoch": 0.14307268319059496, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 16405 + }, + { + "epoch": 0.14308140447576354, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 16406 + }, + { + "epoch": 0.14309012576093214, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 16407 + }, + { + "epoch": 0.1430988470461007, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 16408 + }, + { + "epoch": 0.1431075683312693, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 16409 + }, + { + "epoch": 0.14311628961643788, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 16410 + }, + { + "epoch": 0.14312501090160645, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 16411 + }, + { + "epoch": 0.14313373218677505, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 16412 + }, + { + "epoch": 0.14314245347194363, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 16413 + }, + { + "epoch": 0.1431511747571122, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 16414 + }, + { + "epoch": 0.1431598960422808, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 16415 + }, + { + "epoch": 0.14316861732744937, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 16416 + }, + { + "epoch": 0.14317733861261794, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 16417 + }, + { + "epoch": 0.14318605989778654, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 16418 + }, + { + "epoch": 0.14319478118295512, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 16419 + }, + { + "epoch": 0.1432035024681237, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 16420 + }, + { + "epoch": 0.1432122237532923, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 16421 + }, + { + "epoch": 0.14322094503846086, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 16422 + }, + { + "epoch": 0.14322966632362946, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 16423 + }, + { + "epoch": 0.14323838760879803, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 16424 + }, + { + "epoch": 0.1432471088939666, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 16425 + }, + { + "epoch": 0.1432558301791352, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 16426 + }, + { + "epoch": 0.14326455146430378, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 16427 + }, + { + "epoch": 0.14327327274947235, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 16428 + }, + { + "epoch": 0.14328199403464095, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 16429 + }, + { + "epoch": 0.14329071531980953, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 16430 + }, + { + "epoch": 0.1432994366049781, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 16431 + }, + { + "epoch": 0.1433081578901467, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 16432 + }, + { + "epoch": 0.14331687917531527, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 16433 + }, + { + "epoch": 0.14332560046048387, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 16434 + }, + { + "epoch": 0.14333432174565244, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 16435 + }, + { + "epoch": 0.14334304303082102, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 16436 + }, + { + "epoch": 0.14335176431598962, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 16437 + }, + { + "epoch": 0.1433604856011582, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 16438 + }, + { + "epoch": 0.14336920688632676, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 16439 + }, + { + "epoch": 0.14337792817149536, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 16440 + }, + { + "epoch": 0.14338664945666393, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 16441 + }, + { + "epoch": 0.1433953707418325, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 16442 + }, + { + "epoch": 0.1434040920270011, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 16443 + }, + { + "epoch": 0.14341281331216968, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 16444 + }, + { + "epoch": 0.14342153459733825, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 16445 + }, + { + "epoch": 0.14343025588250685, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 16446 + }, + { + "epoch": 0.14343897716767542, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 16447 + }, + { + "epoch": 0.14344769845284402, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 16448 + }, + { + "epoch": 0.1434564197380126, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 16449 + }, + { + "epoch": 0.14346514102318117, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 16450 + }, + { + "epoch": 0.14347386230834977, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 16451 + }, + { + "epoch": 0.14348258359351834, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 16452 + }, + { + "epoch": 0.14349130487868691, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 16453 + }, + { + "epoch": 0.14350002616385552, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 16454 + }, + { + "epoch": 0.1435087474490241, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 16455 + }, + { + "epoch": 0.14351746873419266, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 16456 + }, + { + "epoch": 0.14352619001936126, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 16457 + }, + { + "epoch": 0.14353491130452983, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 16458 + }, + { + "epoch": 0.1435436325896984, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 16459 + }, + { + "epoch": 0.143552353874867, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 16460 + }, + { + "epoch": 0.14356107516003558, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 16461 + }, + { + "epoch": 0.14356979644520418, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 16462 + }, + { + "epoch": 0.14357851773037275, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 16463 + }, + { + "epoch": 0.14358723901554132, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 16464 + }, + { + "epoch": 0.14359596030070992, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 16465 + }, + { + "epoch": 0.1436046815858785, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 16466 + }, + { + "epoch": 0.14361340287104707, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 16467 + }, + { + "epoch": 0.14362212415621567, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 16468 + }, + { + "epoch": 0.14363084544138424, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 16469 + }, + { + "epoch": 0.14363956672655281, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 16470 + }, + { + "epoch": 0.14364828801172141, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 16471 + }, + { + "epoch": 0.14365700929689, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 16472 + }, + { + "epoch": 0.14366573058205856, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 16473 + }, + { + "epoch": 0.14367445186722716, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 16474 + }, + { + "epoch": 0.14368317315239573, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 16475 + }, + { + "epoch": 0.14369189443756433, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 16476 + }, + { + "epoch": 0.1437006157227329, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 16477 + }, + { + "epoch": 0.14370933700790148, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 16478 + }, + { + "epoch": 0.14371805829307008, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 16479 + }, + { + "epoch": 0.14372677957823865, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 16480 + }, + { + "epoch": 0.14373550086340722, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 16481 + }, + { + "epoch": 0.14374422214857582, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 16482 + }, + { + "epoch": 0.1437529434337444, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 16483 + }, + { + "epoch": 0.14376166471891297, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 16484 + }, + { + "epoch": 0.14377038600408157, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 16485 + }, + { + "epoch": 0.14377910728925014, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 16486 + }, + { + "epoch": 0.1437878285744187, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 16487 + }, + { + "epoch": 0.1437965498595873, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 16488 + }, + { + "epoch": 0.1438052711447559, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 16489 + }, + { + "epoch": 0.1438139924299245, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 16490 + }, + { + "epoch": 0.14382271371509306, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 16491 + }, + { + "epoch": 0.14383143500026163, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 16492 + }, + { + "epoch": 0.14384015628543023, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 16493 + }, + { + "epoch": 0.1438488775705988, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 16494 + }, + { + "epoch": 0.14385759885576738, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 16495 + }, + { + "epoch": 0.14386632014093598, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 16496 + }, + { + "epoch": 0.14387504142610455, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 16497 + }, + { + "epoch": 0.14388376271127312, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 16498 + }, + { + "epoch": 0.14389248399644172, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 16499 + }, + { + "epoch": 0.1439012052816103, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 16500 + }, + { + "epoch": 0.14390992656677887, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 16501 + }, + { + "epoch": 0.14391864785194747, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 16502 + }, + { + "epoch": 0.14392736913711604, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 16503 + }, + { + "epoch": 0.14393609042228464, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 16504 + }, + { + "epoch": 0.1439448117074532, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 16505 + }, + { + "epoch": 0.14395353299262179, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 16506 + }, + { + "epoch": 0.14396225427779039, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 16507 + }, + { + "epoch": 0.14397097556295896, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 16508 + }, + { + "epoch": 0.14397969684812753, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 16509 + }, + { + "epoch": 0.14398841813329613, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 16510 + }, + { + "epoch": 0.1439971394184647, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 16511 + }, + { + "epoch": 0.14400586070363328, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 16512 + }, + { + "epoch": 0.14401458198880188, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 16513 + }, + { + "epoch": 0.14402330327397045, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 16514 + }, + { + "epoch": 0.14403202455913902, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 16515 + }, + { + "epoch": 0.14404074584430762, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 16516 + }, + { + "epoch": 0.1440494671294762, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 16517 + }, + { + "epoch": 0.1440581884146448, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 16518 + }, + { + "epoch": 0.14406690969981337, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 16519 + }, + { + "epoch": 0.14407563098498194, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 16520 + }, + { + "epoch": 0.14408435227015054, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 16521 + }, + { + "epoch": 0.1440930735553191, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 16522 + }, + { + "epoch": 0.14410179484048768, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 16523 + }, + { + "epoch": 0.14411051612565628, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 16524 + }, + { + "epoch": 0.14411923741082486, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 16525 + }, + { + "epoch": 0.14412795869599343, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 16526 + }, + { + "epoch": 0.14413667998116203, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 16527 + }, + { + "epoch": 0.1441454012663306, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 16528 + }, + { + "epoch": 0.14415412255149918, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 16529 + }, + { + "epoch": 0.14416284383666778, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 16530 + }, + { + "epoch": 0.14417156512183635, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 16531 + }, + { + "epoch": 0.14418028640700495, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 16532 + }, + { + "epoch": 0.14418900769217352, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 16533 + }, + { + "epoch": 0.1441977289773421, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 16534 + }, + { + "epoch": 0.1442064502625107, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 16535 + }, + { + "epoch": 0.14421517154767927, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 16536 + }, + { + "epoch": 0.14422389283284784, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 16537 + }, + { + "epoch": 0.14423261411801644, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 16538 + }, + { + "epoch": 0.144241335403185, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 16539 + }, + { + "epoch": 0.14425005668835358, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 16540 + }, + { + "epoch": 0.14425877797352218, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 16541 + }, + { + "epoch": 0.14426749925869076, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 16542 + }, + { + "epoch": 0.14427622054385933, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 16543 + }, + { + "epoch": 0.14428494182902793, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 16544 + }, + { + "epoch": 0.1442936631141965, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 16545 + }, + { + "epoch": 0.1443023843993651, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 16546 + }, + { + "epoch": 0.14431110568453367, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 16547 + }, + { + "epoch": 0.14431982696970225, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 16548 + }, + { + "epoch": 0.14432854825487085, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 16549 + }, + { + "epoch": 0.14433726954003942, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 16550 + }, + { + "epoch": 0.144345990825208, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 16551 + }, + { + "epoch": 0.1443547121103766, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 16552 + }, + { + "epoch": 0.14436343339554517, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 16553 + }, + { + "epoch": 0.14437215468071374, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 16554 + }, + { + "epoch": 0.14438087596588234, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 16555 + }, + { + "epoch": 0.1443895972510509, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 16556 + }, + { + "epoch": 0.1443983185362195, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 16557 + }, + { + "epoch": 0.14440703982138808, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 16558 + }, + { + "epoch": 0.14441576110655666, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 16559 + }, + { + "epoch": 0.14442448239172526, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 16560 + }, + { + "epoch": 0.14443320367689383, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 16561 + }, + { + "epoch": 0.1444419249620624, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 16562 + }, + { + "epoch": 0.144450646247231, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 16563 + }, + { + "epoch": 0.14445936753239957, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 16564 + }, + { + "epoch": 0.14446808881756815, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 16565 + }, + { + "epoch": 0.14447681010273675, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 16566 + }, + { + "epoch": 0.14448553138790532, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 16567 + }, + { + "epoch": 0.1444942526730739, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 16568 + }, + { + "epoch": 0.1445029739582425, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 16569 + }, + { + "epoch": 0.14451169524341106, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 16570 + }, + { + "epoch": 0.14452041652857966, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 16571 + }, + { + "epoch": 0.14452913781374824, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 16572 + }, + { + "epoch": 0.1445378590989168, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 16573 + }, + { + "epoch": 0.1445465803840854, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 16574 + }, + { + "epoch": 0.14455530166925398, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 16575 + }, + { + "epoch": 0.14456402295442256, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 16576 + }, + { + "epoch": 0.14457274423959116, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 16577 + }, + { + "epoch": 0.14458146552475973, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 16578 + }, + { + "epoch": 0.1445901868099283, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 16579 + }, + { + "epoch": 0.1445989080950969, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 16580 + }, + { + "epoch": 0.14460762938026547, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 16581 + }, + { + "epoch": 0.14461635066543405, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 16582 + }, + { + "epoch": 0.14462507195060265, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 16583 + }, + { + "epoch": 0.14463379323577122, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 16584 + }, + { + "epoch": 0.14464251452093982, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 16585 + }, + { + "epoch": 0.1446512358061084, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 16586 + }, + { + "epoch": 0.14465995709127696, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 16587 + }, + { + "epoch": 0.14466867837644556, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 16588 + }, + { + "epoch": 0.14467739966161414, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 16589 + }, + { + "epoch": 0.1446861209467827, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 16590 + }, + { + "epoch": 0.1446948422319513, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 16591 + }, + { + "epoch": 0.14470356351711988, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 16592 + }, + { + "epoch": 0.14471228480228845, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 16593 + }, + { + "epoch": 0.14472100608745705, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 16594 + }, + { + "epoch": 0.14472972737262563, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 16595 + }, + { + "epoch": 0.1447384486577942, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 16596 + }, + { + "epoch": 0.1447471699429628, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 16597 + }, + { + "epoch": 0.14475589122813137, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 16598 + }, + { + "epoch": 0.14476461251329997, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 16599 + }, + { + "epoch": 0.14477333379846855, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 16600 + }, + { + "epoch": 0.14478205508363712, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 16601 + }, + { + "epoch": 0.14479077636880572, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 16602 + }, + { + "epoch": 0.1447994976539743, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 16603 + }, + { + "epoch": 0.14480821893914286, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 16604 + }, + { + "epoch": 0.14481694022431146, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 16605 + }, + { + "epoch": 0.14482566150948004, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 16606 + }, + { + "epoch": 0.1448343827946486, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 16607 + }, + { + "epoch": 0.1448431040798172, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 16608 + }, + { + "epoch": 0.14485182536498578, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 16609 + }, + { + "epoch": 0.14486054665015435, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 16610 + }, + { + "epoch": 0.14486926793532295, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 16611 + }, + { + "epoch": 0.14487798922049153, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 16612 + }, + { + "epoch": 0.14488671050566013, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 16613 + }, + { + "epoch": 0.1448954317908287, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 16614 + }, + { + "epoch": 0.14490415307599727, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 16615 + }, + { + "epoch": 0.14491287436116587, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 16616 + }, + { + "epoch": 0.14492159564633444, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 16617 + }, + { + "epoch": 0.14493031693150302, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 16618 + }, + { + "epoch": 0.14493903821667162, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 16619 + }, + { + "epoch": 0.1449477595018402, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 16620 + }, + { + "epoch": 0.14495648078700876, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 16621 + }, + { + "epoch": 0.14496520207217736, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 16622 + }, + { + "epoch": 0.14497392335734594, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 16623 + }, + { + "epoch": 0.1449826446425145, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 16624 + }, + { + "epoch": 0.1449913659276831, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 16625 + }, + { + "epoch": 0.14500008721285168, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 16626 + }, + { + "epoch": 0.14500880849802028, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 16627 + }, + { + "epoch": 0.14501752978318885, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 16628 + }, + { + "epoch": 0.14502625106835743, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 16629 + }, + { + "epoch": 0.14503497235352603, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 16630 + }, + { + "epoch": 0.1450436936386946, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 16631 + }, + { + "epoch": 0.14505241492386317, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 16632 + }, + { + "epoch": 0.14506113620903177, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 16633 + }, + { + "epoch": 0.14506985749420034, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 16634 + }, + { + "epoch": 0.14507857877936892, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 16635 + }, + { + "epoch": 0.14508730006453752, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 16636 + }, + { + "epoch": 0.1450960213497061, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 16637 + }, + { + "epoch": 0.14510474263487466, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 16638 + }, + { + "epoch": 0.14511346392004326, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 16639 + }, + { + "epoch": 0.14512218520521183, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 16640 + }, + { + "epoch": 0.14513090649038043, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 16641 + }, + { + "epoch": 0.145139627775549, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 16642 + }, + { + "epoch": 0.14514834906071758, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 16643 + }, + { + "epoch": 0.14515707034588618, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 16644 + }, + { + "epoch": 0.14516579163105475, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 16645 + }, + { + "epoch": 0.14517451291622332, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 16646 + }, + { + "epoch": 0.14518323420139193, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 16647 + }, + { + "epoch": 0.1451919554865605, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 16648 + }, + { + "epoch": 0.14520067677172907, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 16649 + }, + { + "epoch": 0.14520939805689767, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 16650 + }, + { + "epoch": 0.14521811934206624, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 16651 + }, + { + "epoch": 0.14522684062723482, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 16652 + }, + { + "epoch": 0.14523556191240342, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 16653 + }, + { + "epoch": 0.145244283197572, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 16654 + }, + { + "epoch": 0.1452530044827406, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 16655 + }, + { + "epoch": 0.14526172576790916, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 16656 + }, + { + "epoch": 0.14527044705307773, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 16657 + }, + { + "epoch": 0.14527916833824633, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 16658 + }, + { + "epoch": 0.1452878896234149, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 16659 + }, + { + "epoch": 0.14529661090858348, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 16660 + }, + { + "epoch": 0.14530533219375208, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 16661 + }, + { + "epoch": 0.14531405347892065, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 16662 + }, + { + "epoch": 0.14532277476408922, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 16663 + }, + { + "epoch": 0.14533149604925782, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 16664 + }, + { + "epoch": 0.1453402173344264, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 16665 + }, + { + "epoch": 0.145348938619595, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 16666 + }, + { + "epoch": 0.14535765990476357, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 16667 + }, + { + "epoch": 0.14536638118993214, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 16668 + }, + { + "epoch": 0.14537510247510074, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 16669 + }, + { + "epoch": 0.14538382376026932, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 16670 + }, + { + "epoch": 0.1453925450454379, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 16671 + }, + { + "epoch": 0.1454012663306065, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 16672 + }, + { + "epoch": 0.14540998761577506, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 16673 + }, + { + "epoch": 0.14541870890094363, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 16674 + }, + { + "epoch": 0.14542743018611223, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 16675 + }, + { + "epoch": 0.1454361514712808, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 16676 + }, + { + "epoch": 0.14544487275644938, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 16677 + }, + { + "epoch": 0.14545359404161798, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 16678 + }, + { + "epoch": 0.14546231532678655, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 16679 + }, + { + "epoch": 0.14547103661195515, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 16680 + }, + { + "epoch": 0.14547975789712372, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 16681 + }, + { + "epoch": 0.1454884791822923, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 16682 + }, + { + "epoch": 0.1454972004674609, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 16683 + }, + { + "epoch": 0.14550592175262947, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 16684 + }, + { + "epoch": 0.14551464303779804, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 16685 + }, + { + "epoch": 0.14552336432296664, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 16686 + }, + { + "epoch": 0.14553208560813521, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 16687 + }, + { + "epoch": 0.1455408068933038, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 16688 + }, + { + "epoch": 0.1455495281784724, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 16689 + }, + { + "epoch": 0.14555824946364096, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 16690 + }, + { + "epoch": 0.14556697074880953, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 16691 + }, + { + "epoch": 0.14557569203397813, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 16692 + }, + { + "epoch": 0.1455844133191467, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 16693 + }, + { + "epoch": 0.1455931346043153, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 16694 + }, + { + "epoch": 0.14560185588948388, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 16695 + }, + { + "epoch": 0.14561057717465245, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 16696 + }, + { + "epoch": 0.14561929845982105, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 16697 + }, + { + "epoch": 0.14562801974498962, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 16698 + }, + { + "epoch": 0.1456367410301582, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 16699 + }, + { + "epoch": 0.1456454623153268, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 16700 + }, + { + "epoch": 0.14565418360049537, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 16701 + }, + { + "epoch": 0.14566290488566394, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 16702 + }, + { + "epoch": 0.14567162617083254, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 16703 + }, + { + "epoch": 0.1456803474560011, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 16704 + }, + { + "epoch": 0.14568906874116969, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 16705 + }, + { + "epoch": 0.1456977900263383, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 16706 + }, + { + "epoch": 0.14570651131150686, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 16707 + }, + { + "epoch": 0.14571523259667546, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 16708 + }, + { + "epoch": 0.14572395388184403, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 16709 + }, + { + "epoch": 0.1457326751670126, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 16710 + }, + { + "epoch": 0.1457413964521812, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 16711 + }, + { + "epoch": 0.14575011773734978, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 16712 + }, + { + "epoch": 0.14575883902251835, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 16713 + }, + { + "epoch": 0.14576756030768695, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 16714 + }, + { + "epoch": 0.14577628159285552, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 16715 + }, + { + "epoch": 0.1457850028780241, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 16716 + }, + { + "epoch": 0.1457937241631927, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 16717 + }, + { + "epoch": 0.14580244544836127, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 16718 + }, + { + "epoch": 0.14581116673352984, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 16719 + }, + { + "epoch": 0.14581988801869844, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 16720 + }, + { + "epoch": 0.145828609303867, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 16721 + }, + { + "epoch": 0.1458373305890356, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 16722 + }, + { + "epoch": 0.14584605187420419, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 16723 + }, + { + "epoch": 0.14585477315937276, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 16724 + }, + { + "epoch": 0.14586349444454136, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 16725 + }, + { + "epoch": 0.14587221572970993, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 16726 + }, + { + "epoch": 0.1458809370148785, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 16727 + }, + { + "epoch": 0.1458896583000471, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 16728 + }, + { + "epoch": 0.14589837958521568, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 16729 + }, + { + "epoch": 0.14590710087038425, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 16730 + }, + { + "epoch": 0.14591582215555285, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 16731 + }, + { + "epoch": 0.14592454344072142, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 16732 + }, + { + "epoch": 0.14593326472589, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 16733 + }, + { + "epoch": 0.1459419860110586, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 16734 + }, + { + "epoch": 0.14595070729622717, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 16735 + }, + { + "epoch": 0.14595942858139577, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 16736 + }, + { + "epoch": 0.14596814986656434, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 16737 + }, + { + "epoch": 0.1459768711517329, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 16738 + }, + { + "epoch": 0.1459855924369015, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 16739 + }, + { + "epoch": 0.14599431372207008, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 16740 + }, + { + "epoch": 0.14600303500723866, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 16741 + }, + { + "epoch": 0.14601175629240726, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 16742 + }, + { + "epoch": 0.14602047757757583, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 16743 + }, + { + "epoch": 0.1460291988627444, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 16744 + }, + { + "epoch": 0.146037920147913, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 16745 + }, + { + "epoch": 0.14604664143308158, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 16746 + }, + { + "epoch": 0.14605536271825015, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 16747 + }, + { + "epoch": 0.14606408400341875, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 16748 + }, + { + "epoch": 0.14607280528858732, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 16749 + }, + { + "epoch": 0.14608152657375592, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 16750 + }, + { + "epoch": 0.1460902478589245, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 16751 + }, + { + "epoch": 0.14609896914409307, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 16752 + }, + { + "epoch": 0.14610769042926167, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 16753 + }, + { + "epoch": 0.14611641171443024, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 16754 + }, + { + "epoch": 0.1461251329995988, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 16755 + }, + { + "epoch": 0.1461338542847674, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 16756 + }, + { + "epoch": 0.14614257556993598, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 16757 + }, + { + "epoch": 0.14615129685510456, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 16758 + }, + { + "epoch": 0.14616001814027316, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 16759 + }, + { + "epoch": 0.14616873942544173, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 16760 + }, + { + "epoch": 0.1461774607106103, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 16761 + }, + { + "epoch": 0.1461861819957789, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 16762 + }, + { + "epoch": 0.14619490328094747, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 16763 + }, + { + "epoch": 0.14620362456611607, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 16764 + }, + { + "epoch": 0.14621234585128465, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 16765 + }, + { + "epoch": 0.14622106713645322, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 16766 + }, + { + "epoch": 0.14622978842162182, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 16767 + }, + { + "epoch": 0.1462385097067904, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 16768 + }, + { + "epoch": 0.14624723099195897, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 16769 + }, + { + "epoch": 0.14625595227712757, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 16770 + }, + { + "epoch": 0.14626467356229614, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 16771 + }, + { + "epoch": 0.1462733948474647, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 16772 + }, + { + "epoch": 0.1462821161326333, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 16773 + }, + { + "epoch": 0.14629083741780188, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 16774 + }, + { + "epoch": 0.14629955870297048, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 16775 + }, + { + "epoch": 0.14630827998813906, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 16776 + }, + { + "epoch": 0.14631700127330763, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 16777 + }, + { + "epoch": 0.14632572255847623, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 16778 + }, + { + "epoch": 0.1463344438436448, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 16779 + }, + { + "epoch": 0.14634316512881337, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 16780 + }, + { + "epoch": 0.14635188641398197, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 16781 + }, + { + "epoch": 0.14636060769915055, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 16782 + }, + { + "epoch": 0.14636932898431912, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 16783 + }, + { + "epoch": 0.14637805026948772, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 16784 + }, + { + "epoch": 0.1463867715546563, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 16785 + }, + { + "epoch": 0.14639549283982486, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 16786 + }, + { + "epoch": 0.14640421412499346, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 16787 + }, + { + "epoch": 0.14641293541016204, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 16788 + }, + { + "epoch": 0.14642165669533064, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 16789 + }, + { + "epoch": 0.1464303779804992, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 16790 + }, + { + "epoch": 0.14643909926566778, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 16791 + }, + { + "epoch": 0.14644782055083638, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 16792 + }, + { + "epoch": 0.14645654183600496, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 16793 + }, + { + "epoch": 0.14646526312117353, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 16794 + }, + { + "epoch": 0.14647398440634213, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 16795 + }, + { + "epoch": 0.1464827056915107, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 16796 + }, + { + "epoch": 0.14649142697667927, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 16797 + }, + { + "epoch": 0.14650014826184787, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 16798 + }, + { + "epoch": 0.14650886954701645, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 16799 + }, + { + "epoch": 0.14651759083218502, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 16800 + }, + { + "epoch": 0.14652631211735362, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 16801 + }, + { + "epoch": 0.1465350334025222, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 16802 + }, + { + "epoch": 0.1465437546876908, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 16803 + }, + { + "epoch": 0.14655247597285936, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 16804 + }, + { + "epoch": 0.14656119725802794, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 16805 + }, + { + "epoch": 0.14656991854319654, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 16806 + }, + { + "epoch": 0.1465786398283651, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 16807 + }, + { + "epoch": 0.14658736111353368, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 16808 + }, + { + "epoch": 0.14659608239870228, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 16809 + }, + { + "epoch": 0.14660480368387085, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 16810 + }, + { + "epoch": 0.14661352496903943, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 16811 + }, + { + "epoch": 0.14662224625420803, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 16812 + }, + { + "epoch": 0.1466309675393766, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 16813 + }, + { + "epoch": 0.14663968882454517, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 16814 + }, + { + "epoch": 0.14664841010971377, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 16815 + }, + { + "epoch": 0.14665713139488235, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 16816 + }, + { + "epoch": 0.14666585268005095, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 16817 + }, + { + "epoch": 0.14667457396521952, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 16818 + }, + { + "epoch": 0.1466832952503881, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 16819 + }, + { + "epoch": 0.1466920165355567, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 16820 + }, + { + "epoch": 0.14670073782072526, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 16821 + }, + { + "epoch": 0.14670945910589384, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 16822 + }, + { + "epoch": 0.14671818039106244, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 16823 + }, + { + "epoch": 0.146726901676231, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 16824 + }, + { + "epoch": 0.14673562296139958, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 16825 + }, + { + "epoch": 0.14674434424656818, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 16826 + }, + { + "epoch": 0.14675306553173675, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 16827 + }, + { + "epoch": 0.14676178681690533, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 16828 + }, + { + "epoch": 0.14677050810207393, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 16829 + }, + { + "epoch": 0.1467792293872425, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 16830 + }, + { + "epoch": 0.1467879506724111, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 16831 + }, + { + "epoch": 0.14679667195757967, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 16832 + }, + { + "epoch": 0.14680539324274824, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 16833 + }, + { + "epoch": 0.14681411452791684, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 16834 + }, + { + "epoch": 0.14682283581308542, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 16835 + }, + { + "epoch": 0.146831557098254, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 16836 + }, + { + "epoch": 0.1468402783834226, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 16837 + }, + { + "epoch": 0.14684899966859116, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 16838 + }, + { + "epoch": 0.14685772095375974, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 16839 + }, + { + "epoch": 0.14686644223892834, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 16840 + }, + { + "epoch": 0.1468751635240969, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 16841 + }, + { + "epoch": 0.14688388480926548, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 16842 + }, + { + "epoch": 0.14689260609443408, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 16843 + }, + { + "epoch": 0.14690132737960265, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 16844 + }, + { + "epoch": 0.14691004866477125, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 16845 + }, + { + "epoch": 0.14691876994993983, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 16846 + }, + { + "epoch": 0.1469274912351084, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 16847 + }, + { + "epoch": 0.146936212520277, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 16848 + }, + { + "epoch": 0.14694493380544557, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 16849 + }, + { + "epoch": 0.14695365509061414, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 16850 + }, + { + "epoch": 0.14696237637578274, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 16851 + }, + { + "epoch": 0.14697109766095132, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 16852 + }, + { + "epoch": 0.1469798189461199, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 16853 + }, + { + "epoch": 0.1469885402312885, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 16854 + }, + { + "epoch": 0.14699726151645706, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 16855 + }, + { + "epoch": 0.14700598280162563, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 16856 + }, + { + "epoch": 0.14701470408679423, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 16857 + }, + { + "epoch": 0.1470234253719628, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 16858 + }, + { + "epoch": 0.1470321466571314, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 16859 + }, + { + "epoch": 0.14704086794229998, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 16860 + }, + { + "epoch": 0.14704958922746855, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 16861 + }, + { + "epoch": 0.14705831051263715, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 16862 + }, + { + "epoch": 0.14706703179780573, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 16863 + }, + { + "epoch": 0.1470757530829743, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 16864 + }, + { + "epoch": 0.1470844743681429, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 16865 + }, + { + "epoch": 0.14709319565331147, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 16866 + }, + { + "epoch": 0.14710191693848004, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 16867 + }, + { + "epoch": 0.14711063822364864, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 16868 + }, + { + "epoch": 0.14711935950881722, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 16869 + }, + { + "epoch": 0.1471280807939858, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 16870 + }, + { + "epoch": 0.1471368020791544, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 16871 + }, + { + "epoch": 0.14714552336432296, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 16872 + }, + { + "epoch": 0.14715424464949156, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 16873 + }, + { + "epoch": 0.14716296593466013, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 16874 + }, + { + "epoch": 0.1471716872198287, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 16875 + }, + { + "epoch": 0.1471804085049973, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 16876 + }, + { + "epoch": 0.14718912979016588, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 16877 + }, + { + "epoch": 0.14719785107533445, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 16878 + }, + { + "epoch": 0.14720657236050305, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 16879 + }, + { + "epoch": 0.14721529364567162, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 16880 + }, + { + "epoch": 0.1472240149308402, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 16881 + }, + { + "epoch": 0.1472327362160088, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 16882 + }, + { + "epoch": 0.14724145750117737, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 16883 + }, + { + "epoch": 0.14725017878634594, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 16884 + }, + { + "epoch": 0.14725890007151454, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 16885 + }, + { + "epoch": 0.14726762135668311, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 16886 + }, + { + "epoch": 0.14727634264185172, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 16887 + }, + { + "epoch": 0.1472850639270203, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 16888 + }, + { + "epoch": 0.14729378521218886, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 16889 + }, + { + "epoch": 0.14730250649735746, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 16890 + }, + { + "epoch": 0.14731122778252603, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 16891 + }, + { + "epoch": 0.1473199490676946, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 16892 + }, + { + "epoch": 0.1473286703528632, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 16893 + }, + { + "epoch": 0.14733739163803178, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 16894 + }, + { + "epoch": 0.14734611292320035, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 16895 + }, + { + "epoch": 0.14735483420836895, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 16896 + }, + { + "epoch": 0.14736355549353752, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 16897 + }, + { + "epoch": 0.14737227677870612, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 16898 + }, + { + "epoch": 0.1473809980638747, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 16899 + }, + { + "epoch": 0.14738971934904327, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 16900 + }, + { + "epoch": 0.14739844063421187, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 16901 + }, + { + "epoch": 0.14740716191938044, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 16902 + }, + { + "epoch": 0.14741588320454901, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 16903 + }, + { + "epoch": 0.14742460448971761, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 16904 + }, + { + "epoch": 0.1474333257748862, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 16905 + }, + { + "epoch": 0.14744204706005476, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 16906 + }, + { + "epoch": 0.14745076834522336, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 16907 + }, + { + "epoch": 0.14745948963039193, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 16908 + }, + { + "epoch": 0.1474682109155605, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 16909 + }, + { + "epoch": 0.1474769322007291, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 16910 + }, + { + "epoch": 0.14748565348589768, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 16911 + }, + { + "epoch": 0.14749437477106628, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 16912 + }, + { + "epoch": 0.14750309605623485, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 16913 + }, + { + "epoch": 0.14751181734140342, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 16914 + }, + { + "epoch": 0.14752053862657202, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 16915 + }, + { + "epoch": 0.1475292599117406, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 16916 + }, + { + "epoch": 0.14753798119690917, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 16917 + }, + { + "epoch": 0.14754670248207777, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 16918 + }, + { + "epoch": 0.14755542376724634, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 16919 + }, + { + "epoch": 0.1475641450524149, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 16920 + }, + { + "epoch": 0.1475728663375835, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 16921 + }, + { + "epoch": 0.1475815876227521, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 16922 + }, + { + "epoch": 0.14759030890792066, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 16923 + }, + { + "epoch": 0.14759903019308926, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 16924 + }, + { + "epoch": 0.14760775147825783, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 16925 + }, + { + "epoch": 0.14761647276342643, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 16926 + }, + { + "epoch": 0.147625194048595, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 16927 + }, + { + "epoch": 0.14763391533376358, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 16928 + }, + { + "epoch": 0.14764263661893218, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 16929 + }, + { + "epoch": 0.14765135790410075, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 16930 + }, + { + "epoch": 0.14766007918926932, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 16931 + }, + { + "epoch": 0.14766880047443792, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 16932 + }, + { + "epoch": 0.1476775217596065, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 16933 + }, + { + "epoch": 0.14768624304477507, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 16934 + }, + { + "epoch": 0.14769496432994367, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 16935 + }, + { + "epoch": 0.14770368561511224, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 16936 + }, + { + "epoch": 0.1477124069002808, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 16937 + }, + { + "epoch": 0.1477211281854494, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 16938 + }, + { + "epoch": 0.14772984947061799, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 16939 + }, + { + "epoch": 0.14773857075578659, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 16940 + }, + { + "epoch": 0.14774729204095516, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 16941 + }, + { + "epoch": 0.14775601332612373, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 16942 + }, + { + "epoch": 0.14776473461129233, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 16943 + }, + { + "epoch": 0.1477734558964609, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 16944 + }, + { + "epoch": 0.14778217718162948, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 16945 + }, + { + "epoch": 0.14779089846679808, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 16946 + }, + { + "epoch": 0.14779961975196665, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 16947 + }, + { + "epoch": 0.14780834103713522, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 16948 + }, + { + "epoch": 0.14781706232230382, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 16949 + }, + { + "epoch": 0.1478257836074724, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 16950 + }, + { + "epoch": 0.14783450489264097, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 16951 + }, + { + "epoch": 0.14784322617780957, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 16952 + }, + { + "epoch": 0.14785194746297814, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 16953 + }, + { + "epoch": 0.14786066874814674, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 16954 + }, + { + "epoch": 0.1478693900333153, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 16955 + }, + { + "epoch": 0.14787811131848388, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 16956 + }, + { + "epoch": 0.14788683260365248, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 16957 + }, + { + "epoch": 0.14789555388882106, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 16958 + }, + { + "epoch": 0.14790427517398963, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 16959 + }, + { + "epoch": 0.14791299645915823, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 16960 + }, + { + "epoch": 0.1479217177443268, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 16961 + }, + { + "epoch": 0.14793043902949538, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 16962 + }, + { + "epoch": 0.14793916031466398, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 16963 + }, + { + "epoch": 0.14794788159983255, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 16964 + }, + { + "epoch": 0.14795660288500112, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 16965 + }, + { + "epoch": 0.14796532417016972, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 16966 + }, + { + "epoch": 0.1479740454553383, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 16967 + }, + { + "epoch": 0.1479827667405069, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 16968 + }, + { + "epoch": 0.14799148802567547, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 16969 + }, + { + "epoch": 0.14800020931084404, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 16970 + }, + { + "epoch": 0.14800893059601264, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 16971 + }, + { + "epoch": 0.1480176518811812, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 16972 + }, + { + "epoch": 0.14802637316634978, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 16973 + }, + { + "epoch": 0.14803509445151838, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 16974 + }, + { + "epoch": 0.14804381573668696, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 16975 + }, + { + "epoch": 0.14805253702185553, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 16976 + }, + { + "epoch": 0.14806125830702413, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 16977 + }, + { + "epoch": 0.1480699795921927, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 16978 + }, + { + "epoch": 0.14807870087736127, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 16979 + }, + { + "epoch": 0.14808742216252987, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 16980 + }, + { + "epoch": 0.14809614344769845, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 16981 + }, + { + "epoch": 0.14810486473286705, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 16982 + }, + { + "epoch": 0.14811358601803562, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 16983 + }, + { + "epoch": 0.1481223073032042, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 16984 + }, + { + "epoch": 0.1481310285883728, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 16985 + }, + { + "epoch": 0.14813974987354137, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 16986 + }, + { + "epoch": 0.14814847115870994, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 16987 + }, + { + "epoch": 0.14815719244387854, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 16988 + }, + { + "epoch": 0.1481659137290471, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 16989 + }, + { + "epoch": 0.14817463501421568, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 16990 + }, + { + "epoch": 0.14818335629938428, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 16991 + }, + { + "epoch": 0.14819207758455286, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 16992 + }, + { + "epoch": 0.14820079886972143, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 16993 + }, + { + "epoch": 0.14820952015489003, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 16994 + }, + { + "epoch": 0.1482182414400586, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 16995 + }, + { + "epoch": 0.1482269627252272, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 16996 + }, + { + "epoch": 0.14823568401039577, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 16997 + }, + { + "epoch": 0.14824440529556435, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 16998 + }, + { + "epoch": 0.14825312658073295, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 16999 + }, + { + "epoch": 0.14826184786590152, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 17000 + }, + { + "epoch": 0.1482705691510701, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 17001 + }, + { + "epoch": 0.1482792904362387, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 17002 + }, + { + "epoch": 0.14828801172140726, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 17003 + }, + { + "epoch": 0.14829673300657584, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 17004 + }, + { + "epoch": 0.14830545429174444, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 17005 + }, + { + "epoch": 0.148314175576913, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 17006 + }, + { + "epoch": 0.1483228968620816, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 17007 + }, + { + "epoch": 0.14833161814725018, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 17008 + }, + { + "epoch": 0.14834033943241876, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 17009 + }, + { + "epoch": 0.14834906071758736, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 17010 + }, + { + "epoch": 0.14835778200275593, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 17011 + }, + { + "epoch": 0.1483665032879245, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 17012 + }, + { + "epoch": 0.1483752245730931, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 17013 + }, + { + "epoch": 0.14838394585826167, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 17014 + }, + { + "epoch": 0.14839266714343025, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 17015 + }, + { + "epoch": 0.14840138842859885, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 17016 + }, + { + "epoch": 0.14841010971376742, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 17017 + }, + { + "epoch": 0.148418830998936, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 17018 + }, + { + "epoch": 0.1484275522841046, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 17019 + }, + { + "epoch": 0.14843627356927316, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 17020 + }, + { + "epoch": 0.14844499485444176, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 17021 + }, + { + "epoch": 0.14845371613961034, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 17022 + }, + { + "epoch": 0.1484624374247789, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 17023 + }, + { + "epoch": 0.1484711587099475, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 17024 + }, + { + "epoch": 0.14847987999511608, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 17025 + }, + { + "epoch": 0.14848860128028465, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 17026 + }, + { + "epoch": 0.14849732256545325, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 17027 + }, + { + "epoch": 0.14850604385062183, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 17028 + }, + { + "epoch": 0.1485147651357904, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 17029 + }, + { + "epoch": 0.148523486420959, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 17030 + }, + { + "epoch": 0.14853220770612757, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 17031 + }, + { + "epoch": 0.14854092899129615, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 17032 + }, + { + "epoch": 0.14854965027646475, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 17033 + }, + { + "epoch": 0.14855837156163332, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 17034 + }, + { + "epoch": 0.14856709284680192, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 17035 + }, + { + "epoch": 0.1485758141319705, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 17036 + }, + { + "epoch": 0.14858453541713906, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 17037 + }, + { + "epoch": 0.14859325670230766, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 17038 + }, + { + "epoch": 0.14860197798747624, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 17039 + }, + { + "epoch": 0.1486106992726448, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 17040 + }, + { + "epoch": 0.1486194205578134, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 17041 + }, + { + "epoch": 0.14862814184298198, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 17042 + }, + { + "epoch": 0.14863686312815055, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 17043 + }, + { + "epoch": 0.14864558441331915, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 17044 + }, + { + "epoch": 0.14865430569848773, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 17045 + }, + { + "epoch": 0.1486630269836563, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 17046 + }, + { + "epoch": 0.1486717482688249, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 17047 + }, + { + "epoch": 0.14868046955399347, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 17048 + }, + { + "epoch": 0.14868919083916207, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 17049 + }, + { + "epoch": 0.14869791212433064, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 17050 + }, + { + "epoch": 0.14870663340949922, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 17051 + }, + { + "epoch": 0.14871535469466782, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 17052 + }, + { + "epoch": 0.1487240759798364, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 17053 + }, + { + "epoch": 0.14873279726500496, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 17054 + }, + { + "epoch": 0.14874151855017356, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 17055 + }, + { + "epoch": 0.14875023983534214, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 17056 + }, + { + "epoch": 0.1487589611205107, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 17057 + }, + { + "epoch": 0.1487676824056793, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 17058 + }, + { + "epoch": 0.14877640369084788, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 17059 + }, + { + "epoch": 0.14878512497601645, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 17060 + }, + { + "epoch": 0.14879384626118505, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 17061 + }, + { + "epoch": 0.14880256754635363, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 17062 + }, + { + "epoch": 0.14881128883152223, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 17063 + }, + { + "epoch": 0.1488200101166908, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 17064 + }, + { + "epoch": 0.14882873140185937, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 17065 + }, + { + "epoch": 0.14883745268702797, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 17066 + }, + { + "epoch": 0.14884617397219654, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 17067 + }, + { + "epoch": 0.14885489525736512, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 17068 + }, + { + "epoch": 0.14886361654253372, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 17069 + }, + { + "epoch": 0.1488723378277023, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 17070 + }, + { + "epoch": 0.14888105911287086, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 17071 + }, + { + "epoch": 0.14888978039803946, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 17072 + }, + { + "epoch": 0.14889850168320803, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 17073 + }, + { + "epoch": 0.1489072229683766, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 17074 + }, + { + "epoch": 0.1489159442535452, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 17075 + }, + { + "epoch": 0.14892466553871378, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 17076 + }, + { + "epoch": 0.14893338682388238, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 17077 + }, + { + "epoch": 0.14894210810905095, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 17078 + }, + { + "epoch": 0.14895082939421952, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 17079 + }, + { + "epoch": 0.14895955067938813, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 17080 + }, + { + "epoch": 0.1489682719645567, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 17081 + }, + { + "epoch": 0.14897699324972527, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 17082 + }, + { + "epoch": 0.14898571453489387, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 17083 + }, + { + "epoch": 0.14899443582006244, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 17084 + }, + { + "epoch": 0.14900315710523102, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 17085 + }, + { + "epoch": 0.14901187839039962, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 17086 + }, + { + "epoch": 0.1490205996755682, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 17087 + }, + { + "epoch": 0.14902932096073676, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 17088 + }, + { + "epoch": 0.14903804224590536, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 17089 + }, + { + "epoch": 0.14904676353107393, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 17090 + }, + { + "epoch": 0.14905548481624253, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 17091 + }, + { + "epoch": 0.1490642061014111, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 17092 + }, + { + "epoch": 0.14907292738657968, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 17093 + }, + { + "epoch": 0.14908164867174828, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 17094 + }, + { + "epoch": 0.14909036995691685, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 17095 + }, + { + "epoch": 0.14909909124208542, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 17096 + }, + { + "epoch": 0.14910781252725402, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 17097 + }, + { + "epoch": 0.1491165338124226, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 17098 + }, + { + "epoch": 0.14912525509759117, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 17099 + }, + { + "epoch": 0.14913397638275977, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 17100 + }, + { + "epoch": 0.14914269766792834, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 17101 + }, + { + "epoch": 0.14915141895309691, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 17102 + }, + { + "epoch": 0.14916014023826552, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 17103 + }, + { + "epoch": 0.1491688615234341, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 17104 + }, + { + "epoch": 0.1491775828086027, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 17105 + }, + { + "epoch": 0.14918630409377126, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 17106 + }, + { + "epoch": 0.14919502537893983, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 17107 + }, + { + "epoch": 0.14920374666410843, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 17108 + }, + { + "epoch": 0.149212467949277, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 17109 + }, + { + "epoch": 0.14922118923444558, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 17110 + }, + { + "epoch": 0.14922991051961418, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 17111 + }, + { + "epoch": 0.14923863180478275, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 17112 + }, + { + "epoch": 0.14924735308995132, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 17113 + }, + { + "epoch": 0.14925607437511992, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 17114 + }, + { + "epoch": 0.1492647956602885, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 17115 + }, + { + "epoch": 0.14927351694545707, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 17116 + }, + { + "epoch": 0.14928223823062567, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 17117 + }, + { + "epoch": 0.14929095951579424, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 17118 + }, + { + "epoch": 0.14929968080096284, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 17119 + }, + { + "epoch": 0.14930840208613141, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 17120 + }, + { + "epoch": 0.1493171233713, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 17121 + }, + { + "epoch": 0.1493258446564686, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 17122 + }, + { + "epoch": 0.14933456594163716, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 17123 + }, + { + "epoch": 0.14934328722680573, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 17124 + }, + { + "epoch": 0.14935200851197433, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 17125 + }, + { + "epoch": 0.1493607297971429, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 17126 + }, + { + "epoch": 0.14936945108231148, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 17127 + }, + { + "epoch": 0.14937817236748008, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 17128 + }, + { + "epoch": 0.14938689365264865, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 17129 + }, + { + "epoch": 0.14939561493781725, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 17130 + }, + { + "epoch": 0.14940433622298582, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 17131 + }, + { + "epoch": 0.1494130575081544, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 17132 + }, + { + "epoch": 0.149421778793323, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 17133 + }, + { + "epoch": 0.14943050007849157, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 17134 + }, + { + "epoch": 0.14943922136366014, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 17135 + }, + { + "epoch": 0.14944794264882874, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 17136 + }, + { + "epoch": 0.1494566639339973, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 17137 + }, + { + "epoch": 0.14946538521916589, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 17138 + }, + { + "epoch": 0.1494741065043345, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 17139 + }, + { + "epoch": 0.14948282778950306, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 17140 + }, + { + "epoch": 0.14949154907467163, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 17141 + }, + { + "epoch": 0.14950027035984023, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 17142 + }, + { + "epoch": 0.1495089916450088, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 17143 + }, + { + "epoch": 0.1495177129301774, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 17144 + }, + { + "epoch": 0.14952643421534598, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 17145 + }, + { + "epoch": 0.14953515550051455, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 17146 + }, + { + "epoch": 0.14954387678568315, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 17147 + }, + { + "epoch": 0.14955259807085172, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 17148 + }, + { + "epoch": 0.1495613193560203, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 17149 + }, + { + "epoch": 0.1495700406411889, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 17150 + }, + { + "epoch": 0.14957876192635747, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 17151 + }, + { + "epoch": 0.14958748321152604, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 17152 + }, + { + "epoch": 0.14959620449669464, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 17153 + }, + { + "epoch": 0.1496049257818632, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 17154 + }, + { + "epoch": 0.14961364706703179, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 17155 + }, + { + "epoch": 0.14962236835220039, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 17156 + }, + { + "epoch": 0.14963108963736896, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 17157 + }, + { + "epoch": 0.14963981092253756, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 17158 + }, + { + "epoch": 0.14964853220770613, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 17159 + }, + { + "epoch": 0.1496572534928747, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 17160 + }, + { + "epoch": 0.1496659747780433, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 17161 + }, + { + "epoch": 0.14967469606321188, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 17162 + }, + { + "epoch": 0.14968341734838045, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 17163 + }, + { + "epoch": 0.14969213863354905, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 17164 + }, + { + "epoch": 0.14970085991871762, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 17165 + }, + { + "epoch": 0.1497095812038862, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 17166 + }, + { + "epoch": 0.1497183024890548, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 17167 + }, + { + "epoch": 0.14972702377422337, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 17168 + }, + { + "epoch": 0.14973574505939194, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 17169 + }, + { + "epoch": 0.14974446634456054, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 17170 + }, + { + "epoch": 0.1497531876297291, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 17171 + }, + { + "epoch": 0.1497619089148977, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 17172 + }, + { + "epoch": 0.14977063020006628, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 17173 + }, + { + "epoch": 0.14977935148523486, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 17174 + }, + { + "epoch": 0.14978807277040346, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 17175 + }, + { + "epoch": 0.14979679405557203, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 17176 + }, + { + "epoch": 0.1498055153407406, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 17177 + }, + { + "epoch": 0.1498142366259092, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 17178 + }, + { + "epoch": 0.14982295791107778, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 17179 + }, + { + "epoch": 0.14983167919624635, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 17180 + }, + { + "epoch": 0.14984040048141495, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 17181 + }, + { + "epoch": 0.14984912176658352, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 17182 + }, + { + "epoch": 0.1498578430517521, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 17183 + }, + { + "epoch": 0.1498665643369207, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 17184 + }, + { + "epoch": 0.14987528562208927, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 17185 + }, + { + "epoch": 0.14988400690725787, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 17186 + }, + { + "epoch": 0.14989272819242644, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 17187 + }, + { + "epoch": 0.149901449477595, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 17188 + }, + { + "epoch": 0.1499101707627636, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 17189 + }, + { + "epoch": 0.14991889204793218, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 17190 + }, + { + "epoch": 0.14992761333310076, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 17191 + }, + { + "epoch": 0.14993633461826936, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 17192 + }, + { + "epoch": 0.14994505590343793, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 17193 + }, + { + "epoch": 0.1499537771886065, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 17194 + }, + { + "epoch": 0.1499624984737751, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 17195 + }, + { + "epoch": 0.14997121975894367, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 17196 + }, + { + "epoch": 0.14997994104411225, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 17197 + }, + { + "epoch": 0.14998866232928085, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 17198 + }, + { + "epoch": 0.14999738361444942, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 17199 + }, + { + "epoch": 0.15000610489961802, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 17200 + }, + { + "epoch": 0.1500148261847866, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 17201 + }, + { + "epoch": 0.15002354746995517, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 17202 + }, + { + "epoch": 0.15003226875512377, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 17203 + }, + { + "epoch": 0.15004099004029234, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 17204 + }, + { + "epoch": 0.1500497113254609, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 17205 + }, + { + "epoch": 0.1500584326106295, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 17206 + }, + { + "epoch": 0.15006715389579808, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 17207 + }, + { + "epoch": 0.15007587518096666, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 17208 + }, + { + "epoch": 0.15008459646613526, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 17209 + }, + { + "epoch": 0.15009331775130383, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 17210 + }, + { + "epoch": 0.1501020390364724, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 17211 + }, + { + "epoch": 0.150110760321641, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 17212 + }, + { + "epoch": 0.15011948160680957, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 17213 + }, + { + "epoch": 0.15012820289197817, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 17214 + }, + { + "epoch": 0.15013692417714675, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 17215 + }, + { + "epoch": 0.15014564546231532, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 17216 + }, + { + "epoch": 0.15015436674748392, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 17217 + }, + { + "epoch": 0.1501630880326525, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 17218 + }, + { + "epoch": 0.15017180931782106, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 17219 + }, + { + "epoch": 0.15018053060298966, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 17220 + }, + { + "epoch": 0.15018925188815824, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 17221 + }, + { + "epoch": 0.1501979731733268, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 17222 + }, + { + "epoch": 0.1502066944584954, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 17223 + }, + { + "epoch": 0.15021541574366398, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 17224 + }, + { + "epoch": 0.15022413702883256, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 17225 + }, + { + "epoch": 0.15023285831400116, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 17226 + }, + { + "epoch": 0.15024157959916973, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 17227 + }, + { + "epoch": 0.15025030088433833, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 17228 + }, + { + "epoch": 0.1502590221695069, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 17229 + }, + { + "epoch": 0.15026774345467547, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 17230 + }, + { + "epoch": 0.15027646473984407, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 17231 + }, + { + "epoch": 0.15028518602501265, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 17232 + }, + { + "epoch": 0.15029390731018122, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 17233 + }, + { + "epoch": 0.15030262859534982, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 17234 + }, + { + "epoch": 0.1503113498805184, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 17235 + }, + { + "epoch": 0.15032007116568696, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 17236 + }, + { + "epoch": 0.15032879245085556, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 17237 + }, + { + "epoch": 0.15033751373602414, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 17238 + }, + { + "epoch": 0.15034623502119274, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 17239 + }, + { + "epoch": 0.1503549563063613, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 17240 + }, + { + "epoch": 0.15036367759152988, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 17241 + }, + { + "epoch": 0.15037239887669848, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 17242 + }, + { + "epoch": 0.15038112016186705, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 17243 + }, + { + "epoch": 0.15038984144703563, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 17244 + }, + { + "epoch": 0.15039856273220423, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 17245 + }, + { + "epoch": 0.1504072840173728, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 17246 + }, + { + "epoch": 0.15041600530254137, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 17247 + }, + { + "epoch": 0.15042472658770997, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 17248 + }, + { + "epoch": 0.15043344787287855, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 17249 + }, + { + "epoch": 0.15044216915804712, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 17250 + }, + { + "epoch": 0.15045089044321572, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 17251 + }, + { + "epoch": 0.1504596117283843, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 17252 + }, + { + "epoch": 0.1504683330135529, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 17253 + }, + { + "epoch": 0.15047705429872146, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 17254 + }, + { + "epoch": 0.15048577558389004, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 17255 + }, + { + "epoch": 0.15049449686905864, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 17256 + }, + { + "epoch": 0.1505032181542272, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 17257 + }, + { + "epoch": 0.15051193943939578, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 17258 + }, + { + "epoch": 0.15052066072456438, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 17259 + }, + { + "epoch": 0.15052938200973295, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 17260 + }, + { + "epoch": 0.15053810329490153, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 17261 + }, + { + "epoch": 0.15054682458007013, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 17262 + }, + { + "epoch": 0.1505555458652387, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 17263 + }, + { + "epoch": 0.15056426715040727, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 17264 + }, + { + "epoch": 0.15057298843557587, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 17265 + }, + { + "epoch": 0.15058170972074444, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 17266 + }, + { + "epoch": 0.15059043100591304, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 17267 + }, + { + "epoch": 0.15059915229108162, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 17268 + }, + { + "epoch": 0.1506078735762502, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 17269 + }, + { + "epoch": 0.1506165948614188, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 17270 + }, + { + "epoch": 0.15062531614658736, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 17271 + }, + { + "epoch": 0.15063403743175593, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 17272 + }, + { + "epoch": 0.15064275871692454, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 17273 + }, + { + "epoch": 0.1506514800020931, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 17274 + }, + { + "epoch": 0.15066020128726168, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 17275 + }, + { + "epoch": 0.15066892257243028, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 17276 + }, + { + "epoch": 0.15067764385759885, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 17277 + }, + { + "epoch": 0.15068636514276743, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 17278 + }, + { + "epoch": 0.15069508642793603, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 17279 + }, + { + "epoch": 0.1507038077131046, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 17280 + }, + { + "epoch": 0.1507125289982732, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 17281 + }, + { + "epoch": 0.15072125028344177, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 17282 + }, + { + "epoch": 0.15072997156861034, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 17283 + }, + { + "epoch": 0.15073869285377894, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 17284 + }, + { + "epoch": 0.15074741413894752, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 17285 + }, + { + "epoch": 0.1507561354241161, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 17286 + }, + { + "epoch": 0.1507648567092847, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 17287 + }, + { + "epoch": 0.15077357799445326, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 17288 + }, + { + "epoch": 0.15078229927962183, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 17289 + }, + { + "epoch": 0.15079102056479043, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 17290 + }, + { + "epoch": 0.150799741849959, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 17291 + }, + { + "epoch": 0.15080846313512758, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 17292 + }, + { + "epoch": 0.15081718442029618, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 17293 + }, + { + "epoch": 0.15082590570546475, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 17294 + }, + { + "epoch": 0.15083462699063335, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 17295 + }, + { + "epoch": 0.15084334827580193, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 17296 + }, + { + "epoch": 0.1508520695609705, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 17297 + }, + { + "epoch": 0.1508607908461391, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 17298 + }, + { + "epoch": 0.15086951213130767, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 17299 + }, + { + "epoch": 0.15087823341647624, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 17300 + }, + { + "epoch": 0.15088695470164484, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 17301 + }, + { + "epoch": 0.15089567598681342, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 17302 + }, + { + "epoch": 0.150904397271982, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 17303 + }, + { + "epoch": 0.1509131185571506, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 17304 + }, + { + "epoch": 0.15092183984231916, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 17305 + }, + { + "epoch": 0.15093056112748773, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 17306 + }, + { + "epoch": 0.15093928241265633, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 17307 + }, + { + "epoch": 0.1509480036978249, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 17308 + }, + { + "epoch": 0.1509567249829935, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 17309 + }, + { + "epoch": 0.15096544626816208, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 17310 + }, + { + "epoch": 0.15097416755333065, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 17311 + }, + { + "epoch": 0.15098288883849925, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 17312 + }, + { + "epoch": 0.15099161012366782, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 17313 + }, + { + "epoch": 0.1510003314088364, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 17314 + }, + { + "epoch": 0.151009052694005, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 17315 + }, + { + "epoch": 0.15101777397917357, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 17316 + }, + { + "epoch": 0.15102649526434214, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 17317 + }, + { + "epoch": 0.15103521654951074, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 17318 + }, + { + "epoch": 0.15104393783467931, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 17319 + }, + { + "epoch": 0.1510526591198479, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 17320 + }, + { + "epoch": 0.1510613804050165, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 17321 + }, + { + "epoch": 0.15107010169018506, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 17322 + }, + { + "epoch": 0.15107882297535366, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 17323 + }, + { + "epoch": 0.15108754426052223, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 17324 + }, + { + "epoch": 0.1510962655456908, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 17325 + }, + { + "epoch": 0.1511049868308594, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 17326 + }, + { + "epoch": 0.15111370811602798, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 17327 + }, + { + "epoch": 0.15112242940119655, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 17328 + }, + { + "epoch": 0.15113115068636515, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 17329 + }, + { + "epoch": 0.15113987197153372, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 17330 + }, + { + "epoch": 0.1511485932567023, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 17331 + }, + { + "epoch": 0.1511573145418709, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 17332 + }, + { + "epoch": 0.15116603582703947, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 17333 + }, + { + "epoch": 0.15117475711220804, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 17334 + }, + { + "epoch": 0.15118347839737664, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 17335 + }, + { + "epoch": 0.15119219968254521, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 17336 + }, + { + "epoch": 0.15120092096771381, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 17337 + }, + { + "epoch": 0.1512096422528824, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 17338 + }, + { + "epoch": 0.15121836353805096, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 17339 + }, + { + "epoch": 0.15122708482321956, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 17340 + }, + { + "epoch": 0.15123580610838813, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 17341 + }, + { + "epoch": 0.1512445273935567, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 17342 + }, + { + "epoch": 0.1512532486787253, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 17343 + }, + { + "epoch": 0.15126196996389388, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 17344 + }, + { + "epoch": 0.15127069124906245, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 17345 + }, + { + "epoch": 0.15127941253423105, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 17346 + }, + { + "epoch": 0.15128813381939962, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 17347 + }, + { + "epoch": 0.1512968551045682, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 17348 + }, + { + "epoch": 0.1513055763897368, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 17349 + }, + { + "epoch": 0.15131429767490537, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 17350 + }, + { + "epoch": 0.15132301896007397, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 17351 + }, + { + "epoch": 0.15133174024524254, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 17352 + }, + { + "epoch": 0.1513404615304111, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 17353 + }, + { + "epoch": 0.1513491828155797, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 17354 + }, + { + "epoch": 0.1513579041007483, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 17355 + }, + { + "epoch": 0.15136662538591686, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 17356 + }, + { + "epoch": 0.15137534667108546, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 17357 + }, + { + "epoch": 0.15138406795625403, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 17358 + }, + { + "epoch": 0.1513927892414226, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 17359 + }, + { + "epoch": 0.1514015105265912, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 17360 + }, + { + "epoch": 0.15141023181175978, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 17361 + }, + { + "epoch": 0.15141895309692838, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 17362 + }, + { + "epoch": 0.15142767438209695, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 17363 + }, + { + "epoch": 0.15143639566726552, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 17364 + }, + { + "epoch": 0.15144511695243412, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 17365 + }, + { + "epoch": 0.1514538382376027, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 17366 + }, + { + "epoch": 0.15146255952277127, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 17367 + }, + { + "epoch": 0.15147128080793987, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 17368 + }, + { + "epoch": 0.15148000209310844, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 17369 + }, + { + "epoch": 0.151488723378277, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 17370 + }, + { + "epoch": 0.1514974446634456, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 17371 + }, + { + "epoch": 0.15150616594861419, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 17372 + }, + { + "epoch": 0.15151488723378276, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 17373 + }, + { + "epoch": 0.15152360851895136, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 17374 + }, + { + "epoch": 0.15153232980411993, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 17375 + }, + { + "epoch": 0.15154105108928853, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 17376 + }, + { + "epoch": 0.1515497723744571, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 17377 + }, + { + "epoch": 0.15155849365962568, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 17378 + }, + { + "epoch": 0.15156721494479428, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 17379 + }, + { + "epoch": 0.15157593622996285, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 17380 + }, + { + "epoch": 0.15158465751513142, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 17381 + }, + { + "epoch": 0.15159337880030002, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 17382 + }, + { + "epoch": 0.1516021000854686, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 17383 + }, + { + "epoch": 0.15161082137063717, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 17384 + }, + { + "epoch": 0.15161954265580577, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 17385 + }, + { + "epoch": 0.15162826394097434, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 17386 + }, + { + "epoch": 0.1516369852261429, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 17387 + }, + { + "epoch": 0.1516457065113115, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 17388 + }, + { + "epoch": 0.15165442779648008, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 17389 + }, + { + "epoch": 0.15166314908164868, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 17390 + }, + { + "epoch": 0.15167187036681726, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 17391 + }, + { + "epoch": 0.15168059165198583, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 17392 + }, + { + "epoch": 0.15168931293715443, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 17393 + }, + { + "epoch": 0.151698034222323, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 17394 + }, + { + "epoch": 0.15170675550749158, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 17395 + }, + { + "epoch": 0.15171547679266018, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 17396 + }, + { + "epoch": 0.15172419807782875, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 17397 + }, + { + "epoch": 0.15173291936299732, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 17398 + }, + { + "epoch": 0.15174164064816592, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 17399 + }, + { + "epoch": 0.1517503619333345, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 17400 + }, + { + "epoch": 0.15175908321850307, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 17401 + }, + { + "epoch": 0.15176780450367167, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 17402 + }, + { + "epoch": 0.15177652578884024, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 17403 + }, + { + "epoch": 0.15178524707400884, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 17404 + }, + { + "epoch": 0.1517939683591774, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 17405 + }, + { + "epoch": 0.15180268964434598, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 17406 + }, + { + "epoch": 0.15181141092951458, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 17407 + }, + { + "epoch": 0.15182013221468316, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 17408 + }, + { + "epoch": 0.15182885349985173, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 17409 + }, + { + "epoch": 0.15183757478502033, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 17410 + }, + { + "epoch": 0.1518462960701889, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 17411 + }, + { + "epoch": 0.15185501735535747, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 17412 + }, + { + "epoch": 0.15186373864052607, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 17413 + }, + { + "epoch": 0.15187245992569465, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 17414 + }, + { + "epoch": 0.15188118121086322, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 17415 + }, + { + "epoch": 0.15188990249603182, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 17416 + }, + { + "epoch": 0.1518986237812004, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 17417 + }, + { + "epoch": 0.151907345066369, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 17418 + }, + { + "epoch": 0.15191606635153757, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 17419 + }, + { + "epoch": 0.15192478763670614, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 17420 + }, + { + "epoch": 0.15193350892187474, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 17421 + }, + { + "epoch": 0.1519422302070433, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 17422 + }, + { + "epoch": 0.15195095149221188, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 17423 + }, + { + "epoch": 0.15195967277738048, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 17424 + }, + { + "epoch": 0.15196839406254906, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 17425 + }, + { + "epoch": 0.15197711534771763, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 17426 + }, + { + "epoch": 0.15198583663288623, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 17427 + }, + { + "epoch": 0.1519945579180548, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 17428 + }, + { + "epoch": 0.15200327920322337, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 17429 + }, + { + "epoch": 0.15201200048839197, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 17430 + }, + { + "epoch": 0.15202072177356055, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 17431 + }, + { + "epoch": 0.15202944305872915, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 17432 + }, + { + "epoch": 0.15203816434389772, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 17433 + }, + { + "epoch": 0.1520468856290663, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 17434 + }, + { + "epoch": 0.1520556069142349, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 17435 + }, + { + "epoch": 0.15206432819940346, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 17436 + }, + { + "epoch": 0.15207304948457204, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 17437 + }, + { + "epoch": 0.15208177076974064, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 17438 + }, + { + "epoch": 0.1520904920549092, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 17439 + }, + { + "epoch": 0.15209921334007778, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 17440 + }, + { + "epoch": 0.15210793462524638, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 17441 + }, + { + "epoch": 0.15211665591041496, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 17442 + }, + { + "epoch": 0.15212537719558353, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 17443 + }, + { + "epoch": 0.15213409848075213, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 17444 + }, + { + "epoch": 0.1521428197659207, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 17445 + }, + { + "epoch": 0.1521515410510893, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 17446 + }, + { + "epoch": 0.15216026233625787, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 17447 + }, + { + "epoch": 0.15216898362142645, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 17448 + }, + { + "epoch": 0.15217770490659505, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 17449 + }, + { + "epoch": 0.15218642619176362, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 17450 + }, + { + "epoch": 0.1521951474769322, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 17451 + }, + { + "epoch": 0.1522038687621008, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 17452 + }, + { + "epoch": 0.15221259004726936, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 17453 + }, + { + "epoch": 0.15222131133243794, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 17454 + }, + { + "epoch": 0.15223003261760654, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 17455 + }, + { + "epoch": 0.1522387539027751, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 17456 + }, + { + "epoch": 0.15224747518794368, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 17457 + }, + { + "epoch": 0.15225619647311228, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 17458 + }, + { + "epoch": 0.15226491775828085, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 17459 + }, + { + "epoch": 0.15227363904344945, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 17460 + }, + { + "epoch": 0.15228236032861803, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 17461 + }, + { + "epoch": 0.1522910816137866, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 17462 + }, + { + "epoch": 0.1522998028989552, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 17463 + }, + { + "epoch": 0.15230852418412377, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 17464 + }, + { + "epoch": 0.15231724546929234, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 17465 + }, + { + "epoch": 0.15232596675446095, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 17466 + }, + { + "epoch": 0.15233468803962952, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 17467 + }, + { + "epoch": 0.1523434093247981, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 17468 + }, + { + "epoch": 0.1523521306099667, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 17469 + }, + { + "epoch": 0.15236085189513526, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 17470 + }, + { + "epoch": 0.15236957318030386, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 17471 + }, + { + "epoch": 0.15237829446547244, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 17472 + }, + { + "epoch": 0.152387015750641, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 17473 + }, + { + "epoch": 0.1523957370358096, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 17474 + }, + { + "epoch": 0.15240445832097818, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 17475 + }, + { + "epoch": 0.15241317960614675, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 17476 + }, + { + "epoch": 0.15242190089131535, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 17477 + }, + { + "epoch": 0.15243062217648393, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 17478 + }, + { + "epoch": 0.1524393434616525, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 17479 + }, + { + "epoch": 0.1524480647468211, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 17480 + }, + { + "epoch": 0.15245678603198967, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 17481 + }, + { + "epoch": 0.15246550731715824, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 17482 + }, + { + "epoch": 0.15247422860232684, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 17483 + }, + { + "epoch": 0.15248294988749542, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 17484 + }, + { + "epoch": 0.15249167117266402, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 17485 + }, + { + "epoch": 0.1525003924578326, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 17486 + }, + { + "epoch": 0.15250911374300116, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 17487 + }, + { + "epoch": 0.15251783502816976, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 17488 + }, + { + "epoch": 0.15252655631333834, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 17489 + }, + { + "epoch": 0.1525352775985069, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 17490 + }, + { + "epoch": 0.1525439988836755, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 17491 + }, + { + "epoch": 0.15255272016884408, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 17492 + }, + { + "epoch": 0.15256144145401265, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 17493 + }, + { + "epoch": 0.15257016273918125, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 17494 + }, + { + "epoch": 0.15257888402434983, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 17495 + }, + { + "epoch": 0.1525876053095184, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 17496 + }, + { + "epoch": 0.152596326594687, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 17497 + }, + { + "epoch": 0.15260504787985557, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 17498 + }, + { + "epoch": 0.15261376916502417, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 17499 + }, + { + "epoch": 0.15262249045019274, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 17500 + }, + { + "epoch": 0.15263121173536132, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 17501 + }, + { + "epoch": 0.15263993302052992, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 17502 + }, + { + "epoch": 0.1526486543056985, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 17503 + }, + { + "epoch": 0.15265737559086706, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 17504 + }, + { + "epoch": 0.15266609687603566, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 17505 + }, + { + "epoch": 0.15267481816120423, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 17506 + }, + { + "epoch": 0.1526835394463728, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 17507 + }, + { + "epoch": 0.1526922607315414, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 17508 + }, + { + "epoch": 0.15270098201670998, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 17509 + }, + { + "epoch": 0.15270970330187855, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 17510 + }, + { + "epoch": 0.15271842458704715, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 17511 + }, + { + "epoch": 0.15272714587221572, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 17512 + }, + { + "epoch": 0.15273586715738433, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 17513 + }, + { + "epoch": 0.1527445884425529, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 17514 + }, + { + "epoch": 0.15275330972772147, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 17515 + }, + { + "epoch": 0.15276203101289007, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 17516 + }, + { + "epoch": 0.15277075229805864, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 17517 + }, + { + "epoch": 0.15277947358322722, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 17518 + }, + { + "epoch": 0.15278819486839582, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 17519 + }, + { + "epoch": 0.1527969161535644, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 17520 + }, + { + "epoch": 0.15280563743873296, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 17521 + }, + { + "epoch": 0.15281435872390156, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 17522 + }, + { + "epoch": 0.15282308000907013, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 17523 + }, + { + "epoch": 0.1528318012942387, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 17524 + }, + { + "epoch": 0.1528405225794073, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 17525 + }, + { + "epoch": 0.15284924386457588, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 17526 + }, + { + "epoch": 0.15285796514974448, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 17527 + }, + { + "epoch": 0.15286668643491305, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 17528 + }, + { + "epoch": 0.15287540772008162, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 17529 + }, + { + "epoch": 0.15288412900525022, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 17530 + }, + { + "epoch": 0.1528928502904188, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 17531 + }, + { + "epoch": 0.15290157157558737, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 17532 + }, + { + "epoch": 0.15291029286075597, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 17533 + }, + { + "epoch": 0.15291901414592454, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 17534 + }, + { + "epoch": 0.15292773543109311, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 17535 + }, + { + "epoch": 0.15293645671626172, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 17536 + }, + { + "epoch": 0.1529451780014303, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 17537 + }, + { + "epoch": 0.15295389928659886, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 17538 + }, + { + "epoch": 0.15296262057176746, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 17539 + }, + { + "epoch": 0.15297134185693603, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 17540 + }, + { + "epoch": 0.15298006314210463, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 17541 + }, + { + "epoch": 0.1529887844272732, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 17542 + }, + { + "epoch": 0.15299750571244178, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 17543 + }, + { + "epoch": 0.15300622699761038, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 17544 + }, + { + "epoch": 0.15301494828277895, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 17545 + }, + { + "epoch": 0.15302366956794752, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 17546 + }, + { + "epoch": 0.15303239085311612, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 17547 + }, + { + "epoch": 0.1530411121382847, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 17548 + }, + { + "epoch": 0.15304983342345327, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 17549 + }, + { + "epoch": 0.15305855470862187, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 17550 + }, + { + "epoch": 0.15306727599379044, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 17551 + }, + { + "epoch": 0.15307599727895901, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 17552 + }, + { + "epoch": 0.15308471856412761, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 17553 + }, + { + "epoch": 0.1530934398492962, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 17554 + }, + { + "epoch": 0.1531021611344648, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 17555 + }, + { + "epoch": 0.15311088241963336, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 17556 + }, + { + "epoch": 0.15311960370480193, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 17557 + }, + { + "epoch": 0.15312832498997053, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 17558 + }, + { + "epoch": 0.1531370462751391, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 17559 + }, + { + "epoch": 0.15314576756030768, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 17560 + }, + { + "epoch": 0.15315448884547628, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 17561 + }, + { + "epoch": 0.15316321013064485, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 17562 + }, + { + "epoch": 0.15317193141581342, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 17563 + }, + { + "epoch": 0.15318065270098202, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 17564 + }, + { + "epoch": 0.1531893739861506, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 17565 + }, + { + "epoch": 0.15319809527131917, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 17566 + }, + { + "epoch": 0.15320681655648777, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 17567 + }, + { + "epoch": 0.15321553784165634, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 17568 + }, + { + "epoch": 0.15322425912682494, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 17569 + }, + { + "epoch": 0.1532329804119935, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 17570 + }, + { + "epoch": 0.15324170169716209, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 17571 + }, + { + "epoch": 0.1532504229823307, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 17572 + }, + { + "epoch": 0.15325914426749926, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 17573 + }, + { + "epoch": 0.15326786555266783, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 17574 + }, + { + "epoch": 0.15327658683783643, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 17575 + }, + { + "epoch": 0.153285308123005, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 17576 + }, + { + "epoch": 0.15329402940817358, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 17577 + }, + { + "epoch": 0.15330275069334218, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 17578 + }, + { + "epoch": 0.15331147197851075, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 17579 + }, + { + "epoch": 0.15332019326367935, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 17580 + }, + { + "epoch": 0.15332891454884792, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 17581 + }, + { + "epoch": 0.1533376358340165, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 17582 + }, + { + "epoch": 0.1533463571191851, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 17583 + }, + { + "epoch": 0.15335507840435367, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 17584 + }, + { + "epoch": 0.15336379968952224, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 17585 + }, + { + "epoch": 0.15337252097469084, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 17586 + }, + { + "epoch": 0.1533812422598594, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 17587 + }, + { + "epoch": 0.15338996354502799, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 17588 + }, + { + "epoch": 0.15339868483019659, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 17589 + }, + { + "epoch": 0.15340740611536516, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 17590 + }, + { + "epoch": 0.15341612740053373, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 17591 + }, + { + "epoch": 0.15342484868570233, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 17592 + }, + { + "epoch": 0.1534335699708709, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 17593 + }, + { + "epoch": 0.1534422912560395, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 17594 + }, + { + "epoch": 0.15345101254120808, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 17595 + }, + { + "epoch": 0.15345973382637665, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 17596 + }, + { + "epoch": 0.15346845511154525, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 17597 + }, + { + "epoch": 0.15347717639671382, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 17598 + }, + { + "epoch": 0.1534858976818824, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 17599 + }, + { + "epoch": 0.153494618967051, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 17600 + }, + { + "epoch": 0.15350334025221957, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 17601 + }, + { + "epoch": 0.15351206153738814, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 17602 + }, + { + "epoch": 0.15352078282255674, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 17603 + }, + { + "epoch": 0.1535295041077253, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 17604 + }, + { + "epoch": 0.15353822539289388, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 17605 + }, + { + "epoch": 0.15354694667806248, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 17606 + }, + { + "epoch": 0.15355566796323106, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 17607 + }, + { + "epoch": 0.15356438924839966, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 17608 + }, + { + "epoch": 0.15357311053356823, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 17609 + }, + { + "epoch": 0.1535818318187368, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 17610 + }, + { + "epoch": 0.1535905531039054, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 17611 + }, + { + "epoch": 0.15359927438907398, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 17612 + }, + { + "epoch": 0.15360799567424255, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 17613 + }, + { + "epoch": 0.15361671695941115, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 17614 + }, + { + "epoch": 0.15362543824457972, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 17615 + }, + { + "epoch": 0.1536341595297483, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 17616 + }, + { + "epoch": 0.1536428808149169, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 17617 + }, + { + "epoch": 0.15365160210008547, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 17618 + }, + { + "epoch": 0.15366032338525404, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 17619 + }, + { + "epoch": 0.15366904467042264, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 17620 + }, + { + "epoch": 0.1536777659555912, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 17621 + }, + { + "epoch": 0.1536864872407598, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 17622 + }, + { + "epoch": 0.15369520852592838, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 17623 + }, + { + "epoch": 0.15370392981109696, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 17624 + }, + { + "epoch": 0.15371265109626556, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 17625 + }, + { + "epoch": 0.15372137238143413, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 17626 + }, + { + "epoch": 0.1537300936666027, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 17627 + }, + { + "epoch": 0.1537388149517713, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 17628 + }, + { + "epoch": 0.15374753623693987, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 17629 + }, + { + "epoch": 0.15375625752210845, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 17630 + }, + { + "epoch": 0.15376497880727705, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 17631 + }, + { + "epoch": 0.15377370009244562, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 17632 + }, + { + "epoch": 0.1537824213776142, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 17633 + }, + { + "epoch": 0.1537911426627828, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 17634 + }, + { + "epoch": 0.15379986394795137, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 17635 + }, + { + "epoch": 0.15380858523311997, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 17636 + }, + { + "epoch": 0.15381730651828854, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 17637 + }, + { + "epoch": 0.1538260278034571, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 17638 + }, + { + "epoch": 0.1538347490886257, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 17639 + }, + { + "epoch": 0.15384347037379428, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 17640 + }, + { + "epoch": 0.15385219165896286, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 17641 + }, + { + "epoch": 0.15386091294413146, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 17642 + }, + { + "epoch": 0.15386963422930003, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 17643 + }, + { + "epoch": 0.1538783555144686, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 17644 + }, + { + "epoch": 0.1538870767996372, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 17645 + }, + { + "epoch": 0.15389579808480577, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 17646 + }, + { + "epoch": 0.15390451936997435, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 17647 + }, + { + "epoch": 0.15391324065514295, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 17648 + }, + { + "epoch": 0.15392196194031152, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 17649 + }, + { + "epoch": 0.15393068322548012, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 17650 + }, + { + "epoch": 0.1539394045106487, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 17651 + }, + { + "epoch": 0.15394812579581726, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 17652 + }, + { + "epoch": 0.15395684708098586, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 17653 + }, + { + "epoch": 0.15396556836615444, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 17654 + }, + { + "epoch": 0.153974289651323, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 17655 + }, + { + "epoch": 0.1539830109364916, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 17656 + }, + { + "epoch": 0.15399173222166018, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 17657 + }, + { + "epoch": 0.15400045350682876, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 17658 + }, + { + "epoch": 0.15400917479199736, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 17659 + }, + { + "epoch": 0.15401789607716593, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 17660 + }, + { + "epoch": 0.1540266173623345, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 17661 + }, + { + "epoch": 0.1540353386475031, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 17662 + }, + { + "epoch": 0.15404405993267167, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 17663 + }, + { + "epoch": 0.15405278121784027, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 17664 + }, + { + "epoch": 0.15406150250300885, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 17665 + }, + { + "epoch": 0.15407022378817742, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 17666 + }, + { + "epoch": 0.15407894507334602, + "grad_norm": 0.357421875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 17667 + }, + { + "epoch": 0.1540876663585146, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 17668 + }, + { + "epoch": 0.15409638764368316, + "grad_norm": 0.390625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 17669 + }, + { + "epoch": 0.15410510892885176, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 17670 + }, + { + "epoch": 0.15411383021402034, + "grad_norm": 0.35546875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 17671 + }, + { + "epoch": 0.1541225514991889, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 17672 + }, + { + "epoch": 0.1541312727843575, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 17673 + }, + { + "epoch": 0.15413999406952608, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 17674 + }, + { + "epoch": 0.15414871535469465, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 17675 + }, + { + "epoch": 0.15415743663986325, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 17676 + }, + { + "epoch": 0.15416615792503183, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 17677 + }, + { + "epoch": 0.15417487921020043, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 17678 + }, + { + "epoch": 0.154183600495369, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 17679 + }, + { + "epoch": 0.15419232178053757, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 17680 + }, + { + "epoch": 0.15420104306570617, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 17681 + }, + { + "epoch": 0.15420976435087475, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 17682 + }, + { + "epoch": 0.15421848563604332, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 17683 + }, + { + "epoch": 0.15422720692121192, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 17684 + }, + { + "epoch": 0.1542359282063805, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 17685 + }, + { + "epoch": 0.15424464949154906, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 17686 + }, + { + "epoch": 0.15425337077671766, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 17687 + }, + { + "epoch": 0.15426209206188624, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 17688 + }, + { + "epoch": 0.1542708133470548, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 17689 + }, + { + "epoch": 0.1542795346322234, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 17690 + }, + { + "epoch": 0.15428825591739198, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 17691 + }, + { + "epoch": 0.15429697720256058, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 17692 + }, + { + "epoch": 0.15430569848772915, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 17693 + }, + { + "epoch": 0.15431441977289773, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 17694 + }, + { + "epoch": 0.15432314105806633, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 17695 + }, + { + "epoch": 0.1543318623432349, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 17696 + }, + { + "epoch": 0.15434058362840347, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 17697 + }, + { + "epoch": 0.15434930491357207, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 17698 + }, + { + "epoch": 0.15435802619874064, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 17699 + }, + { + "epoch": 0.15436674748390922, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 17700 + }, + { + "epoch": 0.15437546876907782, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 17701 + }, + { + "epoch": 0.1543841900542464, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 17702 + }, + { + "epoch": 0.154392911339415, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 17703 + }, + { + "epoch": 0.15440163262458356, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 17704 + }, + { + "epoch": 0.15441035390975213, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 17705 + }, + { + "epoch": 0.15441907519492074, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 17706 + }, + { + "epoch": 0.1544277964800893, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 17707 + }, + { + "epoch": 0.15443651776525788, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 17708 + }, + { + "epoch": 0.15444523905042648, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 17709 + }, + { + "epoch": 0.15445396033559505, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 17710 + }, + { + "epoch": 0.15446268162076363, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 17711 + }, + { + "epoch": 0.15447140290593223, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 17712 + }, + { + "epoch": 0.1544801241911008, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 17713 + }, + { + "epoch": 0.15448884547626937, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 17714 + }, + { + "epoch": 0.15449756676143797, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 17715 + }, + { + "epoch": 0.15450628804660654, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 17716 + }, + { + "epoch": 0.15451500933177514, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 17717 + }, + { + "epoch": 0.15452373061694372, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 17718 + }, + { + "epoch": 0.1545324519021123, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 17719 + }, + { + "epoch": 0.1545411731872809, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 17720 + }, + { + "epoch": 0.15454989447244946, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 17721 + }, + { + "epoch": 0.15455861575761803, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 17722 + }, + { + "epoch": 0.15456733704278663, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 17723 + }, + { + "epoch": 0.1545760583279552, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 17724 + }, + { + "epoch": 0.15458477961312378, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 17725 + }, + { + "epoch": 0.15459350089829238, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 17726 + }, + { + "epoch": 0.15460222218346095, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 17727 + }, + { + "epoch": 0.15461094346862952, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 17728 + }, + { + "epoch": 0.15461966475379813, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 17729 + }, + { + "epoch": 0.1546283860389667, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 17730 + }, + { + "epoch": 0.1546371073241353, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 17731 + }, + { + "epoch": 0.15464582860930387, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 17732 + }, + { + "epoch": 0.15465454989447244, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 17733 + }, + { + "epoch": 0.15466327117964104, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 17734 + }, + { + "epoch": 0.15467199246480962, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 17735 + }, + { + "epoch": 0.1546807137499782, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 17736 + }, + { + "epoch": 0.1546894350351468, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 17737 + }, + { + "epoch": 0.15469815632031536, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 17738 + }, + { + "epoch": 0.15470687760548393, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 17739 + }, + { + "epoch": 0.15471559889065253, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 17740 + }, + { + "epoch": 0.1547243201758211, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 17741 + }, + { + "epoch": 0.15473304146098968, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 17742 + }, + { + "epoch": 0.15474176274615828, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 17743 + }, + { + "epoch": 0.15475048403132685, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 17744 + }, + { + "epoch": 0.15475920531649545, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 17745 + }, + { + "epoch": 0.15476792660166402, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 17746 + }, + { + "epoch": 0.1547766478868326, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 17747 + }, + { + "epoch": 0.1547853691720012, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 17748 + }, + { + "epoch": 0.15479409045716977, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 17749 + }, + { + "epoch": 0.15480281174233834, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 17750 + }, + { + "epoch": 0.15481153302750694, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 17751 + }, + { + "epoch": 0.15482025431267551, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 17752 + }, + { + "epoch": 0.1548289755978441, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 17753 + }, + { + "epoch": 0.1548376968830127, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 17754 + }, + { + "epoch": 0.15484641816818126, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 17755 + }, + { + "epoch": 0.15485513945334983, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 17756 + }, + { + "epoch": 0.15486386073851843, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 17757 + }, + { + "epoch": 0.154872582023687, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 17758 + }, + { + "epoch": 0.1548813033088556, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 17759 + }, + { + "epoch": 0.15489002459402418, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 17760 + }, + { + "epoch": 0.15489874587919275, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 17761 + }, + { + "epoch": 0.15490746716436135, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 17762 + }, + { + "epoch": 0.15491618844952992, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 17763 + }, + { + "epoch": 0.1549249097346985, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 17764 + }, + { + "epoch": 0.1549336310198671, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 17765 + }, + { + "epoch": 0.15494235230503567, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 17766 + }, + { + "epoch": 0.15495107359020424, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 17767 + }, + { + "epoch": 0.15495979487537284, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 17768 + }, + { + "epoch": 0.15496851616054141, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 17769 + }, + { + "epoch": 0.15497723744571, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 17770 + }, + { + "epoch": 0.1549859587308786, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 17771 + }, + { + "epoch": 0.15499468001604716, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 17772 + }, + { + "epoch": 0.15500340130121576, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 17773 + }, + { + "epoch": 0.15501212258638433, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 17774 + }, + { + "epoch": 0.1550208438715529, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 17775 + }, + { + "epoch": 0.1550295651567215, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 17776 + }, + { + "epoch": 0.15503828644189008, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 17777 + }, + { + "epoch": 0.15504700772705865, + "grad_norm": 0.38671875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 17778 + }, + { + "epoch": 0.15505572901222725, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 17779 + }, + { + "epoch": 0.15506445029739582, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 17780 + }, + { + "epoch": 0.1550731715825644, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 17781 + }, + { + "epoch": 0.155081892867733, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 17782 + }, + { + "epoch": 0.15509061415290157, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 17783 + }, + { + "epoch": 0.15509933543807014, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 17784 + }, + { + "epoch": 0.15510805672323874, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 17785 + }, + { + "epoch": 0.1551167780084073, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 17786 + }, + { + "epoch": 0.1551254992935759, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 17787 + }, + { + "epoch": 0.15513422057874449, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 17788 + }, + { + "epoch": 0.15514294186391306, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 17789 + }, + { + "epoch": 0.15515166314908166, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 17790 + }, + { + "epoch": 0.15516038443425023, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 17791 + }, + { + "epoch": 0.1551691057194188, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 17792 + }, + { + "epoch": 0.1551778270045874, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 17793 + }, + { + "epoch": 0.15518654828975598, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 17794 + }, + { + "epoch": 0.15519526957492455, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 17795 + }, + { + "epoch": 0.15520399086009315, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 17796 + }, + { + "epoch": 0.15521271214526172, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 17797 + }, + { + "epoch": 0.1552214334304303, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 17798 + }, + { + "epoch": 0.1552301547155989, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 17799 + }, + { + "epoch": 0.15523887600076747, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 17800 + }, + { + "epoch": 0.15524759728593607, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 17801 + }, + { + "epoch": 0.15525631857110464, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 17802 + }, + { + "epoch": 0.1552650398562732, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 17803 + }, + { + "epoch": 0.1552737611414418, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 17804 + }, + { + "epoch": 0.15528248242661039, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 17805 + }, + { + "epoch": 0.15529120371177896, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 17806 + }, + { + "epoch": 0.15529992499694756, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 17807 + }, + { + "epoch": 0.15530864628211613, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 17808 + }, + { + "epoch": 0.1553173675672847, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 17809 + }, + { + "epoch": 0.1553260888524533, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 17810 + }, + { + "epoch": 0.15533481013762188, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 17811 + }, + { + "epoch": 0.15534353142279048, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 17812 + }, + { + "epoch": 0.15535225270795905, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 17813 + }, + { + "epoch": 0.15536097399312762, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 17814 + }, + { + "epoch": 0.15536969527829622, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 17815 + }, + { + "epoch": 0.1553784165634648, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 17816 + }, + { + "epoch": 0.15538713784863337, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 17817 + }, + { + "epoch": 0.15539585913380197, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 17818 + }, + { + "epoch": 0.15540458041897054, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 17819 + }, + { + "epoch": 0.1554133017041391, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 17820 + }, + { + "epoch": 0.1554220229893077, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 17821 + }, + { + "epoch": 0.15543074427447628, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 17822 + }, + { + "epoch": 0.15543946555964486, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 17823 + }, + { + "epoch": 0.15544818684481346, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 17824 + }, + { + "epoch": 0.15545690812998203, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 17825 + }, + { + "epoch": 0.15546562941515063, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 17826 + }, + { + "epoch": 0.1554743507003192, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 17827 + }, + { + "epoch": 0.15548307198548778, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 17828 + }, + { + "epoch": 0.15549179327065638, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 17829 + }, + { + "epoch": 0.15550051455582495, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 17830 + }, + { + "epoch": 0.15550923584099352, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 17831 + }, + { + "epoch": 0.15551795712616212, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 17832 + }, + { + "epoch": 0.1555266784113307, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 17833 + }, + { + "epoch": 0.15553539969649927, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 17834 + }, + { + "epoch": 0.15554412098166787, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 17835 + }, + { + "epoch": 0.15555284226683644, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 17836 + }, + { + "epoch": 0.155561563552005, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 17837 + }, + { + "epoch": 0.1555702848371736, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 17838 + }, + { + "epoch": 0.15557900612234218, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 17839 + }, + { + "epoch": 0.15558772740751078, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 17840 + }, + { + "epoch": 0.15559644869267936, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 17841 + }, + { + "epoch": 0.15560516997784793, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 17842 + }, + { + "epoch": 0.15561389126301653, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 17843 + }, + { + "epoch": 0.1556226125481851, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 17844 + }, + { + "epoch": 0.15563133383335367, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 17845 + }, + { + "epoch": 0.15564005511852227, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 17846 + }, + { + "epoch": 0.15564877640369085, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 17847 + }, + { + "epoch": 0.15565749768885942, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 17848 + }, + { + "epoch": 0.15566621897402802, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 17849 + }, + { + "epoch": 0.1556749402591966, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 17850 + }, + { + "epoch": 0.15568366154436517, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 17851 + }, + { + "epoch": 0.15569238282953377, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 17852 + }, + { + "epoch": 0.15570110411470234, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 17853 + }, + { + "epoch": 0.15570982539987094, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 17854 + }, + { + "epoch": 0.1557185466850395, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 17855 + }, + { + "epoch": 0.15572726797020808, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 17856 + }, + { + "epoch": 0.15573598925537668, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 17857 + }, + { + "epoch": 0.15574471054054526, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 17858 + }, + { + "epoch": 0.15575343182571383, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 17859 + }, + { + "epoch": 0.15576215311088243, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 17860 + }, + { + "epoch": 0.155770874396051, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 17861 + }, + { + "epoch": 0.15577959568121957, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 17862 + }, + { + "epoch": 0.15578831696638817, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 17863 + }, + { + "epoch": 0.15579703825155675, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 17864 + }, + { + "epoch": 0.15580575953672532, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 17865 + }, + { + "epoch": 0.15581448082189392, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 17866 + }, + { + "epoch": 0.1558232021070625, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 17867 + }, + { + "epoch": 0.1558319233922311, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 17868 + }, + { + "epoch": 0.15584064467739966, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 17869 + }, + { + "epoch": 0.15584936596256824, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 17870 + }, + { + "epoch": 0.15585808724773684, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 17871 + }, + { + "epoch": 0.1558668085329054, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 17872 + }, + { + "epoch": 0.15587552981807398, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 17873 + }, + { + "epoch": 0.15588425110324258, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 17874 + }, + { + "epoch": 0.15589297238841116, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 17875 + }, + { + "epoch": 0.15590169367357973, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 17876 + }, + { + "epoch": 0.15591041495874833, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 17877 + }, + { + "epoch": 0.1559191362439169, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 17878 + }, + { + "epoch": 0.15592785752908547, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 17879 + }, + { + "epoch": 0.15593657881425407, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 17880 + }, + { + "epoch": 0.15594530009942265, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 17881 + }, + { + "epoch": 0.15595402138459125, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 17882 + }, + { + "epoch": 0.15596274266975982, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 17883 + }, + { + "epoch": 0.1559714639549284, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 17884 + }, + { + "epoch": 0.155980185240097, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 17885 + }, + { + "epoch": 0.15598890652526556, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 17886 + }, + { + "epoch": 0.15599762781043414, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 17887 + }, + { + "epoch": 0.15600634909560274, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 17888 + }, + { + "epoch": 0.1560150703807713, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 17889 + }, + { + "epoch": 0.15602379166593988, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 17890 + }, + { + "epoch": 0.15603251295110848, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 17891 + }, + { + "epoch": 0.15604123423627705, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 17892 + }, + { + "epoch": 0.15604995552144563, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 17893 + }, + { + "epoch": 0.15605867680661423, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 17894 + }, + { + "epoch": 0.1560673980917828, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 17895 + }, + { + "epoch": 0.1560761193769514, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 17896 + }, + { + "epoch": 0.15608484066211997, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 17897 + }, + { + "epoch": 0.15609356194728854, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 17898 + }, + { + "epoch": 0.15610228323245715, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 17899 + }, + { + "epoch": 0.15611100451762572, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 17900 + }, + { + "epoch": 0.1561197258027943, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 17901 + }, + { + "epoch": 0.1561284470879629, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 17902 + }, + { + "epoch": 0.15613716837313146, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 17903 + }, + { + "epoch": 0.15614588965830004, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 17904 + }, + { + "epoch": 0.15615461094346864, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 17905 + }, + { + "epoch": 0.1561633322286372, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 17906 + }, + { + "epoch": 0.15617205351380578, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 17907 + }, + { + "epoch": 0.15618077479897438, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 17908 + }, + { + "epoch": 0.15618949608414295, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 17909 + }, + { + "epoch": 0.15619821736931155, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 17910 + }, + { + "epoch": 0.15620693865448013, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 17911 + }, + { + "epoch": 0.1562156599396487, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 17912 + }, + { + "epoch": 0.1562243812248173, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 17913 + }, + { + "epoch": 0.15623310250998587, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 17914 + }, + { + "epoch": 0.15624182379515444, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 17915 + }, + { + "epoch": 0.15625054508032304, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 17916 + }, + { + "epoch": 0.15625926636549162, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 17917 + }, + { + "epoch": 0.1562679876506602, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 17918 + }, + { + "epoch": 0.1562767089358288, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 17919 + }, + { + "epoch": 0.15628543022099736, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 17920 + }, + { + "epoch": 0.15629415150616593, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 17921 + }, + { + "epoch": 0.15630287279133454, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 17922 + }, + { + "epoch": 0.1563115940765031, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 17923 + }, + { + "epoch": 0.1563203153616717, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 17924 + }, + { + "epoch": 0.15632903664684028, + "grad_norm": 0.42578125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 17925 + }, + { + "epoch": 0.15633775793200885, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 17926 + }, + { + "epoch": 0.15634647921717745, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 17927 + }, + { + "epoch": 0.15635520050234603, + "grad_norm": 0.341796875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 17928 + }, + { + "epoch": 0.1563639217875146, + "grad_norm": 0.5234375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 17929 + }, + { + "epoch": 0.1563726430726832, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 17930 + }, + { + "epoch": 0.15638136435785177, + "grad_norm": 0.5625, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 17931 + }, + { + "epoch": 0.15639008564302034, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 17932 + }, + { + "epoch": 0.15639880692818894, + "grad_norm": 0.671875, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 17933 + }, + { + "epoch": 0.15640752821335752, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 17934 + }, + { + "epoch": 0.15641624949852612, + "grad_norm": 0.7734375, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 17935 + }, + { + "epoch": 0.1564249707836947, + "grad_norm": 0.388671875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 17936 + }, + { + "epoch": 0.15643369206886326, + "grad_norm": 1.140625, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 17937 + }, + { + "epoch": 0.15644241335403186, + "grad_norm": 0.392578125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 17938 + }, + { + "epoch": 0.15645113463920043, + "grad_norm": 0.921875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 17939 + }, + { + "epoch": 0.156459855924369, + "grad_norm": 0.72265625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 17940 + }, + { + "epoch": 0.1564685772095376, + "grad_norm": 1.09375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 17941 + }, + { + "epoch": 0.15647729849470618, + "grad_norm": 0.7890625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 17942 + }, + { + "epoch": 0.15648601977987475, + "grad_norm": 0.6875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 17943 + }, + { + "epoch": 0.15649474106504335, + "grad_norm": 0.72265625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 17944 + }, + { + "epoch": 0.15650346235021192, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 17945 + }, + { + "epoch": 0.1565121836353805, + "grad_norm": 0.8125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 17946 + }, + { + "epoch": 0.1565209049205491, + "grad_norm": 0.353515625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 17947 + }, + { + "epoch": 0.15652962620571767, + "grad_norm": 0.5703125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 17948 + }, + { + "epoch": 0.15653834749088627, + "grad_norm": 0.6796875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 17949 + }, + { + "epoch": 0.15654706877605484, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 17950 + }, + { + "epoch": 0.15655579006122342, + "grad_norm": 0.8046875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 17951 + }, + { + "epoch": 0.15656451134639202, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 17952 + }, + { + "epoch": 0.1565732326315606, + "grad_norm": 0.51171875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 17953 + }, + { + "epoch": 0.15658195391672916, + "grad_norm": 0.4921875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 17954 + }, + { + "epoch": 0.15659067520189776, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 17955 + }, + { + "epoch": 0.15659939648706633, + "grad_norm": 0.44921875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 17956 + }, + { + "epoch": 0.1566081177722349, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 17957 + }, + { + "epoch": 0.1566168390574035, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 17958 + }, + { + "epoch": 0.15662556034257208, + "grad_norm": 0.435546875, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 17959 + }, + { + "epoch": 0.15663428162774065, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 17960 + }, + { + "epoch": 0.15664300291290925, + "grad_norm": 0.392578125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 17961 + }, + { + "epoch": 0.15665172419807782, + "grad_norm": 0.453125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 17962 + }, + { + "epoch": 0.15666044548324642, + "grad_norm": 0.341796875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 17963 + }, + { + "epoch": 0.156669166768415, + "grad_norm": 0.470703125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 17964 + }, + { + "epoch": 0.15667788805358357, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 17965 + }, + { + "epoch": 0.15668660933875217, + "grad_norm": 0.390625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 17966 + }, + { + "epoch": 0.15669533062392074, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 17967 + }, + { + "epoch": 0.15670405190908931, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 17968 + }, + { + "epoch": 0.15671277319425791, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 17969 + }, + { + "epoch": 0.1567214944794265, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 17970 + }, + { + "epoch": 0.15673021576459506, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 17971 + }, + { + "epoch": 0.15673893704976366, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 17972 + }, + { + "epoch": 0.15674765833493223, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 17973 + }, + { + "epoch": 0.1567563796201008, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 17974 + }, + { + "epoch": 0.1567651009052694, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 17975 + }, + { + "epoch": 0.15677382219043798, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 17976 + }, + { + "epoch": 0.15678254347560658, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 17977 + }, + { + "epoch": 0.15679126476077515, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 17978 + }, + { + "epoch": 0.15679998604594372, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 17979 + }, + { + "epoch": 0.15680870733111232, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 17980 + }, + { + "epoch": 0.1568174286162809, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 17981 + }, + { + "epoch": 0.15682614990144947, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 17982 + }, + { + "epoch": 0.15683487118661807, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 17983 + }, + { + "epoch": 0.15684359247178664, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 17984 + }, + { + "epoch": 0.15685231375695521, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 17985 + }, + { + "epoch": 0.15686103504212381, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 17986 + }, + { + "epoch": 0.1568697563272924, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 17987 + }, + { + "epoch": 0.15687847761246096, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 17988 + }, + { + "epoch": 0.15688719889762956, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 17989 + }, + { + "epoch": 0.15689592018279813, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 17990 + }, + { + "epoch": 0.15690464146796673, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 17991 + }, + { + "epoch": 0.1569133627531353, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 17992 + }, + { + "epoch": 0.15692208403830388, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 17993 + }, + { + "epoch": 0.15693080532347248, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 17994 + }, + { + "epoch": 0.15693952660864105, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 17995 + }, + { + "epoch": 0.15694824789380962, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 17996 + }, + { + "epoch": 0.15695696917897822, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 17997 + }, + { + "epoch": 0.1569656904641468, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 17998 + }, + { + "epoch": 0.15697441174931537, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 17999 + }, + { + "epoch": 0.15698313303448397, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 18000 + }, + { + "epoch": 0.15699185431965254, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 18001 + }, + { + "epoch": 0.1570005756048211, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 18002 + }, + { + "epoch": 0.1570092968899897, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 18003 + }, + { + "epoch": 0.15701801817515829, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 18004 + }, + { + "epoch": 0.1570267394603269, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 18005 + }, + { + "epoch": 0.15703546074549546, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 18006 + }, + { + "epoch": 0.15704418203066403, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 18007 + }, + { + "epoch": 0.15705290331583263, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 18008 + }, + { + "epoch": 0.1570616246010012, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 18009 + }, + { + "epoch": 0.15707034588616978, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 18010 + }, + { + "epoch": 0.15707906717133838, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 18011 + }, + { + "epoch": 0.15708778845650695, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 18012 + }, + { + "epoch": 0.15709650974167552, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 18013 + }, + { + "epoch": 0.15710523102684412, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 18014 + }, + { + "epoch": 0.1571139523120127, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 18015 + }, + { + "epoch": 0.15712267359718127, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 18016 + }, + { + "epoch": 0.15713139488234987, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 18017 + }, + { + "epoch": 0.15714011616751844, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 18018 + }, + { + "epoch": 0.15714883745268704, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 18019 + }, + { + "epoch": 0.1571575587378556, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 18020 + }, + { + "epoch": 0.15716628002302419, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 18021 + }, + { + "epoch": 0.15717500130819279, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 18022 + }, + { + "epoch": 0.15718372259336136, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 18023 + }, + { + "epoch": 0.15719244387852993, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 18024 + }, + { + "epoch": 0.15720116516369853, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 18025 + }, + { + "epoch": 0.1572098864488671, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 18026 + }, + { + "epoch": 0.15721860773403568, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 18027 + }, + { + "epoch": 0.15722732901920428, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 18028 + }, + { + "epoch": 0.15723605030437285, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 18029 + }, + { + "epoch": 0.15724477158954142, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 18030 + }, + { + "epoch": 0.15725349287471002, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 18031 + }, + { + "epoch": 0.1572622141598786, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 18032 + }, + { + "epoch": 0.1572709354450472, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 18033 + }, + { + "epoch": 0.15727965673021577, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 18034 + }, + { + "epoch": 0.15728837801538434, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 18035 + }, + { + "epoch": 0.15729709930055294, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 18036 + }, + { + "epoch": 0.1573058205857215, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 18037 + }, + { + "epoch": 0.15731454187089008, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 18038 + }, + { + "epoch": 0.15732326315605868, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 18039 + }, + { + "epoch": 0.15733198444122726, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 18040 + }, + { + "epoch": 0.15734070572639583, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 18041 + }, + { + "epoch": 0.15734942701156443, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 18042 + }, + { + "epoch": 0.157358148296733, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 18043 + }, + { + "epoch": 0.1573668695819016, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 18044 + }, + { + "epoch": 0.15737559086707018, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 18045 + }, + { + "epoch": 0.15738431215223875, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 18046 + }, + { + "epoch": 0.15739303343740735, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 18047 + }, + { + "epoch": 0.15740175472257592, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 18048 + }, + { + "epoch": 0.1574104760077445, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 18049 + }, + { + "epoch": 0.1574191972929131, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 18050 + }, + { + "epoch": 0.15742791857808167, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 18051 + }, + { + "epoch": 0.15743663986325024, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 18052 + }, + { + "epoch": 0.15744536114841884, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 18053 + }, + { + "epoch": 0.1574540824335874, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 18054 + }, + { + "epoch": 0.15746280371875598, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 18055 + }, + { + "epoch": 0.15747152500392458, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 18056 + }, + { + "epoch": 0.15748024628909316, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 18057 + }, + { + "epoch": 0.15748896757426176, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 18058 + }, + { + "epoch": 0.15749768885943033, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 18059 + }, + { + "epoch": 0.1575064101445989, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 18060 + }, + { + "epoch": 0.1575151314297675, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 18061 + }, + { + "epoch": 0.15752385271493607, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 18062 + }, + { + "epoch": 0.15753257400010465, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 18063 + }, + { + "epoch": 0.15754129528527325, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 18064 + }, + { + "epoch": 0.15755001657044182, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 18065 + }, + { + "epoch": 0.1575587378556104, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 18066 + }, + { + "epoch": 0.157567459140779, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 18067 + }, + { + "epoch": 0.15757618042594757, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 18068 + }, + { + "epoch": 0.15758490171111614, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 18069 + }, + { + "epoch": 0.15759362299628474, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 18070 + }, + { + "epoch": 0.1576023442814533, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 18071 + }, + { + "epoch": 0.1576110655666219, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 18072 + }, + { + "epoch": 0.15761978685179048, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 18073 + }, + { + "epoch": 0.15762850813695906, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 18074 + }, + { + "epoch": 0.15763722942212766, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 18075 + }, + { + "epoch": 0.15764595070729623, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 18076 + }, + { + "epoch": 0.1576546719924648, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 18077 + }, + { + "epoch": 0.1576633932776334, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 18078 + }, + { + "epoch": 0.15767211456280197, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 18079 + }, + { + "epoch": 0.15768083584797055, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 18080 + }, + { + "epoch": 0.15768955713313915, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 18081 + }, + { + "epoch": 0.15769827841830772, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 18082 + }, + { + "epoch": 0.1577069997034763, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 18083 + }, + { + "epoch": 0.1577157209886449, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 18084 + }, + { + "epoch": 0.15772444227381346, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 18085 + }, + { + "epoch": 0.15773316355898206, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 18086 + }, + { + "epoch": 0.15774188484415064, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 18087 + }, + { + "epoch": 0.1577506061293192, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 18088 + }, + { + "epoch": 0.1577593274144878, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 18089 + }, + { + "epoch": 0.15776804869965638, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 18090 + }, + { + "epoch": 0.15777676998482495, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 18091 + }, + { + "epoch": 0.15778549126999356, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 18092 + }, + { + "epoch": 0.15779421255516213, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 18093 + }, + { + "epoch": 0.1578029338403307, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 18094 + }, + { + "epoch": 0.1578116551254993, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 18095 + }, + { + "epoch": 0.15782037641066787, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 18096 + }, + { + "epoch": 0.15782909769583645, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 18097 + }, + { + "epoch": 0.15783781898100505, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 18098 + }, + { + "epoch": 0.15784654026617362, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 18099 + }, + { + "epoch": 0.15785526155134222, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 18100 + }, + { + "epoch": 0.1578639828365108, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 18101 + }, + { + "epoch": 0.15787270412167936, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 18102 + }, + { + "epoch": 0.15788142540684796, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 18103 + }, + { + "epoch": 0.15789014669201654, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 18104 + }, + { + "epoch": 0.1578988679771851, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 18105 + }, + { + "epoch": 0.1579075892623537, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 18106 + }, + { + "epoch": 0.15791631054752228, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 18107 + }, + { + "epoch": 0.15792503183269085, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 18108 + }, + { + "epoch": 0.15793375311785945, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 18109 + }, + { + "epoch": 0.15794247440302803, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 18110 + }, + { + "epoch": 0.1579511956881966, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 18111 + }, + { + "epoch": 0.1579599169733652, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 18112 + }, + { + "epoch": 0.15796863825853377, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 18113 + }, + { + "epoch": 0.15797735954370237, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 18114 + }, + { + "epoch": 0.15798608082887095, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 18115 + }, + { + "epoch": 0.15799480211403952, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 18116 + }, + { + "epoch": 0.15800352339920812, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 18117 + }, + { + "epoch": 0.1580122446843767, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 18118 + }, + { + "epoch": 0.15802096596954526, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 18119 + }, + { + "epoch": 0.15802968725471386, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 18120 + }, + { + "epoch": 0.15803840853988244, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 18121 + }, + { + "epoch": 0.158047129825051, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 18122 + }, + { + "epoch": 0.1580558511102196, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 18123 + }, + { + "epoch": 0.15806457239538818, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 18124 + }, + { + "epoch": 0.15807329368055675, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 18125 + }, + { + "epoch": 0.15808201496572535, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 18126 + }, + { + "epoch": 0.15809073625089393, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 18127 + }, + { + "epoch": 0.15809945753606253, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 18128 + }, + { + "epoch": 0.1581081788212311, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 18129 + }, + { + "epoch": 0.15811690010639967, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 18130 + }, + { + "epoch": 0.15812562139156827, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 18131 + }, + { + "epoch": 0.15813434267673684, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 18132 + }, + { + "epoch": 0.15814306396190542, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 18133 + }, + { + "epoch": 0.15815178524707402, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 18134 + }, + { + "epoch": 0.1581605065322426, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 18135 + }, + { + "epoch": 0.15816922781741116, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 18136 + }, + { + "epoch": 0.15817794910257976, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 18137 + }, + { + "epoch": 0.15818667038774833, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 18138 + }, + { + "epoch": 0.1581953916729169, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 18139 + }, + { + "epoch": 0.1582041129580855, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 18140 + }, + { + "epoch": 0.15821283424325408, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 18141 + }, + { + "epoch": 0.15822155552842268, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 18142 + }, + { + "epoch": 0.15823027681359125, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 18143 + }, + { + "epoch": 0.15823899809875983, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 18144 + }, + { + "epoch": 0.15824771938392843, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 18145 + }, + { + "epoch": 0.158256440669097, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 18146 + }, + { + "epoch": 0.15826516195426557, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 18147 + }, + { + "epoch": 0.15827388323943417, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 18148 + }, + { + "epoch": 0.15828260452460274, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 18149 + }, + { + "epoch": 0.15829132580977132, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 18150 + }, + { + "epoch": 0.15830004709493992, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 18151 + }, + { + "epoch": 0.1583087683801085, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 18152 + }, + { + "epoch": 0.1583174896652771, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 18153 + }, + { + "epoch": 0.15832621095044566, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 18154 + }, + { + "epoch": 0.15833493223561423, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 18155 + }, + { + "epoch": 0.15834365352078283, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 18156 + }, + { + "epoch": 0.1583523748059514, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 18157 + }, + { + "epoch": 0.15836109609111998, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 18158 + }, + { + "epoch": 0.15836981737628858, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 18159 + }, + { + "epoch": 0.15837853866145715, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 18160 + }, + { + "epoch": 0.15838725994662572, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 18161 + }, + { + "epoch": 0.15839598123179432, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 18162 + }, + { + "epoch": 0.1584047025169629, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 18163 + }, + { + "epoch": 0.15841342380213147, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 18164 + }, + { + "epoch": 0.15842214508730007, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 18165 + }, + { + "epoch": 0.15843086637246864, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 18166 + }, + { + "epoch": 0.15843958765763724, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 18167 + }, + { + "epoch": 0.15844830894280582, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 18168 + }, + { + "epoch": 0.1584570302279744, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 18169 + }, + { + "epoch": 0.158465751513143, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 18170 + }, + { + "epoch": 0.15847447279831156, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 18171 + }, + { + "epoch": 0.15848319408348013, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 18172 + }, + { + "epoch": 0.15849191536864873, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 18173 + }, + { + "epoch": 0.1585006366538173, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 18174 + }, + { + "epoch": 0.15850935793898588, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 18175 + }, + { + "epoch": 0.15851807922415448, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 18176 + }, + { + "epoch": 0.15852680050932305, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 18177 + }, + { + "epoch": 0.15853552179449162, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 18178 + }, + { + "epoch": 0.15854424307966022, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 18179 + }, + { + "epoch": 0.1585529643648288, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 18180 + }, + { + "epoch": 0.1585616856499974, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 18181 + }, + { + "epoch": 0.15857040693516597, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 18182 + }, + { + "epoch": 0.15857912822033454, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 18183 + }, + { + "epoch": 0.15858784950550314, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 18184 + }, + { + "epoch": 0.15859657079067171, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 18185 + }, + { + "epoch": 0.1586052920758403, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 18186 + }, + { + "epoch": 0.1586140133610089, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 18187 + }, + { + "epoch": 0.15862273464617746, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 18188 + }, + { + "epoch": 0.15863145593134603, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 18189 + }, + { + "epoch": 0.15864017721651463, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 18190 + }, + { + "epoch": 0.1586488985016832, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 18191 + }, + { + "epoch": 0.15865761978685178, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 18192 + }, + { + "epoch": 0.15866634107202038, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 18193 + }, + { + "epoch": 0.15867506235718895, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 18194 + }, + { + "epoch": 0.15868378364235755, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 18195 + }, + { + "epoch": 0.15869250492752612, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 18196 + }, + { + "epoch": 0.1587012262126947, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 18197 + }, + { + "epoch": 0.1587099474978633, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 18198 + }, + { + "epoch": 0.15871866878303187, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 18199 + }, + { + "epoch": 0.15872739006820044, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 18200 + }, + { + "epoch": 0.15873611135336904, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 18201 + }, + { + "epoch": 0.15874483263853761, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 18202 + }, + { + "epoch": 0.1587535539237062, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 18203 + }, + { + "epoch": 0.1587622752088748, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 18204 + }, + { + "epoch": 0.15877099649404336, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 18205 + }, + { + "epoch": 0.15877971777921193, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 18206 + }, + { + "epoch": 0.15878843906438053, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 18207 + }, + { + "epoch": 0.1587971603495491, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 18208 + }, + { + "epoch": 0.1588058816347177, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 18209 + }, + { + "epoch": 0.15881460291988628, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 18210 + }, + { + "epoch": 0.15882332420505485, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 18211 + }, + { + "epoch": 0.15883204549022345, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 18212 + }, + { + "epoch": 0.15884076677539202, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 18213 + }, + { + "epoch": 0.1588494880605606, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 18214 + }, + { + "epoch": 0.1588582093457292, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 18215 + }, + { + "epoch": 0.15886693063089777, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 18216 + }, + { + "epoch": 0.15887565191606634, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 18217 + }, + { + "epoch": 0.15888437320123494, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 18218 + }, + { + "epoch": 0.1588930944864035, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 18219 + }, + { + "epoch": 0.15890181577157209, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 18220 + }, + { + "epoch": 0.15891053705674069, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 18221 + }, + { + "epoch": 0.15891925834190926, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 18222 + }, + { + "epoch": 0.15892797962707786, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 18223 + }, + { + "epoch": 0.15893670091224643, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 18224 + }, + { + "epoch": 0.158945422197415, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 18225 + }, + { + "epoch": 0.1589541434825836, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 18226 + }, + { + "epoch": 0.15896286476775218, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 18227 + }, + { + "epoch": 0.15897158605292075, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 18228 + }, + { + "epoch": 0.15898030733808935, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 18229 + }, + { + "epoch": 0.15898902862325792, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 18230 + }, + { + "epoch": 0.1589977499084265, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 18231 + }, + { + "epoch": 0.1590064711935951, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 18232 + }, + { + "epoch": 0.15901519247876367, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 18233 + }, + { + "epoch": 0.15902391376393224, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 18234 + }, + { + "epoch": 0.15903263504910084, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 18235 + }, + { + "epoch": 0.1590413563342694, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 18236 + }, + { + "epoch": 0.159050077619438, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 18237 + }, + { + "epoch": 0.15905879890460659, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 18238 + }, + { + "epoch": 0.15906752018977516, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 18239 + }, + { + "epoch": 0.15907624147494376, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 18240 + }, + { + "epoch": 0.15908496276011233, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 18241 + }, + { + "epoch": 0.1590936840452809, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 18242 + }, + { + "epoch": 0.1591024053304495, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 18243 + }, + { + "epoch": 0.15911112661561808, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 18244 + }, + { + "epoch": 0.15911984790078665, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 18245 + }, + { + "epoch": 0.15912856918595525, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 18246 + }, + { + "epoch": 0.15913729047112382, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 18247 + }, + { + "epoch": 0.1591460117562924, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 18248 + }, + { + "epoch": 0.159154733041461, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 18249 + }, + { + "epoch": 0.15916345432662957, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 18250 + }, + { + "epoch": 0.15917217561179817, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 18251 + }, + { + "epoch": 0.15918089689696674, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 18252 + }, + { + "epoch": 0.1591896181821353, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 18253 + }, + { + "epoch": 0.1591983394673039, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 18254 + }, + { + "epoch": 0.15920706075247248, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 18255 + }, + { + "epoch": 0.15921578203764106, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 18256 + }, + { + "epoch": 0.15922450332280966, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 18257 + }, + { + "epoch": 0.15923322460797823, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 18258 + }, + { + "epoch": 0.1592419458931468, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 18259 + }, + { + "epoch": 0.1592506671783154, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 18260 + }, + { + "epoch": 0.15925938846348398, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 18261 + }, + { + "epoch": 0.15926810974865255, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 18262 + }, + { + "epoch": 0.15927683103382115, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 18263 + }, + { + "epoch": 0.15928555231898972, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 18264 + }, + { + "epoch": 0.15929427360415832, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 18265 + }, + { + "epoch": 0.1593029948893269, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 18266 + }, + { + "epoch": 0.15931171617449547, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 18267 + }, + { + "epoch": 0.15932043745966407, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 18268 + }, + { + "epoch": 0.15932915874483264, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 18269 + }, + { + "epoch": 0.1593378800300012, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 18270 + }, + { + "epoch": 0.1593466013151698, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 18271 + }, + { + "epoch": 0.15935532260033838, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 18272 + }, + { + "epoch": 0.15936404388550696, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 18273 + }, + { + "epoch": 0.15937276517067556, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 18274 + }, + { + "epoch": 0.15938148645584413, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 18275 + }, + { + "epoch": 0.15939020774101273, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 18276 + }, + { + "epoch": 0.1593989290261813, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 18277 + }, + { + "epoch": 0.15940765031134987, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 18278 + }, + { + "epoch": 0.15941637159651847, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 18279 + }, + { + "epoch": 0.15942509288168705, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 18280 + }, + { + "epoch": 0.15943381416685562, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 18281 + }, + { + "epoch": 0.15944253545202422, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 18282 + }, + { + "epoch": 0.1594512567371928, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 18283 + }, + { + "epoch": 0.15945997802236136, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 18284 + }, + { + "epoch": 0.15946869930752997, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 18285 + }, + { + "epoch": 0.15947742059269854, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 18286 + }, + { + "epoch": 0.1594861418778671, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 18287 + }, + { + "epoch": 0.1594948631630357, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 18288 + }, + { + "epoch": 0.15950358444820428, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 18289 + }, + { + "epoch": 0.15951230573337288, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 18290 + }, + { + "epoch": 0.15952102701854146, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 18291 + }, + { + "epoch": 0.15952974830371003, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 18292 + }, + { + "epoch": 0.15953846958887863, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 18293 + }, + { + "epoch": 0.1595471908740472, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 18294 + }, + { + "epoch": 0.15955591215921577, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 18295 + }, + { + "epoch": 0.15956463344438437, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 18296 + }, + { + "epoch": 0.15957335472955295, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 18297 + }, + { + "epoch": 0.15958207601472152, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 18298 + }, + { + "epoch": 0.15959079729989012, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 18299 + }, + { + "epoch": 0.1595995185850587, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 18300 + }, + { + "epoch": 0.15960823987022726, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 18301 + }, + { + "epoch": 0.15961696115539586, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 18302 + }, + { + "epoch": 0.15962568244056444, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 18303 + }, + { + "epoch": 0.15963440372573304, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 18304 + }, + { + "epoch": 0.1596431250109016, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 18305 + }, + { + "epoch": 0.15965184629607018, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 18306 + }, + { + "epoch": 0.15966056758123878, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 18307 + }, + { + "epoch": 0.15966928886640736, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 18308 + }, + { + "epoch": 0.15967801015157593, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 18309 + }, + { + "epoch": 0.15968673143674453, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 18310 + }, + { + "epoch": 0.1596954527219131, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 18311 + }, + { + "epoch": 0.15970417400708167, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 18312 + }, + { + "epoch": 0.15971289529225027, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 18313 + }, + { + "epoch": 0.15972161657741885, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 18314 + }, + { + "epoch": 0.15973033786258742, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 18315 + }, + { + "epoch": 0.15973905914775602, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 18316 + }, + { + "epoch": 0.1597477804329246, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 18317 + }, + { + "epoch": 0.1597565017180932, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 18318 + }, + { + "epoch": 0.15976522300326176, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 18319 + }, + { + "epoch": 0.15977394428843034, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 18320 + }, + { + "epoch": 0.15978266557359894, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 18321 + }, + { + "epoch": 0.1597913868587675, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 18322 + }, + { + "epoch": 0.15980010814393608, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 18323 + }, + { + "epoch": 0.15980882942910468, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 18324 + }, + { + "epoch": 0.15981755071427325, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 18325 + }, + { + "epoch": 0.15982627199944183, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 18326 + }, + { + "epoch": 0.15983499328461043, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 18327 + }, + { + "epoch": 0.159843714569779, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 18328 + }, + { + "epoch": 0.15985243585494757, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 18329 + }, + { + "epoch": 0.15986115714011617, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 18330 + }, + { + "epoch": 0.15986987842528474, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 18331 + }, + { + "epoch": 0.15987859971045335, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 18332 + }, + { + "epoch": 0.15988732099562192, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 18333 + }, + { + "epoch": 0.1598960422807905, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 18334 + }, + { + "epoch": 0.1599047635659591, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 18335 + }, + { + "epoch": 0.15991348485112766, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 18336 + }, + { + "epoch": 0.15992220613629624, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 18337 + }, + { + "epoch": 0.15993092742146484, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 18338 + }, + { + "epoch": 0.1599396487066334, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 18339 + }, + { + "epoch": 0.15994836999180198, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 18340 + }, + { + "epoch": 0.15995709127697058, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 18341 + }, + { + "epoch": 0.15996581256213915, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 18342 + }, + { + "epoch": 0.15997453384730773, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 18343 + }, + { + "epoch": 0.15998325513247633, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 18344 + }, + { + "epoch": 0.1599919764176449, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 18345 + }, + { + "epoch": 0.1600006977028135, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 18346 + }, + { + "epoch": 0.16000941898798207, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 18347 + }, + { + "epoch": 0.16001814027315064, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 18348 + }, + { + "epoch": 0.16002686155831924, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 18349 + }, + { + "epoch": 0.16003558284348782, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 18350 + }, + { + "epoch": 0.1600443041286564, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 18351 + }, + { + "epoch": 0.160053025413825, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 18352 + }, + { + "epoch": 0.16006174669899356, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 18353 + }, + { + "epoch": 0.16007046798416213, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 18354 + }, + { + "epoch": 0.16007918926933074, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 18355 + }, + { + "epoch": 0.1600879105544993, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 18356 + }, + { + "epoch": 0.16009663183966788, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 18357 + }, + { + "epoch": 0.16010535312483648, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 18358 + }, + { + "epoch": 0.16011407441000505, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 18359 + }, + { + "epoch": 0.16012279569517365, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 18360 + }, + { + "epoch": 0.16013151698034223, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 18361 + }, + { + "epoch": 0.1601402382655108, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 18362 + }, + { + "epoch": 0.1601489595506794, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 18363 + }, + { + "epoch": 0.16015768083584797, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 18364 + }, + { + "epoch": 0.16016640212101654, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 18365 + }, + { + "epoch": 0.16017512340618514, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 18366 + }, + { + "epoch": 0.16018384469135372, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 18367 + }, + { + "epoch": 0.1601925659765223, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 18368 + }, + { + "epoch": 0.1602012872616909, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 18369 + }, + { + "epoch": 0.16021000854685946, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 18370 + }, + { + "epoch": 0.16021872983202803, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 18371 + }, + { + "epoch": 0.16022745111719663, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 18372 + }, + { + "epoch": 0.1602361724023652, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 18373 + }, + { + "epoch": 0.1602448936875338, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 18374 + }, + { + "epoch": 0.16025361497270238, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 18375 + }, + { + "epoch": 0.16026233625787095, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 18376 + }, + { + "epoch": 0.16027105754303955, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 18377 + }, + { + "epoch": 0.16027977882820812, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 18378 + }, + { + "epoch": 0.1602885001133767, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 18379 + }, + { + "epoch": 0.1602972213985453, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 18380 + }, + { + "epoch": 0.16030594268371387, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 18381 + }, + { + "epoch": 0.16031466396888244, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 18382 + }, + { + "epoch": 0.16032338525405104, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 18383 + }, + { + "epoch": 0.16033210653921962, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 18384 + }, + { + "epoch": 0.16034082782438822, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 18385 + }, + { + "epoch": 0.1603495491095568, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 18386 + }, + { + "epoch": 0.16035827039472536, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 18387 + }, + { + "epoch": 0.16036699167989396, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 18388 + }, + { + "epoch": 0.16037571296506253, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 18389 + }, + { + "epoch": 0.1603844342502311, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 18390 + }, + { + "epoch": 0.1603931555353997, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 18391 + }, + { + "epoch": 0.16040187682056828, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 18392 + }, + { + "epoch": 0.16041059810573685, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 18393 + }, + { + "epoch": 0.16041931939090545, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 18394 + }, + { + "epoch": 0.16042804067607402, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 18395 + }, + { + "epoch": 0.1604367619612426, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 18396 + }, + { + "epoch": 0.1604454832464112, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 18397 + }, + { + "epoch": 0.16045420453157977, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 18398 + }, + { + "epoch": 0.16046292581674837, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 18399 + }, + { + "epoch": 0.16047164710191694, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 18400 + }, + { + "epoch": 0.16048036838708551, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 18401 + }, + { + "epoch": 0.16048908967225411, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 18402 + }, + { + "epoch": 0.1604978109574227, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 18403 + }, + { + "epoch": 0.16050653224259126, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 18404 + }, + { + "epoch": 0.16051525352775986, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 18405 + }, + { + "epoch": 0.16052397481292843, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 18406 + }, + { + "epoch": 0.160532696098097, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 18407 + }, + { + "epoch": 0.1605414173832656, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 18408 + }, + { + "epoch": 0.16055013866843418, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 18409 + }, + { + "epoch": 0.16055885995360275, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 18410 + }, + { + "epoch": 0.16056758123877135, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 18411 + }, + { + "epoch": 0.16057630252393992, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 18412 + }, + { + "epoch": 0.16058502380910852, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 18413 + }, + { + "epoch": 0.1605937450942771, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 18414 + }, + { + "epoch": 0.16060246637944567, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 18415 + }, + { + "epoch": 0.16061118766461427, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 18416 + }, + { + "epoch": 0.16061990894978284, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 18417 + }, + { + "epoch": 0.16062863023495141, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 18418 + }, + { + "epoch": 0.16063735152012001, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 18419 + }, + { + "epoch": 0.1606460728052886, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 18420 + }, + { + "epoch": 0.16065479409045716, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 18421 + }, + { + "epoch": 0.16066351537562576, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 18422 + }, + { + "epoch": 0.16067223666079433, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 18423 + }, + { + "epoch": 0.1606809579459629, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 18424 + }, + { + "epoch": 0.1606896792311315, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 18425 + }, + { + "epoch": 0.16069840051630008, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 18426 + }, + { + "epoch": 0.16070712180146868, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 18427 + }, + { + "epoch": 0.16071584308663725, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 18428 + }, + { + "epoch": 0.16072456437180582, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 18429 + }, + { + "epoch": 0.16073328565697442, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 18430 + }, + { + "epoch": 0.160742006942143, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 18431 + }, + { + "epoch": 0.16075072822731157, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 18432 + }, + { + "epoch": 0.16075944951248017, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 18433 + }, + { + "epoch": 0.16076817079764874, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 18434 + }, + { + "epoch": 0.1607768920828173, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 18435 + }, + { + "epoch": 0.1607856133679859, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 18436 + }, + { + "epoch": 0.16079433465315449, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 18437 + }, + { + "epoch": 0.16080305593832306, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 18438 + }, + { + "epoch": 0.16081177722349166, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 18439 + }, + { + "epoch": 0.16082049850866023, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 18440 + }, + { + "epoch": 0.16082921979382883, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 18441 + }, + { + "epoch": 0.1608379410789974, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 18442 + }, + { + "epoch": 0.16084666236416598, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 18443 + }, + { + "epoch": 0.16085538364933458, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 18444 + }, + { + "epoch": 0.16086410493450315, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 18445 + }, + { + "epoch": 0.16087282621967172, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 18446 + }, + { + "epoch": 0.16088154750484032, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 18447 + }, + { + "epoch": 0.1608902687900089, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 18448 + }, + { + "epoch": 0.16089899007517747, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 18449 + }, + { + "epoch": 0.16090771136034607, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 18450 + }, + { + "epoch": 0.16091643264551464, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 18451 + }, + { + "epoch": 0.1609251539306832, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 18452 + }, + { + "epoch": 0.1609338752158518, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 18453 + }, + { + "epoch": 0.16094259650102039, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 18454 + }, + { + "epoch": 0.16095131778618899, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 18455 + }, + { + "epoch": 0.16096003907135756, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 18456 + }, + { + "epoch": 0.16096876035652613, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 18457 + }, + { + "epoch": 0.16097748164169473, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 18458 + }, + { + "epoch": 0.1609862029268633, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 18459 + }, + { + "epoch": 0.16099492421203188, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 18460 + }, + { + "epoch": 0.16100364549720048, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 18461 + }, + { + "epoch": 0.16101236678236905, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 18462 + }, + { + "epoch": 0.16102108806753762, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 18463 + }, + { + "epoch": 0.16102980935270622, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 18464 + }, + { + "epoch": 0.1610385306378748, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 18465 + }, + { + "epoch": 0.16104725192304337, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 18466 + }, + { + "epoch": 0.16105597320821197, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 18467 + }, + { + "epoch": 0.16106469449338054, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 18468 + }, + { + "epoch": 0.16107341577854914, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 18469 + }, + { + "epoch": 0.1610821370637177, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 18470 + }, + { + "epoch": 0.16109085834888628, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 18471 + }, + { + "epoch": 0.16109957963405488, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 18472 + }, + { + "epoch": 0.16110830091922346, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 18473 + }, + { + "epoch": 0.16111702220439203, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 18474 + }, + { + "epoch": 0.16112574348956063, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 18475 + }, + { + "epoch": 0.1611344647747292, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 18476 + }, + { + "epoch": 0.16114318605989777, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 18477 + }, + { + "epoch": 0.16115190734506638, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 18478 + }, + { + "epoch": 0.16116062863023495, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 18479 + }, + { + "epoch": 0.16116934991540352, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 18480 + }, + { + "epoch": 0.16117807120057212, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 18481 + }, + { + "epoch": 0.1611867924857407, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 18482 + }, + { + "epoch": 0.1611955137709093, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 18483 + }, + { + "epoch": 0.16120423505607787, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 18484 + }, + { + "epoch": 0.16121295634124644, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 18485 + }, + { + "epoch": 0.16122167762641504, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 18486 + }, + { + "epoch": 0.1612303989115836, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 18487 + }, + { + "epoch": 0.16123912019675218, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 18488 + }, + { + "epoch": 0.16124784148192078, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 18489 + }, + { + "epoch": 0.16125656276708936, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 18490 + }, + { + "epoch": 0.16126528405225793, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 18491 + }, + { + "epoch": 0.16127400533742653, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 18492 + }, + { + "epoch": 0.1612827266225951, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 18493 + }, + { + "epoch": 0.16129144790776367, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 18494 + }, + { + "epoch": 0.16130016919293227, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 18495 + }, + { + "epoch": 0.16130889047810085, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 18496 + }, + { + "epoch": 0.16131761176326945, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 18497 + }, + { + "epoch": 0.16132633304843802, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 18498 + }, + { + "epoch": 0.1613350543336066, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 18499 + }, + { + "epoch": 0.1613437756187752, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 18500 + }, + { + "epoch": 0.16135249690394377, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 18501 + }, + { + "epoch": 0.16136121818911234, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 18502 + }, + { + "epoch": 0.16136993947428094, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 18503 + }, + { + "epoch": 0.1613786607594495, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 18504 + }, + { + "epoch": 0.16138738204461808, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 18505 + }, + { + "epoch": 0.16139610332978668, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 18506 + }, + { + "epoch": 0.16140482461495526, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 18507 + }, + { + "epoch": 0.16141354590012386, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 18508 + }, + { + "epoch": 0.16142226718529243, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 18509 + }, + { + "epoch": 0.161430988470461, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 18510 + }, + { + "epoch": 0.1614397097556296, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 18511 + }, + { + "epoch": 0.16144843104079817, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 18512 + }, + { + "epoch": 0.16145715232596675, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 18513 + }, + { + "epoch": 0.16146587361113535, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 18514 + }, + { + "epoch": 0.16147459489630392, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 18515 + }, + { + "epoch": 0.1614833161814725, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 18516 + }, + { + "epoch": 0.1614920374666411, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 18517 + }, + { + "epoch": 0.16150075875180966, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 18518 + }, + { + "epoch": 0.16150948003697824, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 18519 + }, + { + "epoch": 0.16151820132214684, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 18520 + }, + { + "epoch": 0.1615269226073154, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 18521 + }, + { + "epoch": 0.161535643892484, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 18522 + }, + { + "epoch": 0.16154436517765258, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 18523 + }, + { + "epoch": 0.16155308646282115, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 18524 + }, + { + "epoch": 0.16156180774798976, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 18525 + }, + { + "epoch": 0.16157052903315833, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 18526 + }, + { + "epoch": 0.1615792503183269, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 18527 + }, + { + "epoch": 0.1615879716034955, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 18528 + }, + { + "epoch": 0.16159669288866407, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 18529 + }, + { + "epoch": 0.16160541417383265, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 18530 + }, + { + "epoch": 0.16161413545900125, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 18531 + }, + { + "epoch": 0.16162285674416982, + "grad_norm": 0.326171875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 18532 + }, + { + "epoch": 0.1616315780293384, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 18533 + }, + { + "epoch": 0.161640299314507, + "grad_norm": 0.408203125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 18534 + }, + { + "epoch": 0.16164902059967556, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 18535 + }, + { + "epoch": 0.16165774188484416, + "grad_norm": 0.384765625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 18536 + }, + { + "epoch": 0.16166646317001274, + "grad_norm": 0.341796875, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 18537 + }, + { + "epoch": 0.1616751844551813, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 18538 + }, + { + "epoch": 0.1616839057403499, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 18539 + }, + { + "epoch": 0.16169262702551848, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 18540 + }, + { + "epoch": 0.16170134831068705, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 18541 + }, + { + "epoch": 0.16171006959585565, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 18542 + }, + { + "epoch": 0.16171879088102423, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 18543 + }, + { + "epoch": 0.1617275121661928, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 18544 + }, + { + "epoch": 0.1617362334513614, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 18545 + }, + { + "epoch": 0.16174495473652997, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 18546 + }, + { + "epoch": 0.16175367602169854, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 18547 + }, + { + "epoch": 0.16176239730686715, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 18548 + }, + { + "epoch": 0.16177111859203572, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 18549 + }, + { + "epoch": 0.16177983987720432, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 18550 + }, + { + "epoch": 0.1617885611623729, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 18551 + }, + { + "epoch": 0.16179728244754146, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 18552 + }, + { + "epoch": 0.16180600373271006, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 18553 + }, + { + "epoch": 0.16181472501787864, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 18554 + }, + { + "epoch": 0.1618234463030472, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 18555 + }, + { + "epoch": 0.1618321675882158, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 18556 + }, + { + "epoch": 0.16184088887338438, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 18557 + }, + { + "epoch": 0.16184961015855295, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 18558 + }, + { + "epoch": 0.16185833144372155, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 18559 + }, + { + "epoch": 0.16186705272889013, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 18560 + }, + { + "epoch": 0.1618757740140587, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 18561 + }, + { + "epoch": 0.1618844952992273, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 18562 + }, + { + "epoch": 0.16189321658439587, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 18563 + }, + { + "epoch": 0.16190193786956447, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 18564 + }, + { + "epoch": 0.16191065915473304, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 18565 + }, + { + "epoch": 0.16191938043990162, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 18566 + }, + { + "epoch": 0.16192810172507022, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 18567 + }, + { + "epoch": 0.1619368230102388, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 18568 + }, + { + "epoch": 0.16194554429540736, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 18569 + }, + { + "epoch": 0.16195426558057596, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 18570 + }, + { + "epoch": 0.16196298686574453, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 18571 + }, + { + "epoch": 0.1619717081509131, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 18572 + }, + { + "epoch": 0.1619804294360817, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 18573 + }, + { + "epoch": 0.16198915072125028, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 18574 + }, + { + "epoch": 0.16199787200641885, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 18575 + }, + { + "epoch": 0.16200659329158745, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 18576 + }, + { + "epoch": 0.16201531457675603, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 18577 + }, + { + "epoch": 0.16202403586192463, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 18578 + }, + { + "epoch": 0.1620327571470932, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 18579 + }, + { + "epoch": 0.16204147843226177, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 18580 + }, + { + "epoch": 0.16205019971743037, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 18581 + }, + { + "epoch": 0.16205892100259894, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 18582 + }, + { + "epoch": 0.16206764228776752, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 18583 + }, + { + "epoch": 0.16207636357293612, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 18584 + }, + { + "epoch": 0.1620850848581047, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 18585 + }, + { + "epoch": 0.16209380614327326, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 18586 + }, + { + "epoch": 0.16210252742844186, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 18587 + }, + { + "epoch": 0.16211124871361043, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 18588 + }, + { + "epoch": 0.162119969998779, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 18589 + }, + { + "epoch": 0.1621286912839476, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 18590 + }, + { + "epoch": 0.16213741256911618, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 18591 + }, + { + "epoch": 0.16214613385428478, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 18592 + }, + { + "epoch": 0.16215485513945335, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 18593 + }, + { + "epoch": 0.16216357642462192, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 18594 + }, + { + "epoch": 0.16217229770979052, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 18595 + }, + { + "epoch": 0.1621810189949591, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 18596 + }, + { + "epoch": 0.16218974028012767, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 18597 + }, + { + "epoch": 0.16219846156529627, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 18598 + }, + { + "epoch": 0.16220718285046484, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 18599 + }, + { + "epoch": 0.16221590413563342, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 18600 + }, + { + "epoch": 0.16222462542080202, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 18601 + }, + { + "epoch": 0.1622333467059706, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 18602 + }, + { + "epoch": 0.16224206799113916, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 18603 + }, + { + "epoch": 0.16225078927630776, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 18604 + }, + { + "epoch": 0.16225951056147633, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 18605 + }, + { + "epoch": 0.16226823184664493, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 18606 + }, + { + "epoch": 0.1622769531318135, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 18607 + }, + { + "epoch": 0.16228567441698208, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 18608 + }, + { + "epoch": 0.16229439570215068, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 18609 + }, + { + "epoch": 0.16230311698731925, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 18610 + }, + { + "epoch": 0.16231183827248782, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 18611 + }, + { + "epoch": 0.16232055955765642, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 18612 + }, + { + "epoch": 0.162329280842825, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 18613 + }, + { + "epoch": 0.16233800212799357, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 18614 + }, + { + "epoch": 0.16234672341316217, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 18615 + }, + { + "epoch": 0.16235544469833074, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 18616 + }, + { + "epoch": 0.16236416598349934, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 18617 + }, + { + "epoch": 0.16237288726866791, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 18618 + }, + { + "epoch": 0.1623816085538365, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 18619 + }, + { + "epoch": 0.1623903298390051, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 18620 + }, + { + "epoch": 0.16239905112417366, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 18621 + }, + { + "epoch": 0.16240777240934223, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 18622 + }, + { + "epoch": 0.16241649369451083, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 18623 + }, + { + "epoch": 0.1624252149796794, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 18624 + }, + { + "epoch": 0.16243393626484798, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 18625 + }, + { + "epoch": 0.16244265755001658, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 18626 + }, + { + "epoch": 0.16245137883518515, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 18627 + }, + { + "epoch": 0.16246010012035372, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 18628 + }, + { + "epoch": 0.16246882140552232, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 18629 + }, + { + "epoch": 0.1624775426906909, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 18630 + }, + { + "epoch": 0.1624862639758595, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 18631 + }, + { + "epoch": 0.16249498526102807, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 18632 + }, + { + "epoch": 0.16250370654619664, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 18633 + }, + { + "epoch": 0.16251242783136524, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 18634 + }, + { + "epoch": 0.16252114911653381, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 18635 + }, + { + "epoch": 0.1625298704017024, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 18636 + }, + { + "epoch": 0.162538591686871, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 18637 + }, + { + "epoch": 0.16254731297203956, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 18638 + }, + { + "epoch": 0.16255603425720813, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 18639 + }, + { + "epoch": 0.16256475554237673, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 18640 + }, + { + "epoch": 0.1625734768275453, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 18641 + }, + { + "epoch": 0.16258219811271388, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 18642 + }, + { + "epoch": 0.16259091939788248, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 18643 + }, + { + "epoch": 0.16259964068305105, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 18644 + }, + { + "epoch": 0.16260836196821965, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 18645 + }, + { + "epoch": 0.16261708325338822, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 18646 + }, + { + "epoch": 0.1626258045385568, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 18647 + }, + { + "epoch": 0.1626345258237254, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 18648 + }, + { + "epoch": 0.16264324710889397, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 18649 + }, + { + "epoch": 0.16265196839406254, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 18650 + }, + { + "epoch": 0.16266068967923114, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 18651 + }, + { + "epoch": 0.1626694109643997, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 18652 + }, + { + "epoch": 0.16267813224956829, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 18653 + }, + { + "epoch": 0.16268685353473689, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 18654 + }, + { + "epoch": 0.16269557481990546, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 18655 + }, + { + "epoch": 0.16270429610507403, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 18656 + }, + { + "epoch": 0.16271301739024263, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 18657 + }, + { + "epoch": 0.1627217386754112, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 18658 + }, + { + "epoch": 0.1627304599605798, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 18659 + }, + { + "epoch": 0.16273918124574838, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 18660 + }, + { + "epoch": 0.16274790253091695, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 18661 + }, + { + "epoch": 0.16275662381608555, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 18662 + }, + { + "epoch": 0.16276534510125412, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 18663 + }, + { + "epoch": 0.1627740663864227, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 18664 + }, + { + "epoch": 0.1627827876715913, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 18665 + }, + { + "epoch": 0.16279150895675987, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 18666 + }, + { + "epoch": 0.16280023024192844, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 18667 + }, + { + "epoch": 0.16280895152709704, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 18668 + }, + { + "epoch": 0.1628176728122656, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 18669 + }, + { + "epoch": 0.16282639409743419, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 18670 + }, + { + "epoch": 0.16283511538260279, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 18671 + }, + { + "epoch": 0.16284383666777136, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 18672 + }, + { + "epoch": 0.16285255795293996, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 18673 + }, + { + "epoch": 0.16286127923810853, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 18674 + }, + { + "epoch": 0.1628700005232771, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 18675 + }, + { + "epoch": 0.1628787218084457, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 18676 + }, + { + "epoch": 0.16288744309361428, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 18677 + }, + { + "epoch": 0.16289616437878285, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 18678 + }, + { + "epoch": 0.16290488566395145, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 18679 + }, + { + "epoch": 0.16291360694912002, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 18680 + }, + { + "epoch": 0.1629223282342886, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 18681 + }, + { + "epoch": 0.1629310495194572, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 18682 + }, + { + "epoch": 0.16293977080462577, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 18683 + }, + { + "epoch": 0.16294849208979434, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 18684 + }, + { + "epoch": 0.16295721337496294, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 18685 + }, + { + "epoch": 0.1629659346601315, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 18686 + }, + { + "epoch": 0.1629746559453001, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 18687 + }, + { + "epoch": 0.16298337723046868, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 18688 + }, + { + "epoch": 0.16299209851563726, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 18689 + }, + { + "epoch": 0.16300081980080586, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 18690 + }, + { + "epoch": 0.16300954108597443, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 18691 + }, + { + "epoch": 0.163018262371143, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 18692 + }, + { + "epoch": 0.1630269836563116, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 18693 + }, + { + "epoch": 0.16303570494148018, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 18694 + }, + { + "epoch": 0.16304442622664875, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 18695 + }, + { + "epoch": 0.16305314751181735, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 18696 + }, + { + "epoch": 0.16306186879698592, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 18697 + }, + { + "epoch": 0.1630705900821545, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 18698 + }, + { + "epoch": 0.1630793113673231, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 18699 + }, + { + "epoch": 0.16308803265249167, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 18700 + }, + { + "epoch": 0.16309675393766027, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 18701 + }, + { + "epoch": 0.16310547522282884, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 18702 + }, + { + "epoch": 0.1631141965079974, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 18703 + }, + { + "epoch": 0.163122917793166, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 18704 + }, + { + "epoch": 0.16313163907833458, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 18705 + }, + { + "epoch": 0.16314036036350316, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 18706 + }, + { + "epoch": 0.16314908164867176, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 18707 + }, + { + "epoch": 0.16315780293384033, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 18708 + }, + { + "epoch": 0.1631665242190089, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 18709 + }, + { + "epoch": 0.1631752455041775, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 18710 + }, + { + "epoch": 0.16318396678934607, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 18711 + }, + { + "epoch": 0.16319268807451465, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 18712 + }, + { + "epoch": 0.16320140935968325, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 18713 + }, + { + "epoch": 0.16321013064485182, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 18714 + }, + { + "epoch": 0.16321885193002042, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 18715 + }, + { + "epoch": 0.163227573215189, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 18716 + }, + { + "epoch": 0.16323629450035756, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 18717 + }, + { + "epoch": 0.16324501578552617, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 18718 + }, + { + "epoch": 0.16325373707069474, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 18719 + }, + { + "epoch": 0.1632624583558633, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 18720 + }, + { + "epoch": 0.1632711796410319, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 18721 + }, + { + "epoch": 0.16327990092620048, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 18722 + }, + { + "epoch": 0.16328862221136906, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 18723 + }, + { + "epoch": 0.16329734349653766, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 18724 + }, + { + "epoch": 0.16330606478170623, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 18725 + }, + { + "epoch": 0.1633147860668748, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 18726 + }, + { + "epoch": 0.1633235073520434, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 18727 + }, + { + "epoch": 0.16333222863721197, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 18728 + }, + { + "epoch": 0.16334094992238057, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 18729 + }, + { + "epoch": 0.16334967120754915, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 18730 + }, + { + "epoch": 0.16335839249271772, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 18731 + }, + { + "epoch": 0.16336711377788632, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 18732 + }, + { + "epoch": 0.1633758350630549, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 18733 + }, + { + "epoch": 0.16338455634822346, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 18734 + }, + { + "epoch": 0.16339327763339206, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 18735 + }, + { + "epoch": 0.16340199891856064, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 18736 + }, + { + "epoch": 0.1634107202037292, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 18737 + }, + { + "epoch": 0.1634194414888978, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 18738 + }, + { + "epoch": 0.16342816277406638, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 18739 + }, + { + "epoch": 0.16343688405923498, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 18740 + }, + { + "epoch": 0.16344560534440356, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 18741 + }, + { + "epoch": 0.16345432662957213, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 18742 + }, + { + "epoch": 0.16346304791474073, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 18743 + }, + { + "epoch": 0.1634717691999093, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 18744 + }, + { + "epoch": 0.16348049048507787, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 18745 + }, + { + "epoch": 0.16348921177024647, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 18746 + }, + { + "epoch": 0.16349793305541505, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 18747 + }, + { + "epoch": 0.16350665434058362, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 18748 + }, + { + "epoch": 0.16351537562575222, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 18749 + }, + { + "epoch": 0.1635240969109208, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 18750 + }, + { + "epoch": 0.16353281819608936, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 18751 + }, + { + "epoch": 0.16354153948125796, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 18752 + }, + { + "epoch": 0.16355026076642654, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 18753 + }, + { + "epoch": 0.16355898205159514, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 18754 + }, + { + "epoch": 0.1635677033367637, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 18755 + }, + { + "epoch": 0.16357642462193228, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 18756 + }, + { + "epoch": 0.16358514590710088, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 18757 + }, + { + "epoch": 0.16359386719226945, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 18758 + }, + { + "epoch": 0.16360258847743803, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 18759 + }, + { + "epoch": 0.16361130976260663, + "grad_norm": 0.443359375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 18760 + }, + { + "epoch": 0.1636200310477752, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 18761 + }, + { + "epoch": 0.16362875233294377, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 18762 + }, + { + "epoch": 0.16363747361811237, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 18763 + }, + { + "epoch": 0.16364619490328094, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 18764 + }, + { + "epoch": 0.16365491618844952, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 18765 + }, + { + "epoch": 0.16366363747361812, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 18766 + }, + { + "epoch": 0.1636723587587867, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 18767 + }, + { + "epoch": 0.1636810800439553, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 18768 + }, + { + "epoch": 0.16368980132912386, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 18769 + }, + { + "epoch": 0.16369852261429244, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 18770 + }, + { + "epoch": 0.16370724389946104, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 18771 + }, + { + "epoch": 0.1637159651846296, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 18772 + }, + { + "epoch": 0.16372468646979818, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 18773 + }, + { + "epoch": 0.16373340775496678, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 18774 + }, + { + "epoch": 0.16374212904013535, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 18775 + }, + { + "epoch": 0.16375085032530393, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 18776 + }, + { + "epoch": 0.16375957161047253, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 18777 + }, + { + "epoch": 0.1637682928956411, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 18778 + }, + { + "epoch": 0.16377701418080967, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 18779 + }, + { + "epoch": 0.16378573546597827, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 18780 + }, + { + "epoch": 0.16379445675114684, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 18781 + }, + { + "epoch": 0.16380317803631544, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 18782 + }, + { + "epoch": 0.16381189932148402, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 18783 + }, + { + "epoch": 0.1638206206066526, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 18784 + }, + { + "epoch": 0.1638293418918212, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 18785 + }, + { + "epoch": 0.16383806317698976, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 18786 + }, + { + "epoch": 0.16384678446215833, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 18787 + }, + { + "epoch": 0.16385550574732693, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 18788 + }, + { + "epoch": 0.1638642270324955, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 18789 + }, + { + "epoch": 0.16387294831766408, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 18790 + }, + { + "epoch": 0.16388166960283268, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 18791 + }, + { + "epoch": 0.16389039088800125, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 18792 + }, + { + "epoch": 0.16389911217316983, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 18793 + }, + { + "epoch": 0.16390783345833843, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 18794 + }, + { + "epoch": 0.163916554743507, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 18795 + }, + { + "epoch": 0.1639252760286756, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 18796 + }, + { + "epoch": 0.16393399731384417, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 18797 + }, + { + "epoch": 0.16394271859901274, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 18798 + }, + { + "epoch": 0.16395143988418134, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 18799 + }, + { + "epoch": 0.16396016116934992, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 18800 + }, + { + "epoch": 0.1639688824545185, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 18801 + }, + { + "epoch": 0.1639776037396871, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 18802 + }, + { + "epoch": 0.16398632502485566, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 18803 + }, + { + "epoch": 0.16399504631002423, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 18804 + }, + { + "epoch": 0.16400376759519283, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 18805 + }, + { + "epoch": 0.1640124888803614, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 18806 + }, + { + "epoch": 0.16402121016552998, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 18807 + }, + { + "epoch": 0.16402993145069858, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 18808 + }, + { + "epoch": 0.16403865273586715, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 18809 + }, + { + "epoch": 0.16404737402103575, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 18810 + }, + { + "epoch": 0.16405609530620432, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 18811 + }, + { + "epoch": 0.1640648165913729, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 18812 + }, + { + "epoch": 0.1640735378765415, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 18813 + }, + { + "epoch": 0.16408225916171007, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 18814 + }, + { + "epoch": 0.16409098044687864, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 18815 + }, + { + "epoch": 0.16409970173204724, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 18816 + }, + { + "epoch": 0.16410842301721582, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 18817 + }, + { + "epoch": 0.1641171443023844, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 18818 + }, + { + "epoch": 0.164125865587553, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 18819 + }, + { + "epoch": 0.16413458687272156, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 18820 + }, + { + "epoch": 0.16414330815789013, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 18821 + }, + { + "epoch": 0.16415202944305873, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 18822 + }, + { + "epoch": 0.1641607507282273, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 18823 + }, + { + "epoch": 0.1641694720133959, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 18824 + }, + { + "epoch": 0.16417819329856448, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 18825 + }, + { + "epoch": 0.16418691458373305, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 18826 + }, + { + "epoch": 0.16419563586890165, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 18827 + }, + { + "epoch": 0.16420435715407022, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 18828 + }, + { + "epoch": 0.1642130784392388, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 18829 + }, + { + "epoch": 0.1642217997244074, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 18830 + }, + { + "epoch": 0.16423052100957597, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 18831 + }, + { + "epoch": 0.16423924229474454, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 18832 + }, + { + "epoch": 0.16424796357991314, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 18833 + }, + { + "epoch": 0.16425668486508171, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 18834 + }, + { + "epoch": 0.1642654061502503, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 18835 + }, + { + "epoch": 0.1642741274354189, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 18836 + }, + { + "epoch": 0.16428284872058746, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 18837 + }, + { + "epoch": 0.16429157000575606, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 18838 + }, + { + "epoch": 0.16430029129092463, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 18839 + }, + { + "epoch": 0.1643090125760932, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 18840 + }, + { + "epoch": 0.1643177338612618, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 18841 + }, + { + "epoch": 0.16432645514643038, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 18842 + }, + { + "epoch": 0.16433517643159895, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 18843 + }, + { + "epoch": 0.16434389771676755, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 18844 + }, + { + "epoch": 0.16435261900193612, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 18845 + }, + { + "epoch": 0.1643613402871047, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 18846 + }, + { + "epoch": 0.1643700615722733, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 18847 + }, + { + "epoch": 0.16437878285744187, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 18848 + }, + { + "epoch": 0.16438750414261047, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 18849 + }, + { + "epoch": 0.16439622542777904, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 18850 + }, + { + "epoch": 0.1644049467129476, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 18851 + }, + { + "epoch": 0.16441366799811621, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 18852 + }, + { + "epoch": 0.1644223892832848, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 18853 + }, + { + "epoch": 0.16443111056845336, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 18854 + }, + { + "epoch": 0.16443983185362196, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 18855 + }, + { + "epoch": 0.16444855313879053, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 18856 + }, + { + "epoch": 0.1644572744239591, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 18857 + }, + { + "epoch": 0.1644659957091277, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 18858 + }, + { + "epoch": 0.16447471699429628, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 18859 + }, + { + "epoch": 0.16448343827946485, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 18860 + }, + { + "epoch": 0.16449215956463345, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 18861 + }, + { + "epoch": 0.16450088084980202, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 18862 + }, + { + "epoch": 0.16450960213497062, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 18863 + }, + { + "epoch": 0.1645183234201392, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 18864 + }, + { + "epoch": 0.16452704470530777, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 18865 + }, + { + "epoch": 0.16453576599047637, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 18866 + }, + { + "epoch": 0.16454448727564494, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 18867 + }, + { + "epoch": 0.1645532085608135, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 18868 + }, + { + "epoch": 0.1645619298459821, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 18869 + }, + { + "epoch": 0.16457065113115069, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 18870 + }, + { + "epoch": 0.16457937241631926, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 18871 + }, + { + "epoch": 0.16458809370148786, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 18872 + }, + { + "epoch": 0.16459681498665643, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 18873 + }, + { + "epoch": 0.164605536271825, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 18874 + }, + { + "epoch": 0.1646142575569936, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 18875 + }, + { + "epoch": 0.16462297884216218, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 18876 + }, + { + "epoch": 0.16463170012733078, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 18877 + }, + { + "epoch": 0.16464042141249935, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 18878 + }, + { + "epoch": 0.16464914269766792, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 18879 + }, + { + "epoch": 0.16465786398283652, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 18880 + }, + { + "epoch": 0.1646665852680051, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 18881 + }, + { + "epoch": 0.16467530655317367, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 18882 + }, + { + "epoch": 0.16468402783834227, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 18883 + }, + { + "epoch": 0.16469274912351084, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 18884 + }, + { + "epoch": 0.1647014704086794, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 18885 + }, + { + "epoch": 0.164710191693848, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 18886 + }, + { + "epoch": 0.16471891297901659, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 18887 + }, + { + "epoch": 0.16472763426418516, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 18888 + }, + { + "epoch": 0.16473635554935376, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 18889 + }, + { + "epoch": 0.16474507683452233, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 18890 + }, + { + "epoch": 0.16475379811969093, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 18891 + }, + { + "epoch": 0.1647625194048595, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 18892 + }, + { + "epoch": 0.16477124069002808, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 18893 + }, + { + "epoch": 0.16477996197519668, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 18894 + }, + { + "epoch": 0.16478868326036525, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 18895 + }, + { + "epoch": 0.16479740454553382, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 18896 + }, + { + "epoch": 0.16480612583070242, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 18897 + }, + { + "epoch": 0.164814847115871, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 18898 + }, + { + "epoch": 0.16482356840103957, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 18899 + }, + { + "epoch": 0.16483228968620817, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 18900 + }, + { + "epoch": 0.16484101097137674, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 18901 + }, + { + "epoch": 0.1648497322565453, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 18902 + }, + { + "epoch": 0.1648584535417139, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 18903 + }, + { + "epoch": 0.16486717482688248, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 18904 + }, + { + "epoch": 0.16487589611205108, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 18905 + }, + { + "epoch": 0.16488461739721966, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 18906 + }, + { + "epoch": 0.16489333868238823, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 18907 + }, + { + "epoch": 0.16490205996755683, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 18908 + }, + { + "epoch": 0.1649107812527254, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 18909 + }, + { + "epoch": 0.16491950253789397, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 18910 + }, + { + "epoch": 0.16492822382306258, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 18911 + }, + { + "epoch": 0.16493694510823115, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 18912 + }, + { + "epoch": 0.16494566639339972, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 18913 + }, + { + "epoch": 0.16495438767856832, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 18914 + }, + { + "epoch": 0.1649631089637369, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 18915 + }, + { + "epoch": 0.16497183024890547, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 18916 + }, + { + "epoch": 0.16498055153407407, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 18917 + }, + { + "epoch": 0.16498927281924264, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 18918 + }, + { + "epoch": 0.16499799410441124, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 18919 + }, + { + "epoch": 0.1650067153895798, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 18920 + }, + { + "epoch": 0.16501543667474838, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 18921 + }, + { + "epoch": 0.16502415795991698, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 18922 + }, + { + "epoch": 0.16503287924508556, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 18923 + }, + { + "epoch": 0.16504160053025413, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 18924 + }, + { + "epoch": 0.16505032181542273, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 18925 + }, + { + "epoch": 0.1650590431005913, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 18926 + }, + { + "epoch": 0.16506776438575987, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 18927 + }, + { + "epoch": 0.16507648567092847, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 18928 + }, + { + "epoch": 0.16508520695609705, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 18929 + }, + { + "epoch": 0.16509392824126562, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 18930 + }, + { + "epoch": 0.16510264952643422, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 18931 + }, + { + "epoch": 0.1651113708116028, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 18932 + }, + { + "epoch": 0.1651200920967714, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 18933 + }, + { + "epoch": 0.16512881338193997, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 18934 + }, + { + "epoch": 0.16513753466710854, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 18935 + }, + { + "epoch": 0.16514625595227714, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 18936 + }, + { + "epoch": 0.1651549772374457, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 18937 + }, + { + "epoch": 0.16516369852261428, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 18938 + }, + { + "epoch": 0.16517241980778288, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 18939 + }, + { + "epoch": 0.16518114109295146, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 18940 + }, + { + "epoch": 0.16518986237812003, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 18941 + }, + { + "epoch": 0.16519858366328863, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 18942 + }, + { + "epoch": 0.1652073049484572, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 18943 + }, + { + "epoch": 0.16521602623362577, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 18944 + }, + { + "epoch": 0.16522474751879437, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 18945 + }, + { + "epoch": 0.16523346880396295, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 18946 + }, + { + "epoch": 0.16524219008913155, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 18947 + }, + { + "epoch": 0.16525091137430012, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 18948 + }, + { + "epoch": 0.1652596326594687, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 18949 + }, + { + "epoch": 0.1652683539446373, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 18950 + }, + { + "epoch": 0.16527707522980586, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 18951 + }, + { + "epoch": 0.16528579651497444, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 18952 + }, + { + "epoch": 0.16529451780014304, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 18953 + }, + { + "epoch": 0.1653032390853116, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 18954 + }, + { + "epoch": 0.16531196037048018, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 18955 + }, + { + "epoch": 0.16532068165564878, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 18956 + }, + { + "epoch": 0.16532940294081735, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 18957 + }, + { + "epoch": 0.16533812422598596, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 18958 + }, + { + "epoch": 0.16534684551115453, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 18959 + }, + { + "epoch": 0.1653555667963231, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 18960 + }, + { + "epoch": 0.1653642880814917, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 18961 + }, + { + "epoch": 0.16537300936666027, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 18962 + }, + { + "epoch": 0.16538173065182885, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 18963 + }, + { + "epoch": 0.16539045193699745, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 18964 + }, + { + "epoch": 0.16539917322216602, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 18965 + }, + { + "epoch": 0.1654078945073346, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 18966 + }, + { + "epoch": 0.1654166157925032, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 18967 + }, + { + "epoch": 0.16542533707767176, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 18968 + }, + { + "epoch": 0.16543405836284034, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 18969 + }, + { + "epoch": 0.16544277964800894, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 18970 + }, + { + "epoch": 0.1654515009331775, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 18971 + }, + { + "epoch": 0.1654602222183461, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 18972 + }, + { + "epoch": 0.16546894350351468, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 18973 + }, + { + "epoch": 0.16547766478868325, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 18974 + }, + { + "epoch": 0.16548638607385185, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 18975 + }, + { + "epoch": 0.16549510735902043, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 18976 + }, + { + "epoch": 0.165503828644189, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 18977 + }, + { + "epoch": 0.1655125499293576, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 18978 + }, + { + "epoch": 0.16552127121452617, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 18979 + }, + { + "epoch": 0.16552999249969474, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 18980 + }, + { + "epoch": 0.16553871378486334, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 18981 + }, + { + "epoch": 0.16554743507003192, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 18982 + }, + { + "epoch": 0.1655561563552005, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 18983 + }, + { + "epoch": 0.1655648776403691, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 18984 + }, + { + "epoch": 0.16557359892553766, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 18985 + }, + { + "epoch": 0.16558232021070626, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 18986 + }, + { + "epoch": 0.16559104149587484, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 18987 + }, + { + "epoch": 0.1655997627810434, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 18988 + }, + { + "epoch": 0.165608484066212, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 18989 + }, + { + "epoch": 0.16561720535138058, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 18990 + }, + { + "epoch": 0.16562592663654915, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 18991 + }, + { + "epoch": 0.16563464792171775, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 18992 + }, + { + "epoch": 0.16564336920688633, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 18993 + }, + { + "epoch": 0.1656520904920549, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 18994 + }, + { + "epoch": 0.1656608117772235, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 18995 + }, + { + "epoch": 0.16566953306239207, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 18996 + }, + { + "epoch": 0.16567825434756064, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 18997 + }, + { + "epoch": 0.16568697563272924, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 18998 + }, + { + "epoch": 0.16569569691789782, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 18999 + }, + { + "epoch": 0.16570441820306642, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 19000 + }, + { + "epoch": 0.165713139488235, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 19001 + }, + { + "epoch": 0.16572186077340356, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 19002 + }, + { + "epoch": 0.16573058205857216, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 19003 + }, + { + "epoch": 0.16573930334374073, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 19004 + }, + { + "epoch": 0.1657480246289093, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 19005 + }, + { + "epoch": 0.1657567459140779, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 19006 + }, + { + "epoch": 0.16576546719924648, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 19007 + }, + { + "epoch": 0.16577418848441505, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 19008 + }, + { + "epoch": 0.16578290976958365, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 19009 + }, + { + "epoch": 0.16579163105475223, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 19010 + }, + { + "epoch": 0.1658003523399208, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 19011 + }, + { + "epoch": 0.1658090736250894, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 19012 + }, + { + "epoch": 0.16581779491025797, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 19013 + }, + { + "epoch": 0.16582651619542657, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 19014 + }, + { + "epoch": 0.16583523748059514, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 19015 + }, + { + "epoch": 0.16584395876576372, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 19016 + }, + { + "epoch": 0.16585268005093232, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 19017 + }, + { + "epoch": 0.1658614013361009, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 19018 + }, + { + "epoch": 0.16587012262126946, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 19019 + }, + { + "epoch": 0.16587884390643806, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 19020 + }, + { + "epoch": 0.16588756519160663, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 19021 + }, + { + "epoch": 0.1658962864767752, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 19022 + }, + { + "epoch": 0.1659050077619438, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 19023 + }, + { + "epoch": 0.16591372904711238, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 19024 + }, + { + "epoch": 0.16592245033228095, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 19025 + }, + { + "epoch": 0.16593117161744955, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 19026 + }, + { + "epoch": 0.16593989290261812, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 19027 + }, + { + "epoch": 0.16594861418778672, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 19028 + }, + { + "epoch": 0.1659573354729553, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 19029 + }, + { + "epoch": 0.16596605675812387, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 19030 + }, + { + "epoch": 0.16597477804329247, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 19031 + }, + { + "epoch": 0.16598349932846104, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 19032 + }, + { + "epoch": 0.16599222061362962, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 19033 + }, + { + "epoch": 0.16600094189879822, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 19034 + }, + { + "epoch": 0.1660096631839668, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 19035 + }, + { + "epoch": 0.16601838446913536, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 19036 + }, + { + "epoch": 0.16602710575430396, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 19037 + }, + { + "epoch": 0.16603582703947253, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 19038 + }, + { + "epoch": 0.1660445483246411, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 19039 + }, + { + "epoch": 0.1660532696098097, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 19040 + }, + { + "epoch": 0.16606199089497828, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 19041 + }, + { + "epoch": 0.16607071218014688, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 19042 + }, + { + "epoch": 0.16607943346531545, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 19043 + }, + { + "epoch": 0.16608815475048402, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 19044 + }, + { + "epoch": 0.16609687603565262, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 19045 + }, + { + "epoch": 0.1661055973208212, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 19046 + }, + { + "epoch": 0.16611431860598977, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 19047 + }, + { + "epoch": 0.16612303989115837, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 19048 + }, + { + "epoch": 0.16613176117632694, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 19049 + }, + { + "epoch": 0.16614048246149551, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 19050 + }, + { + "epoch": 0.16614920374666411, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 19051 + }, + { + "epoch": 0.1661579250318327, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 19052 + }, + { + "epoch": 0.16616664631700126, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 19053 + }, + { + "epoch": 0.16617536760216986, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 19054 + }, + { + "epoch": 0.16618408888733843, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 19055 + }, + { + "epoch": 0.16619281017250703, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 19056 + }, + { + "epoch": 0.1662015314576756, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 19057 + }, + { + "epoch": 0.16621025274284418, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 19058 + }, + { + "epoch": 0.16621897402801278, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 19059 + }, + { + "epoch": 0.16622769531318135, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 19060 + }, + { + "epoch": 0.16623641659834992, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 19061 + }, + { + "epoch": 0.16624513788351852, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 19062 + }, + { + "epoch": 0.1662538591686871, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 19063 + }, + { + "epoch": 0.16626258045385567, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 19064 + }, + { + "epoch": 0.16627130173902427, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 19065 + }, + { + "epoch": 0.16628002302419284, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 19066 + }, + { + "epoch": 0.1662887443093614, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 19067 + }, + { + "epoch": 0.16629746559453001, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 19068 + }, + { + "epoch": 0.1663061868796986, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 19069 + }, + { + "epoch": 0.1663149081648672, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 19070 + }, + { + "epoch": 0.16632362945003576, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 19071 + }, + { + "epoch": 0.16633235073520433, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 19072 + }, + { + "epoch": 0.16634107202037293, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 19073 + }, + { + "epoch": 0.1663497933055415, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 19074 + }, + { + "epoch": 0.16635851459071008, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 19075 + }, + { + "epoch": 0.16636723587587868, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 19076 + }, + { + "epoch": 0.16637595716104725, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 19077 + }, + { + "epoch": 0.16638467844621582, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 19078 + }, + { + "epoch": 0.16639339973138442, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 19079 + }, + { + "epoch": 0.166402121016553, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 19080 + }, + { + "epoch": 0.1664108423017216, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 19081 + }, + { + "epoch": 0.16641956358689017, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 19082 + }, + { + "epoch": 0.16642828487205874, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 19083 + }, + { + "epoch": 0.16643700615722734, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 19084 + }, + { + "epoch": 0.1664457274423959, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 19085 + }, + { + "epoch": 0.16645444872756449, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 19086 + }, + { + "epoch": 0.16646317001273309, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 19087 + }, + { + "epoch": 0.16647189129790166, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 19088 + }, + { + "epoch": 0.16648061258307023, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 19089 + }, + { + "epoch": 0.16648933386823883, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 19090 + }, + { + "epoch": 0.1664980551534074, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 19091 + }, + { + "epoch": 0.16650677643857598, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 19092 + }, + { + "epoch": 0.16651549772374458, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 19093 + }, + { + "epoch": 0.16652421900891315, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 19094 + }, + { + "epoch": 0.16653294029408175, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 19095 + }, + { + "epoch": 0.16654166157925032, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 19096 + }, + { + "epoch": 0.1665503828644189, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 19097 + }, + { + "epoch": 0.1665591041495875, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 19098 + }, + { + "epoch": 0.16656782543475607, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 19099 + }, + { + "epoch": 0.16657654671992464, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 19100 + }, + { + "epoch": 0.16658526800509324, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 19101 + }, + { + "epoch": 0.1665939892902618, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 19102 + }, + { + "epoch": 0.16660271057543038, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 19103 + }, + { + "epoch": 0.16661143186059899, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 19104 + }, + { + "epoch": 0.16662015314576756, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 19105 + }, + { + "epoch": 0.16662887443093613, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 19106 + }, + { + "epoch": 0.16663759571610473, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 19107 + }, + { + "epoch": 0.1666463170012733, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 19108 + }, + { + "epoch": 0.1666550382864419, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 19109 + }, + { + "epoch": 0.16666375957161048, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 19110 + }, + { + "epoch": 0.16667248085677905, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 19111 + }, + { + "epoch": 0.16668120214194765, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 19112 + }, + { + "epoch": 0.16668992342711622, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 19113 + }, + { + "epoch": 0.1666986447122848, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 19114 + }, + { + "epoch": 0.1667073659974534, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 19115 + }, + { + "epoch": 0.16671608728262197, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 19116 + }, + { + "epoch": 0.16672480856779054, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 19117 + }, + { + "epoch": 0.16673352985295914, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 19118 + }, + { + "epoch": 0.1667422511381277, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 19119 + }, + { + "epoch": 0.16675097242329628, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 19120 + }, + { + "epoch": 0.16675969370846488, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 19121 + }, + { + "epoch": 0.16676841499363346, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 19122 + }, + { + "epoch": 0.16677713627880206, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 19123 + }, + { + "epoch": 0.16678585756397063, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 19124 + }, + { + "epoch": 0.1667945788491392, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 19125 + }, + { + "epoch": 0.1668033001343078, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 19126 + }, + { + "epoch": 0.16681202141947638, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 19127 + }, + { + "epoch": 0.16682074270464495, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 19128 + }, + { + "epoch": 0.16682946398981355, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 19129 + }, + { + "epoch": 0.16683818527498212, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 19130 + }, + { + "epoch": 0.1668469065601507, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 19131 + }, + { + "epoch": 0.1668556278453193, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 19132 + }, + { + "epoch": 0.16686434913048787, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 19133 + }, + { + "epoch": 0.16687307041565644, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 19134 + }, + { + "epoch": 0.16688179170082504, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 19135 + }, + { + "epoch": 0.1668905129859936, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 19136 + }, + { + "epoch": 0.1668992342711622, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 19137 + }, + { + "epoch": 0.16690795555633078, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 19138 + }, + { + "epoch": 0.16691667684149936, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 19139 + }, + { + "epoch": 0.16692539812666796, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 19140 + }, + { + "epoch": 0.16693411941183653, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 19141 + }, + { + "epoch": 0.1669428406970051, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 19142 + }, + { + "epoch": 0.1669515619821737, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 19143 + }, + { + "epoch": 0.16696028326734227, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 19144 + }, + { + "epoch": 0.16696900455251085, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 19145 + }, + { + "epoch": 0.16697772583767945, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 19146 + }, + { + "epoch": 0.16698644712284802, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 19147 + }, + { + "epoch": 0.1669951684080166, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 19148 + }, + { + "epoch": 0.1670038896931852, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 19149 + }, + { + "epoch": 0.16701261097835376, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 19150 + }, + { + "epoch": 0.16702133226352237, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 19151 + }, + { + "epoch": 0.16703005354869094, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 19152 + }, + { + "epoch": 0.1670387748338595, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 19153 + }, + { + "epoch": 0.1670474961190281, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 19154 + }, + { + "epoch": 0.16705621740419668, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 19155 + }, + { + "epoch": 0.16706493868936526, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 19156 + }, + { + "epoch": 0.16707365997453386, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 19157 + }, + { + "epoch": 0.16708238125970243, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 19158 + }, + { + "epoch": 0.167091102544871, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 19159 + }, + { + "epoch": 0.1670998238300396, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 19160 + }, + { + "epoch": 0.16710854511520817, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 19161 + }, + { + "epoch": 0.16711726640037675, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 19162 + }, + { + "epoch": 0.16712598768554535, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 19163 + }, + { + "epoch": 0.16713470897071392, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 19164 + }, + { + "epoch": 0.16714343025588252, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 19165 + }, + { + "epoch": 0.1671521515410511, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 19166 + }, + { + "epoch": 0.16716087282621966, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 19167 + }, + { + "epoch": 0.16716959411138826, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 19168 + }, + { + "epoch": 0.16717831539655684, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 19169 + }, + { + "epoch": 0.1671870366817254, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 19170 + }, + { + "epoch": 0.167195757966894, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 19171 + }, + { + "epoch": 0.16720447925206258, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 19172 + }, + { + "epoch": 0.16721320053723115, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 19173 + }, + { + "epoch": 0.16722192182239975, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 19174 + }, + { + "epoch": 0.16723064310756833, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 19175 + }, + { + "epoch": 0.1672393643927369, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 19176 + }, + { + "epoch": 0.1672480856779055, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 19177 + }, + { + "epoch": 0.16725680696307407, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 19178 + }, + { + "epoch": 0.16726552824824267, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 19179 + }, + { + "epoch": 0.16727424953341125, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 19180 + }, + { + "epoch": 0.16728297081857982, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 19181 + }, + { + "epoch": 0.16729169210374842, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 19182 + }, + { + "epoch": 0.167300413388917, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 19183 + }, + { + "epoch": 0.16730913467408556, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 19184 + }, + { + "epoch": 0.16731785595925416, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 19185 + }, + { + "epoch": 0.16732657724442274, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 19186 + }, + { + "epoch": 0.1673352985295913, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 19187 + }, + { + "epoch": 0.1673440198147599, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 19188 + }, + { + "epoch": 0.16735274109992848, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 19189 + }, + { + "epoch": 0.16736146238509708, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 19190 + }, + { + "epoch": 0.16737018367026565, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 19191 + }, + { + "epoch": 0.16737890495543423, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 19192 + }, + { + "epoch": 0.16738762624060283, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 19193 + }, + { + "epoch": 0.1673963475257714, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 19194 + }, + { + "epoch": 0.16740506881093997, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 19195 + }, + { + "epoch": 0.16741379009610857, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 19196 + }, + { + "epoch": 0.16742251138127714, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 19197 + }, + { + "epoch": 0.16743123266644572, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 19198 + }, + { + "epoch": 0.16743995395161432, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 19199 + }, + { + "epoch": 0.1674486752367829, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 19200 + }, + { + "epoch": 0.16745739652195146, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 19201 + }, + { + "epoch": 0.16746611780712006, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 19202 + }, + { + "epoch": 0.16747483909228864, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 19203 + }, + { + "epoch": 0.16748356037745724, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 19204 + }, + { + "epoch": 0.1674922816626258, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 19205 + }, + { + "epoch": 0.16750100294779438, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 19206 + }, + { + "epoch": 0.16750972423296298, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 19207 + }, + { + "epoch": 0.16751844551813155, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 19208 + }, + { + "epoch": 0.16752716680330013, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 19209 + }, + { + "epoch": 0.16753588808846873, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 19210 + }, + { + "epoch": 0.1675446093736373, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 19211 + }, + { + "epoch": 0.16755333065880587, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 19212 + }, + { + "epoch": 0.16756205194397447, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 19213 + }, + { + "epoch": 0.16757077322914304, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 19214 + }, + { + "epoch": 0.16757949451431162, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 19215 + }, + { + "epoch": 0.16758821579948022, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 19216 + }, + { + "epoch": 0.1675969370846488, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 19217 + }, + { + "epoch": 0.1676056583698174, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 19218 + }, + { + "epoch": 0.16761437965498596, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 19219 + }, + { + "epoch": 0.16762310094015453, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 19220 + }, + { + "epoch": 0.16763182222532313, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 19221 + }, + { + "epoch": 0.1676405435104917, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 19222 + }, + { + "epoch": 0.16764926479566028, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 19223 + }, + { + "epoch": 0.16765798608082888, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 19224 + }, + { + "epoch": 0.16766670736599745, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 19225 + }, + { + "epoch": 0.16767542865116603, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 19226 + }, + { + "epoch": 0.16768414993633463, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 19227 + }, + { + "epoch": 0.1676928712215032, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 19228 + }, + { + "epoch": 0.16770159250667177, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 19229 + }, + { + "epoch": 0.16771031379184037, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 19230 + }, + { + "epoch": 0.16771903507700894, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 19231 + }, + { + "epoch": 0.16772775636217754, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 19232 + }, + { + "epoch": 0.16773647764734612, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 19233 + }, + { + "epoch": 0.1677451989325147, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 19234 + }, + { + "epoch": 0.1677539202176833, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 19235 + }, + { + "epoch": 0.16776264150285186, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 19236 + }, + { + "epoch": 0.16777136278802043, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 19237 + }, + { + "epoch": 0.16778008407318903, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 19238 + }, + { + "epoch": 0.1677888053583576, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 19239 + }, + { + "epoch": 0.16779752664352618, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 19240 + }, + { + "epoch": 0.16780624792869478, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 19241 + }, + { + "epoch": 0.16781496921386335, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 19242 + }, + { + "epoch": 0.16782369049903192, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 19243 + }, + { + "epoch": 0.16783241178420052, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 19244 + }, + { + "epoch": 0.1678411330693691, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 19245 + }, + { + "epoch": 0.1678498543545377, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 19246 + }, + { + "epoch": 0.16785857563970627, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 19247 + }, + { + "epoch": 0.16786729692487484, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 19248 + }, + { + "epoch": 0.16787601821004344, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 19249 + }, + { + "epoch": 0.16788473949521202, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 19250 + }, + { + "epoch": 0.1678934607803806, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 19251 + }, + { + "epoch": 0.1679021820655492, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 19252 + }, + { + "epoch": 0.16791090335071776, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 19253 + }, + { + "epoch": 0.16791962463588633, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 19254 + }, + { + "epoch": 0.16792834592105493, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 19255 + }, + { + "epoch": 0.1679370672062235, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 19256 + }, + { + "epoch": 0.16794578849139208, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 19257 + }, + { + "epoch": 0.16795450977656068, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 19258 + }, + { + "epoch": 0.16796323106172925, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 19259 + }, + { + "epoch": 0.16797195234689785, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 19260 + }, + { + "epoch": 0.16798067363206642, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 19261 + }, + { + "epoch": 0.167989394917235, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 19262 + }, + { + "epoch": 0.1679981162024036, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 19263 + }, + { + "epoch": 0.16800683748757217, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 19264 + }, + { + "epoch": 0.16801555877274074, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 19265 + }, + { + "epoch": 0.16802428005790934, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 19266 + }, + { + "epoch": 0.16803300134307791, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 19267 + }, + { + "epoch": 0.1680417226282465, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 19268 + }, + { + "epoch": 0.1680504439134151, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 19269 + }, + { + "epoch": 0.16805916519858366, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 19270 + }, + { + "epoch": 0.16806788648375223, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 19271 + }, + { + "epoch": 0.16807660776892083, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 19272 + }, + { + "epoch": 0.1680853290540894, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 19273 + }, + { + "epoch": 0.168094050339258, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 19274 + }, + { + "epoch": 0.16810277162442658, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 19275 + }, + { + "epoch": 0.16811149290959515, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 19276 + }, + { + "epoch": 0.16812021419476375, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 19277 + }, + { + "epoch": 0.16812893547993232, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 19278 + }, + { + "epoch": 0.1681376567651009, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 19279 + }, + { + "epoch": 0.1681463780502695, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 19280 + }, + { + "epoch": 0.16815509933543807, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 19281 + }, + { + "epoch": 0.16816382062060664, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 19282 + }, + { + "epoch": 0.16817254190577524, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 19283 + }, + { + "epoch": 0.1681812631909438, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 19284 + }, + { + "epoch": 0.1681899844761124, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 19285 + }, + { + "epoch": 0.168198705761281, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 19286 + }, + { + "epoch": 0.16820742704644956, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 19287 + }, + { + "epoch": 0.16821614833161816, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 19288 + }, + { + "epoch": 0.16822486961678673, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 19289 + }, + { + "epoch": 0.1682335909019553, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 19290 + }, + { + "epoch": 0.1682423121871239, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 19291 + }, + { + "epoch": 0.16825103347229248, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 19292 + }, + { + "epoch": 0.16825975475746105, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 19293 + }, + { + "epoch": 0.16826847604262965, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 19294 + }, + { + "epoch": 0.16827719732779822, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 19295 + }, + { + "epoch": 0.1682859186129668, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 19296 + }, + { + "epoch": 0.1682946398981354, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 19297 + }, + { + "epoch": 0.16830336118330397, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 19298 + }, + { + "epoch": 0.16831208246847254, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 19299 + }, + { + "epoch": 0.16832080375364114, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 19300 + }, + { + "epoch": 0.1683295250388097, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 19301 + }, + { + "epoch": 0.1683382463239783, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 19302 + }, + { + "epoch": 0.16834696760914689, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 19303 + }, + { + "epoch": 0.16835568889431546, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 19304 + }, + { + "epoch": 0.16836441017948406, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 19305 + }, + { + "epoch": 0.16837313146465263, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 19306 + }, + { + "epoch": 0.1683818527498212, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 19307 + }, + { + "epoch": 0.1683905740349898, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 19308 + }, + { + "epoch": 0.16839929532015838, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 19309 + }, + { + "epoch": 0.16840801660532695, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 19310 + }, + { + "epoch": 0.16841673789049555, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 19311 + }, + { + "epoch": 0.16842545917566412, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 19312 + }, + { + "epoch": 0.16843418046083272, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 19313 + }, + { + "epoch": 0.1684429017460013, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 19314 + }, + { + "epoch": 0.16845162303116987, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 19315 + }, + { + "epoch": 0.16846034431633847, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 19316 + }, + { + "epoch": 0.16846906560150704, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 19317 + }, + { + "epoch": 0.1684777868866756, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 19318 + }, + { + "epoch": 0.1684865081718442, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 19319 + }, + { + "epoch": 0.16849522945701279, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 19320 + }, + { + "epoch": 0.16850395074218136, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 19321 + }, + { + "epoch": 0.16851267202734996, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 19322 + }, + { + "epoch": 0.16852139331251853, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 19323 + }, + { + "epoch": 0.1685301145976871, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 19324 + }, + { + "epoch": 0.1685388358828557, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 19325 + }, + { + "epoch": 0.16854755716802428, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 19326 + }, + { + "epoch": 0.16855627845319288, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 19327 + }, + { + "epoch": 0.16856499973836145, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 19328 + }, + { + "epoch": 0.16857372102353002, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 19329 + }, + { + "epoch": 0.16858244230869862, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 19330 + }, + { + "epoch": 0.1685911635938672, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 19331 + }, + { + "epoch": 0.16859988487903577, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 19332 + }, + { + "epoch": 0.16860860616420437, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 19333 + }, + { + "epoch": 0.16861732744937294, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 19334 + }, + { + "epoch": 0.1686260487345415, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 19335 + }, + { + "epoch": 0.1686347700197101, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 19336 + }, + { + "epoch": 0.16864349130487868, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 19337 + }, + { + "epoch": 0.16865221259004726, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 19338 + }, + { + "epoch": 0.16866093387521586, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 19339 + }, + { + "epoch": 0.16866965516038443, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 19340 + }, + { + "epoch": 0.16867837644555303, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 19341 + }, + { + "epoch": 0.1686870977307216, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 19342 + }, + { + "epoch": 0.16869581901589017, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 19343 + }, + { + "epoch": 0.16870454030105878, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 19344 + }, + { + "epoch": 0.16871326158622735, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 19345 + }, + { + "epoch": 0.16872198287139592, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 19346 + }, + { + "epoch": 0.16873070415656452, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 19347 + }, + { + "epoch": 0.1687394254417331, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 19348 + }, + { + "epoch": 0.16874814672690167, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 19349 + }, + { + "epoch": 0.16875686801207027, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 19350 + }, + { + "epoch": 0.16876558929723884, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 19351 + }, + { + "epoch": 0.1687743105824074, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 19352 + }, + { + "epoch": 0.168783031867576, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 19353 + }, + { + "epoch": 0.16879175315274458, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 19354 + }, + { + "epoch": 0.16880047443791318, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 19355 + }, + { + "epoch": 0.16880919572308176, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 19356 + }, + { + "epoch": 0.16881791700825033, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 19357 + }, + { + "epoch": 0.16882663829341893, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 19358 + }, + { + "epoch": 0.1688353595785875, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 19359 + }, + { + "epoch": 0.16884408086375607, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 19360 + }, + { + "epoch": 0.16885280214892467, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 19361 + }, + { + "epoch": 0.16886152343409325, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 19362 + }, + { + "epoch": 0.16887024471926182, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 19363 + }, + { + "epoch": 0.16887896600443042, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 19364 + }, + { + "epoch": 0.168887687289599, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 19365 + }, + { + "epoch": 0.16889640857476756, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 19366 + }, + { + "epoch": 0.16890512985993617, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 19367 + }, + { + "epoch": 0.16891385114510474, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 19368 + }, + { + "epoch": 0.16892257243027334, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 19369 + }, + { + "epoch": 0.1689312937154419, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 19370 + }, + { + "epoch": 0.16894001500061048, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 19371 + }, + { + "epoch": 0.16894873628577908, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 19372 + }, + { + "epoch": 0.16895745757094766, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 19373 + }, + { + "epoch": 0.16896617885611623, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 19374 + }, + { + "epoch": 0.16897490014128483, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 19375 + }, + { + "epoch": 0.1689836214264534, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 19376 + }, + { + "epoch": 0.16899234271162197, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 19377 + }, + { + "epoch": 0.16900106399679057, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 19378 + }, + { + "epoch": 0.16900978528195915, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 19379 + }, + { + "epoch": 0.16901850656712772, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 19380 + }, + { + "epoch": 0.16902722785229632, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 19381 + }, + { + "epoch": 0.1690359491374649, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 19382 + }, + { + "epoch": 0.1690446704226335, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 19383 + }, + { + "epoch": 0.16905339170780206, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 19384 + }, + { + "epoch": 0.16906211299297064, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 19385 + }, + { + "epoch": 0.16907083427813924, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 19386 + }, + { + "epoch": 0.1690795555633078, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 19387 + }, + { + "epoch": 0.16908827684847638, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 19388 + }, + { + "epoch": 0.16909699813364498, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 19389 + }, + { + "epoch": 0.16910571941881355, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 19390 + }, + { + "epoch": 0.16911444070398213, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 19391 + }, + { + "epoch": 0.16912316198915073, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 19392 + }, + { + "epoch": 0.1691318832743193, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 19393 + }, + { + "epoch": 0.16914060455948787, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 19394 + }, + { + "epoch": 0.16914932584465647, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 19395 + }, + { + "epoch": 0.16915804712982505, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 19396 + }, + { + "epoch": 0.16916676841499365, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 19397 + }, + { + "epoch": 0.16917548970016222, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 19398 + }, + { + "epoch": 0.1691842109853308, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 19399 + }, + { + "epoch": 0.1691929322704994, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 19400 + }, + { + "epoch": 0.16920165355566796, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 19401 + }, + { + "epoch": 0.16921037484083654, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 19402 + }, + { + "epoch": 0.16921909612600514, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 19403 + }, + { + "epoch": 0.1692278174111737, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 19404 + }, + { + "epoch": 0.16923653869634228, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 19405 + }, + { + "epoch": 0.16924525998151088, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 19406 + }, + { + "epoch": 0.16925398126667945, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 19407 + }, + { + "epoch": 0.16926270255184803, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 19408 + }, + { + "epoch": 0.16927142383701663, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 19409 + }, + { + "epoch": 0.1692801451221852, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 19410 + }, + { + "epoch": 0.1692888664073538, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 19411 + }, + { + "epoch": 0.16929758769252237, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 19412 + }, + { + "epoch": 0.16930630897769094, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 19413 + }, + { + "epoch": 0.16931503026285954, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 19414 + }, + { + "epoch": 0.16932375154802812, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 19415 + }, + { + "epoch": 0.1693324728331967, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 19416 + }, + { + "epoch": 0.1693411941183653, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 19417 + }, + { + "epoch": 0.16934991540353386, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 19418 + }, + { + "epoch": 0.16935863668870244, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 19419 + }, + { + "epoch": 0.16936735797387104, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 19420 + }, + { + "epoch": 0.1693760792590396, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 19421 + }, + { + "epoch": 0.1693848005442082, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 19422 + }, + { + "epoch": 0.16939352182937678, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 19423 + }, + { + "epoch": 0.16940224311454535, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 19424 + }, + { + "epoch": 0.16941096439971395, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 19425 + }, + { + "epoch": 0.16941968568488253, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 19426 + }, + { + "epoch": 0.1694284069700511, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 19427 + }, + { + "epoch": 0.1694371282552197, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 19428 + }, + { + "epoch": 0.16944584954038827, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 19429 + }, + { + "epoch": 0.16945457082555684, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 19430 + }, + { + "epoch": 0.16946329211072544, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 19431 + }, + { + "epoch": 0.16947201339589402, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 19432 + }, + { + "epoch": 0.1694807346810626, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 19433 + }, + { + "epoch": 0.1694894559662312, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 19434 + }, + { + "epoch": 0.16949817725139976, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 19435 + }, + { + "epoch": 0.16950689853656836, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 19436 + }, + { + "epoch": 0.16951561982173693, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 19437 + }, + { + "epoch": 0.1695243411069055, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 19438 + }, + { + "epoch": 0.1695330623920741, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 19439 + }, + { + "epoch": 0.16954178367724268, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 19440 + }, + { + "epoch": 0.16955050496241125, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 19441 + }, + { + "epoch": 0.16955922624757985, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 19442 + }, + { + "epoch": 0.16956794753274843, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 19443 + }, + { + "epoch": 0.169576668817917, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 19444 + }, + { + "epoch": 0.1695853901030856, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 19445 + }, + { + "epoch": 0.16959411138825417, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 19446 + }, + { + "epoch": 0.16960283267342274, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 19447 + }, + { + "epoch": 0.16961155395859134, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 19448 + }, + { + "epoch": 0.16962027524375992, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 19449 + }, + { + "epoch": 0.16962899652892852, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 19450 + }, + { + "epoch": 0.1696377178140971, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 19451 + }, + { + "epoch": 0.16964643909926566, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 19452 + }, + { + "epoch": 0.16965516038443426, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 19453 + }, + { + "epoch": 0.16966388166960283, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 19454 + }, + { + "epoch": 0.1696726029547714, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 19455 + }, + { + "epoch": 0.16968132423994, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 19456 + }, + { + "epoch": 0.16969004552510858, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 19457 + }, + { + "epoch": 0.16969876681027715, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 19458 + }, + { + "epoch": 0.16970748809544575, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 19459 + }, + { + "epoch": 0.16971620938061432, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 19460 + }, + { + "epoch": 0.1697249306657829, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 19461 + }, + { + "epoch": 0.1697336519509515, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 19462 + }, + { + "epoch": 0.16974237323612007, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 19463 + }, + { + "epoch": 0.16975109452128867, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 19464 + }, + { + "epoch": 0.16975981580645724, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 19465 + }, + { + "epoch": 0.16976853709162582, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 19466 + }, + { + "epoch": 0.16977725837679442, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 19467 + }, + { + "epoch": 0.169785979661963, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 19468 + }, + { + "epoch": 0.16979470094713156, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 19469 + }, + { + "epoch": 0.16980342223230016, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 19470 + }, + { + "epoch": 0.16981214351746873, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 19471 + }, + { + "epoch": 0.1698208648026373, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 19472 + }, + { + "epoch": 0.1698295860878059, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 19473 + }, + { + "epoch": 0.16983830737297448, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 19474 + }, + { + "epoch": 0.16984702865814305, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 19475 + }, + { + "epoch": 0.16985574994331165, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 19476 + }, + { + "epoch": 0.16986447122848022, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 19477 + }, + { + "epoch": 0.16987319251364882, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 19478 + }, + { + "epoch": 0.1698819137988174, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 19479 + }, + { + "epoch": 0.16989063508398597, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 19480 + }, + { + "epoch": 0.16989935636915457, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 19481 + }, + { + "epoch": 0.16990807765432314, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 19482 + }, + { + "epoch": 0.16991679893949171, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 19483 + }, + { + "epoch": 0.16992552022466031, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 19484 + }, + { + "epoch": 0.1699342415098289, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 19485 + }, + { + "epoch": 0.16994296279499746, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 19486 + }, + { + "epoch": 0.16995168408016606, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 19487 + }, + { + "epoch": 0.16996040536533463, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 19488 + }, + { + "epoch": 0.1699691266505032, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 19489 + }, + { + "epoch": 0.1699778479356718, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 19490 + }, + { + "epoch": 0.16998656922084038, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 19491 + }, + { + "epoch": 0.16999529050600898, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 19492 + }, + { + "epoch": 0.17000401179117755, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 19493 + }, + { + "epoch": 0.17001273307634612, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 19494 + }, + { + "epoch": 0.17002145436151472, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 19495 + }, + { + "epoch": 0.1700301756466833, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 19496 + }, + { + "epoch": 0.17003889693185187, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 19497 + }, + { + "epoch": 0.17004761821702047, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 19498 + }, + { + "epoch": 0.17005633950218904, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 19499 + }, + { + "epoch": 0.1700650607873576, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 19500 + }, + { + "epoch": 0.17007378207252621, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 19501 + }, + { + "epoch": 0.1700825033576948, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 19502 + }, + { + "epoch": 0.17009122464286336, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 19503 + }, + { + "epoch": 0.17009994592803196, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 19504 + }, + { + "epoch": 0.17010866721320053, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 19505 + }, + { + "epoch": 0.17011738849836913, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 19506 + }, + { + "epoch": 0.1701261097835377, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 19507 + }, + { + "epoch": 0.17013483106870628, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 19508 + }, + { + "epoch": 0.17014355235387488, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 19509 + }, + { + "epoch": 0.17015227363904345, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 19510 + }, + { + "epoch": 0.17016099492421202, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 19511 + }, + { + "epoch": 0.17016971620938062, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 19512 + }, + { + "epoch": 0.1701784374945492, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 19513 + }, + { + "epoch": 0.17018715877971777, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 19514 + }, + { + "epoch": 0.17019588006488637, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 19515 + }, + { + "epoch": 0.17020460135005494, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 19516 + }, + { + "epoch": 0.1702133226352235, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 19517 + }, + { + "epoch": 0.1702220439203921, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 19518 + }, + { + "epoch": 0.17023076520556069, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 19519 + }, + { + "epoch": 0.17023948649072929, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 19520 + }, + { + "epoch": 0.17024820777589786, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 19521 + }, + { + "epoch": 0.17025692906106643, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 19522 + }, + { + "epoch": 0.17026565034623503, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 19523 + }, + { + "epoch": 0.1702743716314036, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 19524 + }, + { + "epoch": 0.17028309291657218, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 19525 + }, + { + "epoch": 0.17029181420174078, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 19526 + }, + { + "epoch": 0.17030053548690935, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 19527 + }, + { + "epoch": 0.17030925677207792, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 19528 + }, + { + "epoch": 0.17031797805724652, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 19529 + }, + { + "epoch": 0.1703266993424151, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 19530 + }, + { + "epoch": 0.17033542062758367, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 19531 + }, + { + "epoch": 0.17034414191275227, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 19532 + }, + { + "epoch": 0.17035286319792084, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 19533 + }, + { + "epoch": 0.17036158448308944, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 19534 + }, + { + "epoch": 0.170370305768258, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 19535 + }, + { + "epoch": 0.17037902705342658, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 19536 + }, + { + "epoch": 0.17038774833859519, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 19537 + }, + { + "epoch": 0.17039646962376376, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 19538 + }, + { + "epoch": 0.17040519090893233, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 19539 + }, + { + "epoch": 0.17041391219410093, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 19540 + }, + { + "epoch": 0.1704226334792695, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 19541 + }, + { + "epoch": 0.17043135476443808, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 19542 + }, + { + "epoch": 0.17044007604960668, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 19543 + }, + { + "epoch": 0.17044879733477525, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 19544 + }, + { + "epoch": 0.17045751861994385, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 19545 + }, + { + "epoch": 0.17046623990511242, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 19546 + }, + { + "epoch": 0.170474961190281, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 19547 + }, + { + "epoch": 0.1704836824754496, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 19548 + }, + { + "epoch": 0.17049240376061817, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 19549 + }, + { + "epoch": 0.17050112504578674, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 19550 + }, + { + "epoch": 0.17050984633095534, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 19551 + }, + { + "epoch": 0.1705185676161239, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 19552 + }, + { + "epoch": 0.17052728890129248, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 19553 + }, + { + "epoch": 0.17053601018646108, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 19554 + }, + { + "epoch": 0.17054473147162966, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 19555 + }, + { + "epoch": 0.17055345275679823, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 19556 + }, + { + "epoch": 0.17056217404196683, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 19557 + }, + { + "epoch": 0.1705708953271354, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 19558 + }, + { + "epoch": 0.170579616612304, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 19559 + }, + { + "epoch": 0.17058833789747258, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 19560 + }, + { + "epoch": 0.17059705918264115, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 19561 + }, + { + "epoch": 0.17060578046780975, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 19562 + }, + { + "epoch": 0.17061450175297832, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 19563 + }, + { + "epoch": 0.1706232230381469, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 19564 + }, + { + "epoch": 0.1706319443233155, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 19565 + }, + { + "epoch": 0.17064066560848407, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 19566 + }, + { + "epoch": 0.17064938689365264, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 19567 + }, + { + "epoch": 0.17065810817882124, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 19568 + }, + { + "epoch": 0.1706668294639898, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 19569 + }, + { + "epoch": 0.17067555074915838, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 19570 + }, + { + "epoch": 0.17068427203432698, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 19571 + }, + { + "epoch": 0.17069299331949556, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 19572 + }, + { + "epoch": 0.17070171460466416, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 19573 + }, + { + "epoch": 0.17071043588983273, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 19574 + }, + { + "epoch": 0.1707191571750013, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 19575 + }, + { + "epoch": 0.1707278784601699, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 19576 + }, + { + "epoch": 0.17073659974533847, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 19577 + }, + { + "epoch": 0.17074532103050705, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 19578 + }, + { + "epoch": 0.17075404231567565, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 19579 + }, + { + "epoch": 0.17076276360084422, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 19580 + }, + { + "epoch": 0.1707714848860128, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 19581 + }, + { + "epoch": 0.1707802061711814, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 19582 + }, + { + "epoch": 0.17078892745634996, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 19583 + }, + { + "epoch": 0.17079764874151854, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 19584 + }, + { + "epoch": 0.17080637002668714, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 19585 + }, + { + "epoch": 0.1708150913118557, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 19586 + }, + { + "epoch": 0.1708238125970243, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 19587 + }, + { + "epoch": 0.17083253388219288, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 19588 + }, + { + "epoch": 0.17084125516736146, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 19589 + }, + { + "epoch": 0.17084997645253006, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 19590 + }, + { + "epoch": 0.17085869773769863, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 19591 + }, + { + "epoch": 0.1708674190228672, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 19592 + }, + { + "epoch": 0.1708761403080358, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 19593 + }, + { + "epoch": 0.17088486159320437, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 19594 + }, + { + "epoch": 0.17089358287837295, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 19595 + }, + { + "epoch": 0.17090230416354155, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 19596 + }, + { + "epoch": 0.17091102544871012, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 19597 + }, + { + "epoch": 0.1709197467338787, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 19598 + }, + { + "epoch": 0.1709284680190473, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 19599 + }, + { + "epoch": 0.17093718930421586, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 19600 + }, + { + "epoch": 0.17094591058938446, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 19601 + }, + { + "epoch": 0.17095463187455304, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 19602 + }, + { + "epoch": 0.1709633531597216, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 19603 + }, + { + "epoch": 0.1709720744448902, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 19604 + }, + { + "epoch": 0.17098079573005878, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 19605 + }, + { + "epoch": 0.17098951701522735, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 19606 + }, + { + "epoch": 0.17099823830039595, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 19607 + }, + { + "epoch": 0.17100695958556453, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 19608 + }, + { + "epoch": 0.1710156808707331, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 19609 + }, + { + "epoch": 0.1710244021559017, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 19610 + }, + { + "epoch": 0.17103312344107027, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 19611 + }, + { + "epoch": 0.17104184472623885, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 19612 + }, + { + "epoch": 0.17105056601140745, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 19613 + }, + { + "epoch": 0.17105928729657602, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 19614 + }, + { + "epoch": 0.17106800858174462, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 19615 + }, + { + "epoch": 0.1710767298669132, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 19616 + }, + { + "epoch": 0.17108545115208176, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 19617 + }, + { + "epoch": 0.17109417243725036, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 19618 + }, + { + "epoch": 0.17110289372241894, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 19619 + }, + { + "epoch": 0.1711116150075875, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 19620 + }, + { + "epoch": 0.1711203362927561, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 19621 + }, + { + "epoch": 0.17112905757792468, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 19622 + }, + { + "epoch": 0.17113777886309325, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 19623 + }, + { + "epoch": 0.17114650014826185, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 19624 + }, + { + "epoch": 0.17115522143343043, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 19625 + }, + { + "epoch": 0.171163942718599, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 19626 + }, + { + "epoch": 0.1711726640037676, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 19627 + }, + { + "epoch": 0.17118138528893617, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 19628 + }, + { + "epoch": 0.17119010657410477, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 19629 + }, + { + "epoch": 0.17119882785927334, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 19630 + }, + { + "epoch": 0.17120754914444192, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 19631 + }, + { + "epoch": 0.17121627042961052, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 19632 + }, + { + "epoch": 0.1712249917147791, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 19633 + }, + { + "epoch": 0.17123371299994766, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 19634 + }, + { + "epoch": 0.17124243428511626, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 19635 + }, + { + "epoch": 0.17125115557028484, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 19636 + }, + { + "epoch": 0.1712598768554534, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 19637 + }, + { + "epoch": 0.171268598140622, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 19638 + }, + { + "epoch": 0.17127731942579058, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 19639 + }, + { + "epoch": 0.17128604071095915, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 19640 + }, + { + "epoch": 0.17129476199612775, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 19641 + }, + { + "epoch": 0.17130348328129633, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 19642 + }, + { + "epoch": 0.17131220456646493, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 19643 + }, + { + "epoch": 0.1713209258516335, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 19644 + }, + { + "epoch": 0.17132964713680207, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 19645 + }, + { + "epoch": 0.17133836842197067, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 19646 + }, + { + "epoch": 0.17134708970713924, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 19647 + }, + { + "epoch": 0.17135581099230782, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 19648 + }, + { + "epoch": 0.17136453227747642, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 19649 + }, + { + "epoch": 0.171373253562645, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 19650 + }, + { + "epoch": 0.17138197484781356, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 19651 + }, + { + "epoch": 0.17139069613298216, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 19652 + }, + { + "epoch": 0.17139941741815073, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 19653 + }, + { + "epoch": 0.17140813870331933, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 19654 + }, + { + "epoch": 0.1714168599884879, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 19655 + }, + { + "epoch": 0.17142558127365648, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 19656 + }, + { + "epoch": 0.17143430255882508, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 19657 + }, + { + "epoch": 0.17144302384399365, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 19658 + }, + { + "epoch": 0.17145174512916223, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 19659 + }, + { + "epoch": 0.17146046641433083, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 19660 + }, + { + "epoch": 0.1714691876994994, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 19661 + }, + { + "epoch": 0.17147790898466797, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 19662 + }, + { + "epoch": 0.17148663026983657, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 19663 + }, + { + "epoch": 0.17149535155500514, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 19664 + }, + { + "epoch": 0.17150407284017372, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 19665 + }, + { + "epoch": 0.17151279412534232, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 19666 + }, + { + "epoch": 0.1715215154105109, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 19667 + }, + { + "epoch": 0.1715302366956795, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 19668 + }, + { + "epoch": 0.17153895798084806, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 19669 + }, + { + "epoch": 0.17154767926601663, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 19670 + }, + { + "epoch": 0.17155640055118523, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 19671 + }, + { + "epoch": 0.1715651218363538, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 19672 + }, + { + "epoch": 0.17157384312152238, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 19673 + }, + { + "epoch": 0.17158256440669098, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 19674 + }, + { + "epoch": 0.17159128569185955, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 19675 + }, + { + "epoch": 0.17160000697702812, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 19676 + }, + { + "epoch": 0.17160872826219672, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 19677 + }, + { + "epoch": 0.1716174495473653, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 19678 + }, + { + "epoch": 0.17162617083253387, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 19679 + }, + { + "epoch": 0.17163489211770247, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 19680 + }, + { + "epoch": 0.17164361340287104, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 19681 + }, + { + "epoch": 0.17165233468803964, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 19682 + }, + { + "epoch": 0.17166105597320822, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 19683 + }, + { + "epoch": 0.1716697772583768, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 19684 + }, + { + "epoch": 0.1716784985435454, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 19685 + }, + { + "epoch": 0.17168721982871396, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 19686 + }, + { + "epoch": 0.17169594111388253, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 19687 + }, + { + "epoch": 0.17170466239905113, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 19688 + }, + { + "epoch": 0.1717133836842197, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 19689 + }, + { + "epoch": 0.17172210496938828, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 19690 + }, + { + "epoch": 0.17173082625455688, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 19691 + }, + { + "epoch": 0.17173954753972545, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 19692 + }, + { + "epoch": 0.17174826882489402, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 19693 + }, + { + "epoch": 0.17175699011006262, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 19694 + }, + { + "epoch": 0.1717657113952312, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 19695 + }, + { + "epoch": 0.1717744326803998, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 19696 + }, + { + "epoch": 0.17178315396556837, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 19697 + }, + { + "epoch": 0.17179187525073694, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 19698 + }, + { + "epoch": 0.17180059653590554, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 19699 + }, + { + "epoch": 0.17180931782107411, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 19700 + }, + { + "epoch": 0.1718180391062427, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 19701 + }, + { + "epoch": 0.1718267603914113, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 19702 + }, + { + "epoch": 0.17183548167657986, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 19703 + }, + { + "epoch": 0.17184420296174843, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 19704 + }, + { + "epoch": 0.17185292424691703, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 19705 + }, + { + "epoch": 0.1718616455320856, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 19706 + }, + { + "epoch": 0.17187036681725418, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 19707 + }, + { + "epoch": 0.17187908810242278, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 19708 + }, + { + "epoch": 0.17188780938759135, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 19709 + }, + { + "epoch": 0.17189653067275995, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 19710 + }, + { + "epoch": 0.17190525195792852, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 19711 + }, + { + "epoch": 0.1719139732430971, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 19712 + }, + { + "epoch": 0.1719226945282657, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 19713 + }, + { + "epoch": 0.17193141581343427, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 19714 + }, + { + "epoch": 0.17194013709860284, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 19715 + }, + { + "epoch": 0.17194885838377144, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 19716 + }, + { + "epoch": 0.17195757966894, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 19717 + }, + { + "epoch": 0.1719663009541086, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 19718 + }, + { + "epoch": 0.1719750222392772, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 19719 + }, + { + "epoch": 0.17198374352444576, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 19720 + }, + { + "epoch": 0.17199246480961433, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 19721 + }, + { + "epoch": 0.17200118609478293, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 19722 + }, + { + "epoch": 0.1720099073799515, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 19723 + }, + { + "epoch": 0.1720186286651201, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 19724 + }, + { + "epoch": 0.17202734995028868, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 19725 + }, + { + "epoch": 0.17203607123545725, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 19726 + }, + { + "epoch": 0.17204479252062585, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 19727 + }, + { + "epoch": 0.17205351380579442, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 19728 + }, + { + "epoch": 0.172062235090963, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 19729 + }, + { + "epoch": 0.1720709563761316, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 19730 + }, + { + "epoch": 0.17207967766130017, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 19731 + }, + { + "epoch": 0.17208839894646874, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 19732 + }, + { + "epoch": 0.17209712023163734, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 19733 + }, + { + "epoch": 0.1721058415168059, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 19734 + }, + { + "epoch": 0.17211456280197449, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 19735 + }, + { + "epoch": 0.17212328408714309, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 19736 + }, + { + "epoch": 0.17213200537231166, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 19737 + }, + { + "epoch": 0.17214072665748026, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 19738 + }, + { + "epoch": 0.17214944794264883, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 19739 + }, + { + "epoch": 0.1721581692278174, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 19740 + }, + { + "epoch": 0.172166890512986, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 19741 + }, + { + "epoch": 0.17217561179815458, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 19742 + }, + { + "epoch": 0.17218433308332315, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 19743 + }, + { + "epoch": 0.17219305436849175, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 19744 + }, + { + "epoch": 0.17220177565366032, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 19745 + }, + { + "epoch": 0.1722104969388289, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 19746 + }, + { + "epoch": 0.1722192182239975, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 19747 + }, + { + "epoch": 0.17222793950916607, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 19748 + }, + { + "epoch": 0.17223666079433464, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 19749 + }, + { + "epoch": 0.17224538207950324, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 19750 + }, + { + "epoch": 0.1722541033646718, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 19751 + }, + { + "epoch": 0.1722628246498404, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 19752 + }, + { + "epoch": 0.17227154593500899, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 19753 + }, + { + "epoch": 0.17228026722017756, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 19754 + }, + { + "epoch": 0.17228898850534616, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 19755 + }, + { + "epoch": 0.17229770979051473, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 19756 + }, + { + "epoch": 0.1723064310756833, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 19757 + }, + { + "epoch": 0.1723151523608519, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 19758 + }, + { + "epoch": 0.17232387364602048, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 19759 + }, + { + "epoch": 0.17233259493118905, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 19760 + }, + { + "epoch": 0.17234131621635765, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 19761 + }, + { + "epoch": 0.17235003750152622, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 19762 + }, + { + "epoch": 0.17235875878669482, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 19763 + }, + { + "epoch": 0.1723674800718634, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 19764 + }, + { + "epoch": 0.17237620135703197, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 19765 + }, + { + "epoch": 0.17238492264220057, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 19766 + }, + { + "epoch": 0.17239364392736914, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 19767 + }, + { + "epoch": 0.1724023652125377, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 19768 + }, + { + "epoch": 0.1724110864977063, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 19769 + }, + { + "epoch": 0.17241980778287488, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 19770 + }, + { + "epoch": 0.17242852906804346, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 19771 + }, + { + "epoch": 0.17243725035321206, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 19772 + }, + { + "epoch": 0.17244597163838063, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 19773 + }, + { + "epoch": 0.1724546929235492, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 19774 + }, + { + "epoch": 0.1724634142087178, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 19775 + }, + { + "epoch": 0.17247213549388637, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 19776 + }, + { + "epoch": 0.17248085677905498, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 19777 + }, + { + "epoch": 0.17248957806422355, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 19778 + }, + { + "epoch": 0.17249829934939212, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 19779 + }, + { + "epoch": 0.17250702063456072, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 19780 + }, + { + "epoch": 0.1725157419197293, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 19781 + }, + { + "epoch": 0.17252446320489787, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 19782 + }, + { + "epoch": 0.17253318449006647, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 19783 + }, + { + "epoch": 0.17254190577523504, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 19784 + }, + { + "epoch": 0.1725506270604036, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 19785 + }, + { + "epoch": 0.1725593483455722, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 19786 + }, + { + "epoch": 0.17256806963074078, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 19787 + }, + { + "epoch": 0.17257679091590936, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 19788 + }, + { + "epoch": 0.17258551220107796, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 19789 + }, + { + "epoch": 0.17259423348624653, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 19790 + }, + { + "epoch": 0.17260295477141513, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 19791 + }, + { + "epoch": 0.1726116760565837, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 19792 + }, + { + "epoch": 0.17262039734175227, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 19793 + }, + { + "epoch": 0.17262911862692087, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 19794 + }, + { + "epoch": 0.17263783991208945, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 19795 + }, + { + "epoch": 0.17264656119725802, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 19796 + }, + { + "epoch": 0.17265528248242662, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 19797 + }, + { + "epoch": 0.1726640037675952, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 19798 + }, + { + "epoch": 0.17267272505276376, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 19799 + }, + { + "epoch": 0.17268144633793236, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 19800 + }, + { + "epoch": 0.17269016762310094, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 19801 + }, + { + "epoch": 0.1726988889082695, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 19802 + }, + { + "epoch": 0.1727076101934381, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 19803 + }, + { + "epoch": 0.17271633147860668, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 19804 + }, + { + "epoch": 0.17272505276377528, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 19805 + }, + { + "epoch": 0.17273377404894386, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 19806 + }, + { + "epoch": 0.17274249533411243, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 19807 + }, + { + "epoch": 0.17275121661928103, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 19808 + }, + { + "epoch": 0.1727599379044496, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 19809 + }, + { + "epoch": 0.17276865918961817, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 19810 + }, + { + "epoch": 0.17277738047478677, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 19811 + }, + { + "epoch": 0.17278610175995535, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 19812 + }, + { + "epoch": 0.17279482304512392, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 19813 + }, + { + "epoch": 0.17280354433029252, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 19814 + }, + { + "epoch": 0.1728122656154611, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 19815 + }, + { + "epoch": 0.17282098690062966, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 19816 + }, + { + "epoch": 0.17282970818579826, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 19817 + }, + { + "epoch": 0.17283842947096684, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 19818 + }, + { + "epoch": 0.17284715075613544, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 19819 + }, + { + "epoch": 0.172855872041304, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 19820 + }, + { + "epoch": 0.17286459332647258, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 19821 + }, + { + "epoch": 0.17287331461164118, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 19822 + }, + { + "epoch": 0.17288203589680975, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 19823 + }, + { + "epoch": 0.17289075718197833, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 19824 + }, + { + "epoch": 0.17289947846714693, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 19825 + }, + { + "epoch": 0.1729081997523155, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 19826 + }, + { + "epoch": 0.17291692103748407, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 19827 + }, + { + "epoch": 0.17292564232265267, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 19828 + }, + { + "epoch": 0.17293436360782125, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 19829 + }, + { + "epoch": 0.17294308489298982, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 19830 + }, + { + "epoch": 0.17295180617815842, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 19831 + }, + { + "epoch": 0.172960527463327, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 19832 + }, + { + "epoch": 0.1729692487484956, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 19833 + }, + { + "epoch": 0.17297797003366416, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 19834 + }, + { + "epoch": 0.17298669131883274, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 19835 + }, + { + "epoch": 0.17299541260400134, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 19836 + }, + { + "epoch": 0.1730041338891699, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 19837 + }, + { + "epoch": 0.17301285517433848, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 19838 + }, + { + "epoch": 0.17302157645950708, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 19839 + }, + { + "epoch": 0.17303029774467565, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 19840 + }, + { + "epoch": 0.17303901902984423, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 19841 + }, + { + "epoch": 0.17304774031501283, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 19842 + }, + { + "epoch": 0.1730564616001814, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 19843 + }, + { + "epoch": 0.17306518288534997, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 19844 + }, + { + "epoch": 0.17307390417051857, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 19845 + }, + { + "epoch": 0.17308262545568714, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 19846 + }, + { + "epoch": 0.17309134674085574, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 19847 + }, + { + "epoch": 0.17310006802602432, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 19848 + }, + { + "epoch": 0.1731087893111929, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 19849 + }, + { + "epoch": 0.1731175105963615, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 19850 + }, + { + "epoch": 0.17312623188153006, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 19851 + }, + { + "epoch": 0.17313495316669864, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 19852 + }, + { + "epoch": 0.17314367445186724, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 19853 + }, + { + "epoch": 0.1731523957370358, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 19854 + }, + { + "epoch": 0.17316111702220438, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 19855 + }, + { + "epoch": 0.17316983830737298, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 19856 + }, + { + "epoch": 0.17317855959254155, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 19857 + }, + { + "epoch": 0.17318728087771013, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 19858 + }, + { + "epoch": 0.17319600216287873, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 19859 + }, + { + "epoch": 0.1732047234480473, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 19860 + }, + { + "epoch": 0.1732134447332159, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 19861 + }, + { + "epoch": 0.17322216601838447, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 19862 + }, + { + "epoch": 0.17323088730355304, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 19863 + }, + { + "epoch": 0.17323960858872164, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 19864 + }, + { + "epoch": 0.17324832987389022, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 19865 + }, + { + "epoch": 0.1732570511590588, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 19866 + }, + { + "epoch": 0.1732657724442274, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 19867 + }, + { + "epoch": 0.17327449372939596, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 19868 + }, + { + "epoch": 0.17328321501456453, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 19869 + }, + { + "epoch": 0.17329193629973313, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 19870 + }, + { + "epoch": 0.1733006575849017, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 19871 + }, + { + "epoch": 0.17330937887007028, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 19872 + }, + { + "epoch": 0.17331810015523888, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 19873 + }, + { + "epoch": 0.17332682144040745, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 19874 + }, + { + "epoch": 0.17333554272557605, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 19875 + }, + { + "epoch": 0.17334426401074463, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 19876 + }, + { + "epoch": 0.1733529852959132, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 19877 + }, + { + "epoch": 0.1733617065810818, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 19878 + }, + { + "epoch": 0.17337042786625037, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 19879 + }, + { + "epoch": 0.17337914915141894, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 19880 + }, + { + "epoch": 0.17338787043658754, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 19881 + }, + { + "epoch": 0.17339659172175612, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 19882 + }, + { + "epoch": 0.1734053130069247, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 19883 + }, + { + "epoch": 0.1734140342920933, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 19884 + }, + { + "epoch": 0.17342275557726186, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 19885 + }, + { + "epoch": 0.17343147686243046, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 19886 + }, + { + "epoch": 0.17344019814759903, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 19887 + }, + { + "epoch": 0.1734489194327676, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 19888 + }, + { + "epoch": 0.1734576407179362, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 19889 + }, + { + "epoch": 0.17346636200310478, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 19890 + }, + { + "epoch": 0.17347508328827335, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 19891 + }, + { + "epoch": 0.17348380457344195, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 19892 + }, + { + "epoch": 0.17349252585861052, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 19893 + }, + { + "epoch": 0.1735012471437791, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 19894 + }, + { + "epoch": 0.1735099684289477, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 19895 + }, + { + "epoch": 0.17351868971411627, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 19896 + }, + { + "epoch": 0.17352741099928484, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 19897 + }, + { + "epoch": 0.17353613228445344, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 19898 + }, + { + "epoch": 0.17354485356962202, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 19899 + }, + { + "epoch": 0.17355357485479062, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 19900 + }, + { + "epoch": 0.1735622961399592, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 19901 + }, + { + "epoch": 0.17357101742512776, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 19902 + }, + { + "epoch": 0.17357973871029636, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 19903 + }, + { + "epoch": 0.17358845999546493, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 19904 + }, + { + "epoch": 0.1735971812806335, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 19905 + }, + { + "epoch": 0.1736059025658021, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 19906 + }, + { + "epoch": 0.17361462385097068, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 19907 + }, + { + "epoch": 0.17362334513613925, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 19908 + }, + { + "epoch": 0.17363206642130785, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 19909 + }, + { + "epoch": 0.17364078770647642, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 19910 + }, + { + "epoch": 0.173649508991645, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 19911 + }, + { + "epoch": 0.1736582302768136, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 19912 + }, + { + "epoch": 0.17366695156198217, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 19913 + }, + { + "epoch": 0.17367567284715077, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 19914 + }, + { + "epoch": 0.17368439413231934, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 19915 + }, + { + "epoch": 0.17369311541748791, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 19916 + }, + { + "epoch": 0.17370183670265651, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 19917 + }, + { + "epoch": 0.1737105579878251, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 19918 + }, + { + "epoch": 0.17371927927299366, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 19919 + }, + { + "epoch": 0.17372800055816226, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 19920 + }, + { + "epoch": 0.17373672184333083, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 19921 + }, + { + "epoch": 0.1737454431284994, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 19922 + }, + { + "epoch": 0.173754164413668, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 19923 + }, + { + "epoch": 0.17376288569883658, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 19924 + }, + { + "epoch": 0.17377160698400515, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 19925 + }, + { + "epoch": 0.17378032826917375, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 19926 + }, + { + "epoch": 0.17378904955434232, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 19927 + }, + { + "epoch": 0.17379777083951092, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 19928 + }, + { + "epoch": 0.1738064921246795, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 19929 + }, + { + "epoch": 0.17381521340984807, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 19930 + }, + { + "epoch": 0.17382393469501667, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 19931 + }, + { + "epoch": 0.17383265598018524, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 19932 + }, + { + "epoch": 0.1738413772653538, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 19933 + }, + { + "epoch": 0.17385009855052241, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 19934 + }, + { + "epoch": 0.173858819835691, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 19935 + }, + { + "epoch": 0.17386754112085956, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 19936 + }, + { + "epoch": 0.17387626240602816, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 19937 + }, + { + "epoch": 0.17388498369119673, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 19938 + }, + { + "epoch": 0.1738937049763653, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 19939 + }, + { + "epoch": 0.1739024262615339, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 19940 + }, + { + "epoch": 0.17391114754670248, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 19941 + }, + { + "epoch": 0.17391986883187108, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 19942 + }, + { + "epoch": 0.17392859011703965, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 19943 + }, + { + "epoch": 0.17393731140220822, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 19944 + }, + { + "epoch": 0.17394603268737682, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 19945 + }, + { + "epoch": 0.1739547539725454, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 19946 + }, + { + "epoch": 0.17396347525771397, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 19947 + }, + { + "epoch": 0.17397219654288257, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 19948 + }, + { + "epoch": 0.17398091782805114, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 19949 + }, + { + "epoch": 0.1739896391132197, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 19950 + }, + { + "epoch": 0.1739983603983883, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 19951 + }, + { + "epoch": 0.17400708168355689, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 19952 + }, + { + "epoch": 0.17401580296872546, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 19953 + }, + { + "epoch": 0.17402452425389406, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 19954 + }, + { + "epoch": 0.17403324553906263, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 19955 + }, + { + "epoch": 0.17404196682423123, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 19956 + }, + { + "epoch": 0.1740506881093998, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 19957 + }, + { + "epoch": 0.17405940939456838, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 19958 + }, + { + "epoch": 0.17406813067973698, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 19959 + }, + { + "epoch": 0.17407685196490555, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 19960 + }, + { + "epoch": 0.17408557325007412, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 19961 + }, + { + "epoch": 0.17409429453524272, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 19962 + }, + { + "epoch": 0.1741030158204113, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 19963 + }, + { + "epoch": 0.17411173710557987, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 19964 + }, + { + "epoch": 0.17412045839074847, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 19965 + }, + { + "epoch": 0.17412917967591704, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 19966 + }, + { + "epoch": 0.1741379009610856, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 19967 + }, + { + "epoch": 0.1741466222462542, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 19968 + }, + { + "epoch": 0.17415534353142278, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 19969 + }, + { + "epoch": 0.17416406481659139, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 19970 + }, + { + "epoch": 0.17417278610175996, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 19971 + }, + { + "epoch": 0.17418150738692853, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 19972 + }, + { + "epoch": 0.17419022867209713, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 19973 + }, + { + "epoch": 0.1741989499572657, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 19974 + }, + { + "epoch": 0.17420767124243428, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 19975 + }, + { + "epoch": 0.17421639252760288, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 19976 + }, + { + "epoch": 0.17422511381277145, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 19977 + }, + { + "epoch": 0.17423383509794002, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 19978 + }, + { + "epoch": 0.17424255638310862, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 19979 + }, + { + "epoch": 0.1742512776682772, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 19980 + }, + { + "epoch": 0.17425999895344577, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 19981 + }, + { + "epoch": 0.17426872023861437, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 19982 + }, + { + "epoch": 0.17427744152378294, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 19983 + }, + { + "epoch": 0.17428616280895154, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 19984 + }, + { + "epoch": 0.1742948840941201, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 19985 + }, + { + "epoch": 0.17430360537928868, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 19986 + }, + { + "epoch": 0.17431232666445728, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 19987 + }, + { + "epoch": 0.17432104794962586, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 19988 + }, + { + "epoch": 0.17432976923479443, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 19989 + }, + { + "epoch": 0.17433849051996303, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 19990 + }, + { + "epoch": 0.1743472118051316, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 19991 + }, + { + "epoch": 0.17435593309030017, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 19992 + }, + { + "epoch": 0.17436465437546877, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 19993 + }, + { + "epoch": 0.17437337566063735, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 19994 + }, + { + "epoch": 0.17438209694580595, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 19995 + }, + { + "epoch": 0.17439081823097452, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 19996 + }, + { + "epoch": 0.1743995395161431, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 19997 + }, + { + "epoch": 0.1744082608013117, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 19998 + }, + { + "epoch": 0.17441698208648027, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 19999 + }, + { + "epoch": 0.17442570337164884, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 20000 + }, + { + "epoch": 0.17443442465681744, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 20001 + }, + { + "epoch": 0.174443145941986, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 20002 + }, + { + "epoch": 0.17445186722715458, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 20003 + }, + { + "epoch": 0.17446058851232318, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 20004 + }, + { + "epoch": 0.17446930979749176, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 20005 + }, + { + "epoch": 0.17447803108266033, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 20006 + }, + { + "epoch": 0.17448675236782893, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 20007 + }, + { + "epoch": 0.1744954736529975, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 20008 + }, + { + "epoch": 0.1745041949381661, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 20009 + }, + { + "epoch": 0.17451291622333467, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 20010 + }, + { + "epoch": 0.17452163750850325, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 20011 + }, + { + "epoch": 0.17453035879367185, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 20012 + }, + { + "epoch": 0.17453908007884042, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 20013 + }, + { + "epoch": 0.174547801364009, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 20014 + }, + { + "epoch": 0.1745565226491776, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 20015 + }, + { + "epoch": 0.17456524393434616, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 20016 + }, + { + "epoch": 0.17457396521951474, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 20017 + }, + { + "epoch": 0.17458268650468334, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 20018 + }, + { + "epoch": 0.1745914077898519, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 20019 + }, + { + "epoch": 0.17460012907502048, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 20020 + }, + { + "epoch": 0.17460885036018908, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 20021 + }, + { + "epoch": 0.17461757164535766, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 20022 + }, + { + "epoch": 0.17462629293052626, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 20023 + }, + { + "epoch": 0.17463501421569483, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 20024 + }, + { + "epoch": 0.1746437355008634, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 20025 + }, + { + "epoch": 0.174652456786032, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 20026 + }, + { + "epoch": 0.17466117807120057, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 20027 + }, + { + "epoch": 0.17466989935636915, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 20028 + }, + { + "epoch": 0.17467862064153775, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 20029 + }, + { + "epoch": 0.17468734192670632, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 20030 + }, + { + "epoch": 0.1746960632118749, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 20031 + }, + { + "epoch": 0.1747047844970435, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 20032 + }, + { + "epoch": 0.17471350578221206, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 20033 + }, + { + "epoch": 0.17472222706738064, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 20034 + }, + { + "epoch": 0.17473094835254924, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 20035 + }, + { + "epoch": 0.1747396696377178, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 20036 + }, + { + "epoch": 0.1747483909228864, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 20037 + }, + { + "epoch": 0.17475711220805498, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 20038 + }, + { + "epoch": 0.17476583349322355, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 20039 + }, + { + "epoch": 0.17477455477839215, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 20040 + }, + { + "epoch": 0.17478327606356073, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 20041 + }, + { + "epoch": 0.1747919973487293, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 20042 + }, + { + "epoch": 0.1748007186338979, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 20043 + }, + { + "epoch": 0.17480943991906647, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 20044 + }, + { + "epoch": 0.17481816120423505, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 20045 + }, + { + "epoch": 0.17482688248940365, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 20046 + }, + { + "epoch": 0.17483560377457222, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 20047 + }, + { + "epoch": 0.1748443250597408, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 20048 + }, + { + "epoch": 0.1748530463449094, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 20049 + }, + { + "epoch": 0.17486176763007796, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 20050 + }, + { + "epoch": 0.17487048891524656, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 20051 + }, + { + "epoch": 0.17487921020041514, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 20052 + }, + { + "epoch": 0.1748879314855837, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 20053 + }, + { + "epoch": 0.1748966527707523, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 20054 + }, + { + "epoch": 0.17490537405592088, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 20055 + }, + { + "epoch": 0.17491409534108945, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 20056 + }, + { + "epoch": 0.17492281662625805, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 20057 + }, + { + "epoch": 0.17493153791142663, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 20058 + }, + { + "epoch": 0.1749402591965952, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 20059 + }, + { + "epoch": 0.1749489804817638, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 20060 + }, + { + "epoch": 0.17495770176693237, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 20061 + }, + { + "epoch": 0.17496642305210094, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 20062 + }, + { + "epoch": 0.17497514433726954, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 20063 + }, + { + "epoch": 0.17498386562243812, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 20064 + }, + { + "epoch": 0.17499258690760672, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 20065 + }, + { + "epoch": 0.1750013081927753, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 20066 + }, + { + "epoch": 0.17501002947794386, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 20067 + }, + { + "epoch": 0.17501875076311246, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 20068 + }, + { + "epoch": 0.17502747204828104, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 20069 + }, + { + "epoch": 0.1750361933334496, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 20070 + }, + { + "epoch": 0.1750449146186182, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 20071 + }, + { + "epoch": 0.17505363590378678, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 20072 + }, + { + "epoch": 0.17506235718895535, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 20073 + }, + { + "epoch": 0.17507107847412395, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 20074 + }, + { + "epoch": 0.17507979975929253, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 20075 + }, + { + "epoch": 0.1750885210444611, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 20076 + }, + { + "epoch": 0.1750972423296297, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 20077 + }, + { + "epoch": 0.17510596361479827, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 20078 + }, + { + "epoch": 0.17511468489996687, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 20079 + }, + { + "epoch": 0.17512340618513544, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 20080 + }, + { + "epoch": 0.17513212747030402, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 20081 + }, + { + "epoch": 0.17514084875547262, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 20082 + }, + { + "epoch": 0.1751495700406412, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 20083 + }, + { + "epoch": 0.17515829132580976, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 20084 + }, + { + "epoch": 0.17516701261097836, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 20085 + }, + { + "epoch": 0.17517573389614693, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 20086 + }, + { + "epoch": 0.1751844551813155, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 20087 + }, + { + "epoch": 0.1751931764664841, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 20088 + }, + { + "epoch": 0.17520189775165268, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 20089 + }, + { + "epoch": 0.17521061903682125, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 20090 + }, + { + "epoch": 0.17521934032198985, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 20091 + }, + { + "epoch": 0.17522806160715843, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 20092 + }, + { + "epoch": 0.17523678289232703, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 20093 + }, + { + "epoch": 0.1752455041774956, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 20094 + }, + { + "epoch": 0.17525422546266417, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 20095 + }, + { + "epoch": 0.17526294674783277, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 20096 + }, + { + "epoch": 0.17527166803300134, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 20097 + }, + { + "epoch": 0.17528038931816992, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 20098 + }, + { + "epoch": 0.17528911060333852, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 20099 + }, + { + "epoch": 0.1752978318885071, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 20100 + }, + { + "epoch": 0.17530655317367566, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 20101 + }, + { + "epoch": 0.17531527445884426, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 20102 + }, + { + "epoch": 0.17532399574401283, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 20103 + }, + { + "epoch": 0.1753327170291814, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 20104 + }, + { + "epoch": 0.17534143831435, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 20105 + }, + { + "epoch": 0.17535015959951858, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 20106 + }, + { + "epoch": 0.17535888088468718, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 20107 + }, + { + "epoch": 0.17536760216985575, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 20108 + }, + { + "epoch": 0.17537632345502432, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 20109 + }, + { + "epoch": 0.17538504474019292, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 20110 + }, + { + "epoch": 0.1753937660253615, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 20111 + }, + { + "epoch": 0.17540248731053007, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 20112 + }, + { + "epoch": 0.17541120859569867, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 20113 + }, + { + "epoch": 0.17541992988086724, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 20114 + }, + { + "epoch": 0.17542865116603581, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 20115 + }, + { + "epoch": 0.17543737245120442, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 20116 + }, + { + "epoch": 0.175446093736373, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 20117 + }, + { + "epoch": 0.1754548150215416, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 20118 + }, + { + "epoch": 0.17546353630671016, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 20119 + }, + { + "epoch": 0.17547225759187873, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 20120 + }, + { + "epoch": 0.17548097887704733, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 20121 + }, + { + "epoch": 0.1754897001622159, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 20122 + }, + { + "epoch": 0.17549842144738448, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 20123 + }, + { + "epoch": 0.17550714273255308, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 20124 + }, + { + "epoch": 0.17551586401772165, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 20125 + }, + { + "epoch": 0.17552458530289022, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 20126 + }, + { + "epoch": 0.17553330658805882, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 20127 + }, + { + "epoch": 0.1755420278732274, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 20128 + }, + { + "epoch": 0.17555074915839597, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 20129 + }, + { + "epoch": 0.17555947044356457, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 20130 + }, + { + "epoch": 0.17556819172873314, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 20131 + }, + { + "epoch": 0.17557691301390174, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 20132 + }, + { + "epoch": 0.17558563429907031, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 20133 + }, + { + "epoch": 0.1755943555842389, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 20134 + }, + { + "epoch": 0.1756030768694075, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 20135 + }, + { + "epoch": 0.17561179815457606, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 20136 + }, + { + "epoch": 0.17562051943974463, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 20137 + }, + { + "epoch": 0.17562924072491323, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 20138 + }, + { + "epoch": 0.1756379620100818, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 20139 + }, + { + "epoch": 0.17564668329525038, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 20140 + }, + { + "epoch": 0.17565540458041898, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 20141 + }, + { + "epoch": 0.17566412586558755, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 20142 + }, + { + "epoch": 0.17567284715075612, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 20143 + }, + { + "epoch": 0.17568156843592472, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 20144 + }, + { + "epoch": 0.1756902897210933, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 20145 + }, + { + "epoch": 0.1756990110062619, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 20146 + }, + { + "epoch": 0.17570773229143047, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 20147 + }, + { + "epoch": 0.17571645357659904, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 20148 + }, + { + "epoch": 0.17572517486176764, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 20149 + }, + { + "epoch": 0.1757338961469362, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 20150 + }, + { + "epoch": 0.1757426174321048, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 20151 + }, + { + "epoch": 0.1757513387172734, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 20152 + }, + { + "epoch": 0.17576006000244196, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 20153 + }, + { + "epoch": 0.17576878128761053, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 20154 + }, + { + "epoch": 0.17577750257277913, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 20155 + }, + { + "epoch": 0.1757862238579477, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 20156 + }, + { + "epoch": 0.17579494514311628, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 20157 + }, + { + "epoch": 0.17580366642828488, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 20158 + }, + { + "epoch": 0.17581238771345345, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 20159 + }, + { + "epoch": 0.17582110899862205, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 20160 + }, + { + "epoch": 0.17582983028379062, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 20161 + }, + { + "epoch": 0.1758385515689592, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 20162 + }, + { + "epoch": 0.1758472728541278, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 20163 + }, + { + "epoch": 0.17585599413929637, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 20164 + }, + { + "epoch": 0.17586471542446494, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 20165 + }, + { + "epoch": 0.17587343670963354, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 20166 + }, + { + "epoch": 0.1758821579948021, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 20167 + }, + { + "epoch": 0.17589087927997069, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 20168 + }, + { + "epoch": 0.17589960056513929, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 20169 + }, + { + "epoch": 0.17590832185030786, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 20170 + }, + { + "epoch": 0.17591704313547643, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 20171 + }, + { + "epoch": 0.17592576442064503, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 20172 + }, + { + "epoch": 0.1759344857058136, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 20173 + }, + { + "epoch": 0.1759432069909822, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 20174 + }, + { + "epoch": 0.17595192827615078, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 20175 + }, + { + "epoch": 0.17596064956131935, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 20176 + }, + { + "epoch": 0.17596937084648795, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 20177 + }, + { + "epoch": 0.17597809213165652, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 20178 + }, + { + "epoch": 0.1759868134168251, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 20179 + }, + { + "epoch": 0.1759955347019937, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 20180 + }, + { + "epoch": 0.17600425598716227, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 20181 + }, + { + "epoch": 0.17601297727233084, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 20182 + }, + { + "epoch": 0.17602169855749944, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 20183 + }, + { + "epoch": 0.176030419842668, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 20184 + }, + { + "epoch": 0.17603914112783658, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 20185 + }, + { + "epoch": 0.17604786241300519, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 20186 + }, + { + "epoch": 0.17605658369817376, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 20187 + }, + { + "epoch": 0.17606530498334236, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 20188 + }, + { + "epoch": 0.17607402626851093, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 20189 + }, + { + "epoch": 0.1760827475536795, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 20190 + }, + { + "epoch": 0.1760914688388481, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 20191 + }, + { + "epoch": 0.17610019012401668, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 20192 + }, + { + "epoch": 0.17610891140918525, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 20193 + }, + { + "epoch": 0.17611763269435385, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 20194 + }, + { + "epoch": 0.17612635397952242, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 20195 + }, + { + "epoch": 0.176135075264691, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 20196 + }, + { + "epoch": 0.1761437965498596, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 20197 + }, + { + "epoch": 0.17615251783502817, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 20198 + }, + { + "epoch": 0.17616123912019674, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 20199 + }, + { + "epoch": 0.17616996040536534, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 20200 + }, + { + "epoch": 0.1761786816905339, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 20201 + }, + { + "epoch": 0.1761874029757025, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 20202 + }, + { + "epoch": 0.17619612426087108, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 20203 + }, + { + "epoch": 0.17620484554603966, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 20204 + }, + { + "epoch": 0.17621356683120826, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 20205 + }, + { + "epoch": 0.17622228811637683, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 20206 + }, + { + "epoch": 0.1762310094015454, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 20207 + }, + { + "epoch": 0.176239730686714, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 20208 + }, + { + "epoch": 0.17624845197188257, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 20209 + }, + { + "epoch": 0.17625717325705115, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 20210 + }, + { + "epoch": 0.17626589454221975, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 20211 + }, + { + "epoch": 0.17627461582738832, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 20212 + }, + { + "epoch": 0.1762833371125569, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 20213 + }, + { + "epoch": 0.1762920583977255, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 20214 + }, + { + "epoch": 0.17630077968289407, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 20215 + }, + { + "epoch": 0.17630950096806267, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 20216 + }, + { + "epoch": 0.17631822225323124, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 20217 + }, + { + "epoch": 0.1763269435383998, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 20218 + }, + { + "epoch": 0.1763356648235684, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 20219 + }, + { + "epoch": 0.17634438610873698, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 20220 + }, + { + "epoch": 0.17635310739390556, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 20221 + }, + { + "epoch": 0.17636182867907416, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 20222 + }, + { + "epoch": 0.17637054996424273, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 20223 + }, + { + "epoch": 0.1763792712494113, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 20224 + }, + { + "epoch": 0.1763879925345799, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 20225 + }, + { + "epoch": 0.17639671381974847, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 20226 + }, + { + "epoch": 0.17640543510491707, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 20227 + }, + { + "epoch": 0.17641415639008565, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 20228 + }, + { + "epoch": 0.17642287767525422, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 20229 + }, + { + "epoch": 0.17643159896042282, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 20230 + }, + { + "epoch": 0.1764403202455914, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 20231 + }, + { + "epoch": 0.17644904153075996, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 20232 + }, + { + "epoch": 0.17645776281592856, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 20233 + }, + { + "epoch": 0.17646648410109714, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 20234 + }, + { + "epoch": 0.1764752053862657, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 20235 + }, + { + "epoch": 0.1764839266714343, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 20236 + }, + { + "epoch": 0.17649264795660288, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 20237 + }, + { + "epoch": 0.17650136924177146, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 20238 + }, + { + "epoch": 0.17651009052694006, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 20239 + }, + { + "epoch": 0.17651881181210863, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 20240 + }, + { + "epoch": 0.17652753309727723, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 20241 + }, + { + "epoch": 0.1765362543824458, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 20242 + }, + { + "epoch": 0.17654497566761437, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 20243 + }, + { + "epoch": 0.17655369695278297, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 20244 + }, + { + "epoch": 0.17656241823795155, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 20245 + }, + { + "epoch": 0.17657113952312012, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 20246 + }, + { + "epoch": 0.17657986080828872, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 20247 + }, + { + "epoch": 0.1765885820934573, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 20248 + }, + { + "epoch": 0.17659730337862586, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 20249 + }, + { + "epoch": 0.17660602466379446, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 20250 + }, + { + "epoch": 0.17661474594896304, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 20251 + }, + { + "epoch": 0.1766234672341316, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 20252 + }, + { + "epoch": 0.1766321885193002, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 20253 + }, + { + "epoch": 0.17664090980446878, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 20254 + }, + { + "epoch": 0.17664963108963738, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 20255 + }, + { + "epoch": 0.17665835237480595, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 20256 + }, + { + "epoch": 0.17666707365997453, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 20257 + }, + { + "epoch": 0.17667579494514313, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 20258 + }, + { + "epoch": 0.1766845162303117, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 20259 + }, + { + "epoch": 0.17669323751548027, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 20260 + }, + { + "epoch": 0.17670195880064887, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 20261 + }, + { + "epoch": 0.17671068008581745, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 20262 + }, + { + "epoch": 0.17671940137098602, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 20263 + }, + { + "epoch": 0.17672812265615462, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 20264 + }, + { + "epoch": 0.1767368439413232, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 20265 + }, + { + "epoch": 0.17674556522649176, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 20266 + }, + { + "epoch": 0.17675428651166036, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 20267 + }, + { + "epoch": 0.17676300779682894, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 20268 + }, + { + "epoch": 0.17677172908199754, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 20269 + }, + { + "epoch": 0.1767804503671661, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 20270 + }, + { + "epoch": 0.17678917165233468, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 20271 + }, + { + "epoch": 0.17679789293750328, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 20272 + }, + { + "epoch": 0.17680661422267185, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 20273 + }, + { + "epoch": 0.17681533550784043, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 20274 + }, + { + "epoch": 0.17682405679300903, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 20275 + }, + { + "epoch": 0.1768327780781776, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 20276 + }, + { + "epoch": 0.17684149936334617, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 20277 + }, + { + "epoch": 0.17685022064851477, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 20278 + }, + { + "epoch": 0.17685894193368334, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 20279 + }, + { + "epoch": 0.17686766321885192, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 20280 + }, + { + "epoch": 0.17687638450402052, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 20281 + }, + { + "epoch": 0.1768851057891891, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 20282 + }, + { + "epoch": 0.1768938270743577, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 20283 + }, + { + "epoch": 0.17690254835952626, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 20284 + }, + { + "epoch": 0.17691126964469484, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 20285 + }, + { + "epoch": 0.17691999092986344, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 20286 + }, + { + "epoch": 0.176928712215032, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 20287 + }, + { + "epoch": 0.17693743350020058, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 20288 + }, + { + "epoch": 0.17694615478536918, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 20289 + }, + { + "epoch": 0.17695487607053775, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 20290 + }, + { + "epoch": 0.17696359735570633, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 20291 + }, + { + "epoch": 0.17697231864087493, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 20292 + }, + { + "epoch": 0.1769810399260435, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 20293 + }, + { + "epoch": 0.17698976121121207, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 20294 + }, + { + "epoch": 0.17699848249638067, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 20295 + }, + { + "epoch": 0.17700720378154924, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 20296 + }, + { + "epoch": 0.17701592506671784, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 20297 + }, + { + "epoch": 0.17702464635188642, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 20298 + }, + { + "epoch": 0.177033367637055, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 20299 + }, + { + "epoch": 0.1770420889222236, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 20300 + }, + { + "epoch": 0.17705081020739216, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 20301 + }, + { + "epoch": 0.17705953149256073, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 20302 + }, + { + "epoch": 0.17706825277772933, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 20303 + }, + { + "epoch": 0.1770769740628979, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 20304 + }, + { + "epoch": 0.17708569534806648, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 20305 + }, + { + "epoch": 0.17709441663323508, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 20306 + }, + { + "epoch": 0.17710313791840365, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 20307 + }, + { + "epoch": 0.17711185920357223, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 20308 + }, + { + "epoch": 0.17712058048874083, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 20309 + }, + { + "epoch": 0.1771293017739094, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 20310 + }, + { + "epoch": 0.177138023059078, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 20311 + }, + { + "epoch": 0.17714674434424657, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 20312 + }, + { + "epoch": 0.17715546562941514, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 20313 + }, + { + "epoch": 0.17716418691458374, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 20314 + }, + { + "epoch": 0.17717290819975232, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 20315 + }, + { + "epoch": 0.1771816294849209, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 20316 + }, + { + "epoch": 0.1771903507700895, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 20317 + }, + { + "epoch": 0.17719907205525806, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 20318 + }, + { + "epoch": 0.17720779334042663, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 20319 + }, + { + "epoch": 0.17721651462559523, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 20320 + }, + { + "epoch": 0.1772252359107638, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 20321 + }, + { + "epoch": 0.17723395719593238, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 20322 + }, + { + "epoch": 0.17724267848110098, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 20323 + }, + { + "epoch": 0.17725139976626955, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 20324 + }, + { + "epoch": 0.17726012105143815, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 20325 + }, + { + "epoch": 0.17726884233660672, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 20326 + }, + { + "epoch": 0.1772775636217753, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 20327 + }, + { + "epoch": 0.1772862849069439, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 20328 + }, + { + "epoch": 0.17729500619211247, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 20329 + }, + { + "epoch": 0.17730372747728104, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 20330 + }, + { + "epoch": 0.17731244876244964, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 20331 + }, + { + "epoch": 0.17732117004761822, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 20332 + }, + { + "epoch": 0.1773298913327868, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 20333 + }, + { + "epoch": 0.1773386126179554, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 20334 + }, + { + "epoch": 0.17734733390312396, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 20335 + }, + { + "epoch": 0.17735605518829256, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 20336 + }, + { + "epoch": 0.17736477647346113, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 20337 + }, + { + "epoch": 0.1773734977586297, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 20338 + }, + { + "epoch": 0.1773822190437983, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 20339 + }, + { + "epoch": 0.17739094032896688, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 20340 + }, + { + "epoch": 0.17739966161413545, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 20341 + }, + { + "epoch": 0.17740838289930405, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 20342 + }, + { + "epoch": 0.17741710418447262, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 20343 + }, + { + "epoch": 0.1774258254696412, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 20344 + }, + { + "epoch": 0.1774345467548098, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 20345 + }, + { + "epoch": 0.17744326803997837, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 20346 + }, + { + "epoch": 0.17745198932514694, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 20347 + }, + { + "epoch": 0.17746071061031554, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 20348 + }, + { + "epoch": 0.17746943189548411, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 20349 + }, + { + "epoch": 0.17747815318065271, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 20350 + }, + { + "epoch": 0.1774868744658213, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 20351 + }, + { + "epoch": 0.17749559575098986, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 20352 + }, + { + "epoch": 0.17750431703615846, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 20353 + }, + { + "epoch": 0.17751303832132703, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 20354 + }, + { + "epoch": 0.1775217596064956, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 20355 + }, + { + "epoch": 0.1775304808916642, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 20356 + }, + { + "epoch": 0.17753920217683278, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 20357 + }, + { + "epoch": 0.17754792346200135, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 20358 + }, + { + "epoch": 0.17755664474716995, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 20359 + }, + { + "epoch": 0.17756536603233852, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 20360 + }, + { + "epoch": 0.1775740873175071, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 20361 + }, + { + "epoch": 0.1775828086026757, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 20362 + }, + { + "epoch": 0.17759152988784427, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 20363 + }, + { + "epoch": 0.17760025117301287, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 20364 + }, + { + "epoch": 0.17760897245818144, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 20365 + }, + { + "epoch": 0.17761769374335, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 20366 + }, + { + "epoch": 0.1776264150285186, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 20367 + }, + { + "epoch": 0.1776351363136872, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 20368 + }, + { + "epoch": 0.17764385759885576, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 20369 + }, + { + "epoch": 0.17765257888402436, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 20370 + }, + { + "epoch": 0.17766130016919293, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 20371 + }, + { + "epoch": 0.1776700214543615, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 20372 + }, + { + "epoch": 0.1776787427395301, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 20373 + }, + { + "epoch": 0.17768746402469868, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 20374 + }, + { + "epoch": 0.17769618530986725, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 20375 + }, + { + "epoch": 0.17770490659503585, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 20376 + }, + { + "epoch": 0.17771362788020442, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 20377 + }, + { + "epoch": 0.17772234916537302, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 20378 + }, + { + "epoch": 0.1777310704505416, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 20379 + }, + { + "epoch": 0.17773979173571017, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 20380 + }, + { + "epoch": 0.17774851302087877, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 20381 + }, + { + "epoch": 0.17775723430604734, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 20382 + }, + { + "epoch": 0.1777659555912159, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 20383 + }, + { + "epoch": 0.1777746768763845, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 20384 + }, + { + "epoch": 0.17778339816155309, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 20385 + }, + { + "epoch": 0.17779211944672166, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 20386 + }, + { + "epoch": 0.17780084073189026, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 20387 + }, + { + "epoch": 0.17780956201705883, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 20388 + }, + { + "epoch": 0.1778182833022274, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 20389 + }, + { + "epoch": 0.177827004587396, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 20390 + }, + { + "epoch": 0.17783572587256458, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 20391 + }, + { + "epoch": 0.17784444715773318, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 20392 + }, + { + "epoch": 0.17785316844290175, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 20393 + }, + { + "epoch": 0.17786188972807032, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 20394 + }, + { + "epoch": 0.17787061101323892, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 20395 + }, + { + "epoch": 0.1778793322984075, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 20396 + }, + { + "epoch": 0.17788805358357607, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 20397 + }, + { + "epoch": 0.17789677486874467, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 20398 + }, + { + "epoch": 0.17790549615391324, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 20399 + }, + { + "epoch": 0.1779142174390818, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 20400 + }, + { + "epoch": 0.1779229387242504, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 20401 + }, + { + "epoch": 0.17793166000941898, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 20402 + }, + { + "epoch": 0.17794038129458756, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 20403 + }, + { + "epoch": 0.17794910257975616, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 20404 + }, + { + "epoch": 0.17795782386492473, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 20405 + }, + { + "epoch": 0.17796654515009333, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 20406 + }, + { + "epoch": 0.1779752664352619, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 20407 + }, + { + "epoch": 0.17798398772043048, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 20408 + }, + { + "epoch": 0.17799270900559908, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 20409 + }, + { + "epoch": 0.17800143029076765, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 20410 + }, + { + "epoch": 0.17801015157593622, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 20411 + }, + { + "epoch": 0.17801887286110482, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 20412 + }, + { + "epoch": 0.1780275941462734, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 20413 + }, + { + "epoch": 0.17803631543144197, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 20414 + }, + { + "epoch": 0.17804503671661057, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 20415 + }, + { + "epoch": 0.17805375800177914, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 20416 + }, + { + "epoch": 0.1780624792869477, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 20417 + }, + { + "epoch": 0.1780712005721163, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 20418 + }, + { + "epoch": 0.17807992185728488, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 20419 + }, + { + "epoch": 0.17808864314245348, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 20420 + }, + { + "epoch": 0.17809736442762206, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 20421 + }, + { + "epoch": 0.17810608571279063, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 20422 + }, + { + "epoch": 0.17811480699795923, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 20423 + }, + { + "epoch": 0.1781235282831278, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 20424 + }, + { + "epoch": 0.17813224956829637, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 20425 + }, + { + "epoch": 0.17814097085346497, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 20426 + }, + { + "epoch": 0.17814969213863355, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 20427 + }, + { + "epoch": 0.17815841342380212, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 20428 + }, + { + "epoch": 0.17816713470897072, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 20429 + }, + { + "epoch": 0.1781758559941393, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 20430 + }, + { + "epoch": 0.17818457727930787, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 20431 + }, + { + "epoch": 0.17819329856447647, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 20432 + }, + { + "epoch": 0.17820201984964504, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 20433 + }, + { + "epoch": 0.17821074113481364, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 20434 + }, + { + "epoch": 0.1782194624199822, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 20435 + }, + { + "epoch": 0.17822818370515078, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 20436 + }, + { + "epoch": 0.17823690499031938, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 20437 + }, + { + "epoch": 0.17824562627548796, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 20438 + }, + { + "epoch": 0.17825434756065653, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 20439 + }, + { + "epoch": 0.17826306884582513, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 20440 + }, + { + "epoch": 0.1782717901309937, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 20441 + }, + { + "epoch": 0.17828051141616227, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 20442 + }, + { + "epoch": 0.17828923270133087, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 20443 + }, + { + "epoch": 0.17829795398649945, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 20444 + }, + { + "epoch": 0.17830667527166802, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 20445 + }, + { + "epoch": 0.17831539655683662, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 20446 + }, + { + "epoch": 0.1783241178420052, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 20447 + }, + { + "epoch": 0.1783328391271738, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 20448 + }, + { + "epoch": 0.17834156041234236, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 20449 + }, + { + "epoch": 0.17835028169751094, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 20450 + }, + { + "epoch": 0.17835900298267954, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 20451 + }, + { + "epoch": 0.1783677242678481, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 20452 + }, + { + "epoch": 0.17837644555301668, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 20453 + }, + { + "epoch": 0.17838516683818528, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 20454 + }, + { + "epoch": 0.17839388812335386, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 20455 + }, + { + "epoch": 0.17840260940852243, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 20456 + }, + { + "epoch": 0.17841133069369103, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 20457 + }, + { + "epoch": 0.1784200519788596, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 20458 + }, + { + "epoch": 0.1784287732640282, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 20459 + }, + { + "epoch": 0.17843749454919677, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 20460 + }, + { + "epoch": 0.17844621583436535, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 20461 + }, + { + "epoch": 0.17845493711953395, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 20462 + }, + { + "epoch": 0.17846365840470252, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 20463 + }, + { + "epoch": 0.1784723796898711, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 20464 + }, + { + "epoch": 0.1784811009750397, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 20465 + }, + { + "epoch": 0.17848982226020826, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 20466 + }, + { + "epoch": 0.17849854354537684, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 20467 + }, + { + "epoch": 0.17850726483054544, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 20468 + }, + { + "epoch": 0.178515986115714, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 20469 + }, + { + "epoch": 0.17852470740088258, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 20470 + }, + { + "epoch": 0.17853342868605118, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 20471 + }, + { + "epoch": 0.17854214997121975, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 20472 + }, + { + "epoch": 0.17855087125638835, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 20473 + }, + { + "epoch": 0.17855959254155693, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 20474 + }, + { + "epoch": 0.1785683138267255, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 20475 + }, + { + "epoch": 0.1785770351118941, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 20476 + }, + { + "epoch": 0.17858575639706267, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 20477 + }, + { + "epoch": 0.17859447768223125, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 20478 + }, + { + "epoch": 0.17860319896739985, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 20479 + }, + { + "epoch": 0.17861192025256842, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 20480 + }, + { + "epoch": 0.178620641537737, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 20481 + }, + { + "epoch": 0.1786293628229056, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 20482 + }, + { + "epoch": 0.17863808410807416, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 20483 + }, + { + "epoch": 0.17864680539324274, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 20484 + }, + { + "epoch": 0.17865552667841134, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 20485 + }, + { + "epoch": 0.1786642479635799, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 20486 + }, + { + "epoch": 0.1786729692487485, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 20487 + }, + { + "epoch": 0.17868169053391708, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 20488 + }, + { + "epoch": 0.17869041181908565, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 20489 + }, + { + "epoch": 0.17869913310425425, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 20490 + }, + { + "epoch": 0.17870785438942283, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 20491 + }, + { + "epoch": 0.1787165756745914, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 20492 + }, + { + "epoch": 0.17872529695976, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 20493 + }, + { + "epoch": 0.17873401824492857, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 20494 + }, + { + "epoch": 0.17874273953009714, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 20495 + }, + { + "epoch": 0.17875146081526574, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 20496 + }, + { + "epoch": 0.17876018210043432, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 20497 + }, + { + "epoch": 0.1787689033856029, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 20498 + }, + { + "epoch": 0.1787776246707715, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 20499 + }, + { + "epoch": 0.17878634595594006, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 20500 + }, + { + "epoch": 0.17879506724110866, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 20501 + }, + { + "epoch": 0.17880378852627724, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 20502 + }, + { + "epoch": 0.1788125098114458, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 20503 + }, + { + "epoch": 0.1788212310966144, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 20504 + }, + { + "epoch": 0.17882995238178298, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 20505 + }, + { + "epoch": 0.17883867366695155, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 20506 + }, + { + "epoch": 0.17884739495212015, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 20507 + }, + { + "epoch": 0.17885611623728873, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 20508 + }, + { + "epoch": 0.1788648375224573, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 20509 + }, + { + "epoch": 0.1788735588076259, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 20510 + }, + { + "epoch": 0.17888228009279447, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 20511 + }, + { + "epoch": 0.17889100137796304, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 20512 + }, + { + "epoch": 0.17889972266313164, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 20513 + }, + { + "epoch": 0.17890844394830022, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 20514 + }, + { + "epoch": 0.17891716523346882, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 20515 + }, + { + "epoch": 0.1789258865186374, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 20516 + }, + { + "epoch": 0.17893460780380596, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 20517 + }, + { + "epoch": 0.17894332908897456, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 20518 + }, + { + "epoch": 0.17895205037414313, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 20519 + }, + { + "epoch": 0.1789607716593117, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 20520 + }, + { + "epoch": 0.1789694929444803, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 20521 + }, + { + "epoch": 0.17897821422964888, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 20522 + }, + { + "epoch": 0.17898693551481745, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 20523 + }, + { + "epoch": 0.17899565679998605, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 20524 + }, + { + "epoch": 0.17900437808515463, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 20525 + }, + { + "epoch": 0.1790130993703232, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 20526 + }, + { + "epoch": 0.1790218206554918, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 20527 + }, + { + "epoch": 0.17903054194066037, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 20528 + }, + { + "epoch": 0.17903926322582897, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 20529 + }, + { + "epoch": 0.17904798451099754, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 20530 + }, + { + "epoch": 0.17905670579616612, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 20531 + }, + { + "epoch": 0.17906542708133472, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 20532 + }, + { + "epoch": 0.1790741483665033, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 20533 + }, + { + "epoch": 0.17908286965167186, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 20534 + }, + { + "epoch": 0.17909159093684046, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 20535 + }, + { + "epoch": 0.17910031222200903, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 20536 + }, + { + "epoch": 0.1791090335071776, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 20537 + }, + { + "epoch": 0.1791177547923462, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 20538 + }, + { + "epoch": 0.17912647607751478, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 20539 + }, + { + "epoch": 0.17913519736268335, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 20540 + }, + { + "epoch": 0.17914391864785195, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 20541 + }, + { + "epoch": 0.17915263993302052, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 20542 + }, + { + "epoch": 0.17916136121818912, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 20543 + }, + { + "epoch": 0.1791700825033577, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 20544 + }, + { + "epoch": 0.17917880378852627, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 20545 + }, + { + "epoch": 0.17918752507369487, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 20546 + }, + { + "epoch": 0.17919624635886344, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 20547 + }, + { + "epoch": 0.17920496764403201, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 20548 + }, + { + "epoch": 0.17921368892920062, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 20549 + }, + { + "epoch": 0.1792224102143692, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 20550 + }, + { + "epoch": 0.17923113149953776, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 20551 + }, + { + "epoch": 0.17923985278470636, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 20552 + }, + { + "epoch": 0.17924857406987493, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 20553 + }, + { + "epoch": 0.1792572953550435, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 20554 + }, + { + "epoch": 0.1792660166402121, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 20555 + }, + { + "epoch": 0.17927473792538068, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 20556 + }, + { + "epoch": 0.17928345921054928, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 20557 + }, + { + "epoch": 0.17929218049571785, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 20558 + }, + { + "epoch": 0.17930090178088642, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 20559 + }, + { + "epoch": 0.17930962306605502, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 20560 + }, + { + "epoch": 0.1793183443512236, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 20561 + }, + { + "epoch": 0.17932706563639217, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 20562 + }, + { + "epoch": 0.17933578692156077, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 20563 + }, + { + "epoch": 0.17934450820672934, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 20564 + }, + { + "epoch": 0.17935322949189791, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 20565 + }, + { + "epoch": 0.17936195077706651, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 20566 + }, + { + "epoch": 0.1793706720622351, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 20567 + }, + { + "epoch": 0.1793793933474037, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 20568 + }, + { + "epoch": 0.17938811463257226, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 20569 + }, + { + "epoch": 0.17939683591774083, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 20570 + }, + { + "epoch": 0.17940555720290943, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 20571 + }, + { + "epoch": 0.179414278488078, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 20572 + }, + { + "epoch": 0.17942299977324658, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 20573 + }, + { + "epoch": 0.17943172105841518, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 20574 + }, + { + "epoch": 0.17944044234358375, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 20575 + }, + { + "epoch": 0.17944916362875232, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 20576 + }, + { + "epoch": 0.17945788491392092, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 20577 + }, + { + "epoch": 0.1794666061990895, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 20578 + }, + { + "epoch": 0.17947532748425807, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 20579 + }, + { + "epoch": 0.17948404876942667, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 20580 + }, + { + "epoch": 0.17949277005459524, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 20581 + }, + { + "epoch": 0.17950149133976384, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 20582 + }, + { + "epoch": 0.1795102126249324, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 20583 + }, + { + "epoch": 0.179518933910101, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 20584 + }, + { + "epoch": 0.1795276551952696, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 20585 + }, + { + "epoch": 0.17953637648043816, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 20586 + }, + { + "epoch": 0.17954509776560673, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 20587 + }, + { + "epoch": 0.17955381905077533, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 20588 + }, + { + "epoch": 0.1795625403359439, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 20589 + }, + { + "epoch": 0.17957126162111248, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 20590 + }, + { + "epoch": 0.17957998290628108, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 20591 + }, + { + "epoch": 0.17958870419144965, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 20592 + }, + { + "epoch": 0.17959742547661822, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 20593 + }, + { + "epoch": 0.17960614676178682, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 20594 + }, + { + "epoch": 0.1796148680469554, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 20595 + }, + { + "epoch": 0.179623589332124, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 20596 + }, + { + "epoch": 0.17963231061729257, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 20597 + }, + { + "epoch": 0.17964103190246114, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 20598 + }, + { + "epoch": 0.17964975318762974, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 20599 + }, + { + "epoch": 0.1796584744727983, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 20600 + }, + { + "epoch": 0.17966719575796689, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 20601 + }, + { + "epoch": 0.17967591704313549, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 20602 + }, + { + "epoch": 0.17968463832830406, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 20603 + }, + { + "epoch": 0.17969335961347263, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 20604 + }, + { + "epoch": 0.17970208089864123, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 20605 + }, + { + "epoch": 0.1797108021838098, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 20606 + }, + { + "epoch": 0.17971952346897838, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 20607 + }, + { + "epoch": 0.17972824475414698, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 20608 + }, + { + "epoch": 0.17973696603931555, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 20609 + }, + { + "epoch": 0.17974568732448415, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 20610 + }, + { + "epoch": 0.17975440860965272, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 20611 + }, + { + "epoch": 0.1797631298948213, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 20612 + }, + { + "epoch": 0.1797718511799899, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 20613 + }, + { + "epoch": 0.17978057246515847, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 20614 + }, + { + "epoch": 0.17978929375032704, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 20615 + }, + { + "epoch": 0.17979801503549564, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 20616 + }, + { + "epoch": 0.1798067363206642, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 20617 + }, + { + "epoch": 0.17981545760583278, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 20618 + }, + { + "epoch": 0.17982417889100138, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 20619 + }, + { + "epoch": 0.17983290017616996, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 20620 + }, + { + "epoch": 0.17984162146133853, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 20621 + }, + { + "epoch": 0.17985034274650713, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 20622 + }, + { + "epoch": 0.1798590640316757, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 20623 + }, + { + "epoch": 0.1798677853168443, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 20624 + }, + { + "epoch": 0.17987650660201288, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 20625 + }, + { + "epoch": 0.17988522788718145, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 20626 + }, + { + "epoch": 0.17989394917235005, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 20627 + }, + { + "epoch": 0.17990267045751862, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 20628 + }, + { + "epoch": 0.1799113917426872, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 20629 + }, + { + "epoch": 0.1799201130278558, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 20630 + }, + { + "epoch": 0.17992883431302437, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 20631 + }, + { + "epoch": 0.17993755559819294, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 20632 + }, + { + "epoch": 0.17994627688336154, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 20633 + }, + { + "epoch": 0.1799549981685301, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 20634 + }, + { + "epoch": 0.17996371945369868, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 20635 + }, + { + "epoch": 0.17997244073886728, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 20636 + }, + { + "epoch": 0.17998116202403586, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 20637 + }, + { + "epoch": 0.17998988330920446, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 20638 + }, + { + "epoch": 0.17999860459437303, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 20639 + }, + { + "epoch": 0.1800073258795416, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 20640 + }, + { + "epoch": 0.1800160471647102, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 20641 + }, + { + "epoch": 0.18002476844987877, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 20642 + }, + { + "epoch": 0.18003348973504735, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 20643 + }, + { + "epoch": 0.18004221102021595, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 20644 + }, + { + "epoch": 0.18005093230538452, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 20645 + }, + { + "epoch": 0.1800596535905531, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 20646 + }, + { + "epoch": 0.1800683748757217, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 20647 + }, + { + "epoch": 0.18007709616089027, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 20648 + }, + { + "epoch": 0.18008581744605884, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 20649 + }, + { + "epoch": 0.18009453873122744, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 20650 + }, + { + "epoch": 0.180103260016396, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 20651 + }, + { + "epoch": 0.1801119813015646, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 20652 + }, + { + "epoch": 0.18012070258673318, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 20653 + }, + { + "epoch": 0.18012942387190176, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 20654 + }, + { + "epoch": 0.18013814515707036, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 20655 + }, + { + "epoch": 0.18014686644223893, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 20656 + }, + { + "epoch": 0.1801555877274075, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 20657 + }, + { + "epoch": 0.1801643090125761, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 20658 + }, + { + "epoch": 0.18017303029774467, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 20659 + }, + { + "epoch": 0.18018175158291325, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 20660 + }, + { + "epoch": 0.18019047286808185, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 20661 + }, + { + "epoch": 0.18019919415325042, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 20662 + }, + { + "epoch": 0.180207915438419, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 20663 + }, + { + "epoch": 0.1802166367235876, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 20664 + }, + { + "epoch": 0.18022535800875616, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 20665 + }, + { + "epoch": 0.18023407929392476, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 20666 + }, + { + "epoch": 0.18024280057909334, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 20667 + }, + { + "epoch": 0.1802515218642619, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 20668 + }, + { + "epoch": 0.1802602431494305, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 20669 + }, + { + "epoch": 0.18026896443459908, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 20670 + }, + { + "epoch": 0.18027768571976766, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 20671 + }, + { + "epoch": 0.18028640700493626, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 20672 + }, + { + "epoch": 0.18029512829010483, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 20673 + }, + { + "epoch": 0.1803038495752734, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 20674 + }, + { + "epoch": 0.180312570860442, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 20675 + }, + { + "epoch": 0.18032129214561057, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 20676 + }, + { + "epoch": 0.18033001343077915, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 20677 + }, + { + "epoch": 0.18033873471594775, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 20678 + }, + { + "epoch": 0.18034745600111632, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 20679 + }, + { + "epoch": 0.18035617728628492, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 20680 + }, + { + "epoch": 0.1803648985714535, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 20681 + }, + { + "epoch": 0.18037361985662206, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 20682 + }, + { + "epoch": 0.18038234114179066, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 20683 + }, + { + "epoch": 0.18039106242695924, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 20684 + }, + { + "epoch": 0.1803997837121278, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 20685 + }, + { + "epoch": 0.1804085049972964, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 20686 + }, + { + "epoch": 0.18041722628246498, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 20687 + }, + { + "epoch": 0.18042594756763355, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 20688 + }, + { + "epoch": 0.18043466885280215, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 20689 + }, + { + "epoch": 0.18044339013797073, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 20690 + }, + { + "epoch": 0.18045211142313933, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 20691 + }, + { + "epoch": 0.1804608327083079, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 20692 + }, + { + "epoch": 0.18046955399347647, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 20693 + }, + { + "epoch": 0.18047827527864507, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 20694 + }, + { + "epoch": 0.18048699656381365, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 20695 + }, + { + "epoch": 0.18049571784898222, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 20696 + }, + { + "epoch": 0.18050443913415082, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 20697 + }, + { + "epoch": 0.1805131604193194, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 20698 + }, + { + "epoch": 0.18052188170448796, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 20699 + }, + { + "epoch": 0.18053060298965656, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 20700 + }, + { + "epoch": 0.18053932427482514, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 20701 + }, + { + "epoch": 0.1805480455599937, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 20702 + }, + { + "epoch": 0.1805567668451623, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 20703 + }, + { + "epoch": 0.18056548813033088, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 20704 + }, + { + "epoch": 0.18057420941549948, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 20705 + }, + { + "epoch": 0.18058293070066805, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 20706 + }, + { + "epoch": 0.18059165198583663, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 20707 + }, + { + "epoch": 0.18060037327100523, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 20708 + }, + { + "epoch": 0.1806090945561738, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 20709 + }, + { + "epoch": 0.18061781584134237, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 20710 + }, + { + "epoch": 0.18062653712651097, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 20711 + }, + { + "epoch": 0.18063525841167954, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 20712 + }, + { + "epoch": 0.18064397969684812, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 20713 + }, + { + "epoch": 0.18065270098201672, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 20714 + }, + { + "epoch": 0.1806614222671853, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 20715 + }, + { + "epoch": 0.18067014355235386, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 20716 + }, + { + "epoch": 0.18067886483752246, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 20717 + }, + { + "epoch": 0.18068758612269104, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 20718 + }, + { + "epoch": 0.18069630740785964, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 20719 + }, + { + "epoch": 0.1807050286930282, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 20720 + }, + { + "epoch": 0.18071374997819678, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 20721 + }, + { + "epoch": 0.18072247126336538, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 20722 + }, + { + "epoch": 0.18073119254853395, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 20723 + }, + { + "epoch": 0.18073991383370253, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 20724 + }, + { + "epoch": 0.18074863511887113, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 20725 + }, + { + "epoch": 0.1807573564040397, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 20726 + }, + { + "epoch": 0.18076607768920827, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 20727 + }, + { + "epoch": 0.18077479897437687, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 20728 + }, + { + "epoch": 0.18078352025954544, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 20729 + }, + { + "epoch": 0.18079224154471402, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 20730 + }, + { + "epoch": 0.18080096282988262, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 20731 + }, + { + "epoch": 0.1808096841150512, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 20732 + }, + { + "epoch": 0.1808184054002198, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 20733 + }, + { + "epoch": 0.18082712668538836, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 20734 + }, + { + "epoch": 0.18083584797055693, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 20735 + }, + { + "epoch": 0.18084456925572553, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 20736 + }, + { + "epoch": 0.1808532905408941, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 20737 + }, + { + "epoch": 0.18086201182606268, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 20738 + }, + { + "epoch": 0.18087073311123128, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 20739 + }, + { + "epoch": 0.18087945439639985, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 20740 + }, + { + "epoch": 0.18088817568156842, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 20741 + }, + { + "epoch": 0.18089689696673703, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 20742 + }, + { + "epoch": 0.1809056182519056, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 20743 + }, + { + "epoch": 0.18091433953707417, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 20744 + }, + { + "epoch": 0.18092306082224277, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 20745 + }, + { + "epoch": 0.18093178210741134, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 20746 + }, + { + "epoch": 0.18094050339257994, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 20747 + }, + { + "epoch": 0.18094922467774852, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 20748 + }, + { + "epoch": 0.1809579459629171, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 20749 + }, + { + "epoch": 0.1809666672480857, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 20750 + }, + { + "epoch": 0.18097538853325426, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 20751 + }, + { + "epoch": 0.18098410981842283, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 20752 + }, + { + "epoch": 0.18099283110359143, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 20753 + }, + { + "epoch": 0.18100155238876, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 20754 + }, + { + "epoch": 0.18101027367392858, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 20755 + }, + { + "epoch": 0.18101899495909718, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 20756 + }, + { + "epoch": 0.18102771624426575, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 20757 + }, + { + "epoch": 0.18103643752943432, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 20758 + }, + { + "epoch": 0.18104515881460292, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 20759 + }, + { + "epoch": 0.1810538800997715, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 20760 + }, + { + "epoch": 0.1810626013849401, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 20761 + }, + { + "epoch": 0.18107132267010867, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 20762 + }, + { + "epoch": 0.18108004395527724, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 20763 + }, + { + "epoch": 0.18108876524044584, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 20764 + }, + { + "epoch": 0.18109748652561442, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 20765 + }, + { + "epoch": 0.181106207810783, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 20766 + }, + { + "epoch": 0.1811149290959516, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 20767 + }, + { + "epoch": 0.18112365038112016, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 20768 + }, + { + "epoch": 0.18113237166628873, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 20769 + }, + { + "epoch": 0.18114109295145733, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 20770 + }, + { + "epoch": 0.1811498142366259, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 20771 + }, + { + "epoch": 0.18115853552179448, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 20772 + }, + { + "epoch": 0.18116725680696308, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 20773 + }, + { + "epoch": 0.18117597809213165, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 20774 + }, + { + "epoch": 0.18118469937730025, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 20775 + }, + { + "epoch": 0.18119342066246882, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 20776 + }, + { + "epoch": 0.1812021419476374, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 20777 + }, + { + "epoch": 0.181210863232806, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 20778 + }, + { + "epoch": 0.18121958451797457, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 20779 + }, + { + "epoch": 0.18122830580314314, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 20780 + }, + { + "epoch": 0.18123702708831174, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 20781 + }, + { + "epoch": 0.18124574837348031, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 20782 + }, + { + "epoch": 0.1812544696586489, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 20783 + }, + { + "epoch": 0.1812631909438175, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 20784 + }, + { + "epoch": 0.18127191222898606, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 20785 + }, + { + "epoch": 0.18128063351415463, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 20786 + }, + { + "epoch": 0.18128935479932323, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 20787 + }, + { + "epoch": 0.1812980760844918, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 20788 + }, + { + "epoch": 0.1813067973696604, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 20789 + }, + { + "epoch": 0.18131551865482898, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 20790 + }, + { + "epoch": 0.18132423993999755, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 20791 + }, + { + "epoch": 0.18133296122516615, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 20792 + }, + { + "epoch": 0.18134168251033472, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 20793 + }, + { + "epoch": 0.1813504037955033, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 20794 + }, + { + "epoch": 0.1813591250806719, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 20795 + }, + { + "epoch": 0.18136784636584047, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 20796 + }, + { + "epoch": 0.18137656765100904, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 20797 + }, + { + "epoch": 0.18138528893617764, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 20798 + }, + { + "epoch": 0.1813940102213462, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 20799 + }, + { + "epoch": 0.1814027315065148, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 20800 + }, + { + "epoch": 0.1814114527916834, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 20801 + }, + { + "epoch": 0.18142017407685196, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 20802 + }, + { + "epoch": 0.18142889536202056, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 20803 + }, + { + "epoch": 0.18143761664718913, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 20804 + }, + { + "epoch": 0.1814463379323577, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 20805 + }, + { + "epoch": 0.1814550592175263, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 20806 + }, + { + "epoch": 0.18146378050269488, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 20807 + }, + { + "epoch": 0.18147250178786345, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 20808 + }, + { + "epoch": 0.18148122307303205, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 20809 + }, + { + "epoch": 0.18148994435820062, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 20810 + }, + { + "epoch": 0.1814986656433692, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 20811 + }, + { + "epoch": 0.1815073869285378, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 20812 + }, + { + "epoch": 0.18151610821370637, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 20813 + }, + { + "epoch": 0.18152482949887497, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 20814 + }, + { + "epoch": 0.18153355078404354, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 20815 + }, + { + "epoch": 0.1815422720692121, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 20816 + }, + { + "epoch": 0.1815509933543807, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 20817 + }, + { + "epoch": 0.18155971463954929, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 20818 + }, + { + "epoch": 0.18156843592471786, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 20819 + }, + { + "epoch": 0.18157715720988646, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 20820 + }, + { + "epoch": 0.18158587849505503, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 20821 + }, + { + "epoch": 0.1815945997802236, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 20822 + }, + { + "epoch": 0.1816033210653922, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 20823 + }, + { + "epoch": 0.18161204235056078, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 20824 + }, + { + "epoch": 0.18162076363572935, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 20825 + }, + { + "epoch": 0.18162948492089795, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 20826 + }, + { + "epoch": 0.18163820620606652, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 20827 + }, + { + "epoch": 0.18164692749123512, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 20828 + }, + { + "epoch": 0.1816556487764037, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 20829 + }, + { + "epoch": 0.18166437006157227, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 20830 + }, + { + "epoch": 0.18167309134674087, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 20831 + }, + { + "epoch": 0.18168181263190944, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 20832 + }, + { + "epoch": 0.181690533917078, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 20833 + }, + { + "epoch": 0.1816992552022466, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 20834 + }, + { + "epoch": 0.18170797648741518, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 20835 + }, + { + "epoch": 0.18171669777258376, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 20836 + }, + { + "epoch": 0.18172541905775236, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 20837 + }, + { + "epoch": 0.18173414034292093, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 20838 + }, + { + "epoch": 0.1817428616280895, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 20839 + }, + { + "epoch": 0.1817515829132581, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 20840 + }, + { + "epoch": 0.18176030419842668, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 20841 + }, + { + "epoch": 0.18176902548359528, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 20842 + }, + { + "epoch": 0.18177774676876385, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 20843 + }, + { + "epoch": 0.18178646805393242, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 20844 + }, + { + "epoch": 0.18179518933910102, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 20845 + }, + { + "epoch": 0.1818039106242696, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 20846 + }, + { + "epoch": 0.18181263190943817, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 20847 + }, + { + "epoch": 0.18182135319460677, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 20848 + }, + { + "epoch": 0.18183007447977534, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 20849 + }, + { + "epoch": 0.1818387957649439, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 20850 + }, + { + "epoch": 0.1818475170501125, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 20851 + }, + { + "epoch": 0.18185623833528108, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 20852 + }, + { + "epoch": 0.18186495962044966, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 20853 + }, + { + "epoch": 0.18187368090561826, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 20854 + }, + { + "epoch": 0.18188240219078683, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 20855 + }, + { + "epoch": 0.18189112347595543, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 20856 + }, + { + "epoch": 0.181899844761124, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 20857 + }, + { + "epoch": 0.18190856604629257, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 20858 + }, + { + "epoch": 0.18191728733146117, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 20859 + }, + { + "epoch": 0.18192600861662975, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 20860 + }, + { + "epoch": 0.18193472990179832, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 20861 + }, + { + "epoch": 0.18194345118696692, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 20862 + }, + { + "epoch": 0.1819521724721355, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 20863 + }, + { + "epoch": 0.18196089375730407, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 20864 + }, + { + "epoch": 0.18196961504247267, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 20865 + }, + { + "epoch": 0.18197833632764124, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 20866 + }, + { + "epoch": 0.1819870576128098, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 20867 + }, + { + "epoch": 0.1819957788979784, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 20868 + }, + { + "epoch": 0.18200450018314698, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 20869 + }, + { + "epoch": 0.18201322146831558, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 20870 + }, + { + "epoch": 0.18202194275348416, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 20871 + }, + { + "epoch": 0.18203066403865273, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 20872 + }, + { + "epoch": 0.18203938532382133, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 20873 + }, + { + "epoch": 0.1820481066089899, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 20874 + }, + { + "epoch": 0.18205682789415847, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 20875 + }, + { + "epoch": 0.18206554917932707, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 20876 + }, + { + "epoch": 0.18207427046449565, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 20877 + }, + { + "epoch": 0.18208299174966422, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 20878 + }, + { + "epoch": 0.18209171303483282, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 20879 + }, + { + "epoch": 0.1821004343200014, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 20880 + }, + { + "epoch": 0.18210915560516996, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 20881 + }, + { + "epoch": 0.18211787689033856, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 20882 + }, + { + "epoch": 0.18212659817550714, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 20883 + }, + { + "epoch": 0.18213531946067574, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 20884 + }, + { + "epoch": 0.1821440407458443, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 20885 + }, + { + "epoch": 0.18215276203101288, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 20886 + }, + { + "epoch": 0.18216148331618148, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 20887 + }, + { + "epoch": 0.18217020460135006, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 20888 + }, + { + "epoch": 0.18217892588651863, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 20889 + }, + { + "epoch": 0.18218764717168723, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 20890 + }, + { + "epoch": 0.1821963684568558, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 20891 + }, + { + "epoch": 0.18220508974202437, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 20892 + }, + { + "epoch": 0.18221381102719297, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 20893 + }, + { + "epoch": 0.18222253231236155, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 20894 + }, + { + "epoch": 0.18223125359753012, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 20895 + }, + { + "epoch": 0.18223997488269872, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 20896 + }, + { + "epoch": 0.1822486961678673, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 20897 + }, + { + "epoch": 0.1822574174530359, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 20898 + }, + { + "epoch": 0.18226613873820446, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 20899 + }, + { + "epoch": 0.18227486002337304, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 20900 + }, + { + "epoch": 0.18228358130854164, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 20901 + }, + { + "epoch": 0.1822923025937102, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 20902 + }, + { + "epoch": 0.18230102387887878, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 20903 + }, + { + "epoch": 0.18230974516404738, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 20904 + }, + { + "epoch": 0.18231846644921595, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 20905 + }, + { + "epoch": 0.18232718773438453, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 20906 + }, + { + "epoch": 0.18233590901955313, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 20907 + }, + { + "epoch": 0.1823446303047217, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 20908 + }, + { + "epoch": 0.18235335158989027, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 20909 + }, + { + "epoch": 0.18236207287505887, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 20910 + }, + { + "epoch": 0.18237079416022745, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 20911 + }, + { + "epoch": 0.18237951544539605, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 20912 + }, + { + "epoch": 0.18238823673056462, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 20913 + }, + { + "epoch": 0.1823969580157332, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 20914 + }, + { + "epoch": 0.1824056793009018, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 20915 + }, + { + "epoch": 0.18241440058607036, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 20916 + }, + { + "epoch": 0.18242312187123894, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 20917 + }, + { + "epoch": 0.18243184315640754, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 20918 + }, + { + "epoch": 0.1824405644415761, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 20919 + }, + { + "epoch": 0.18244928572674468, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 20920 + }, + { + "epoch": 0.18245800701191328, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 20921 + }, + { + "epoch": 0.18246672829708185, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 20922 + }, + { + "epoch": 0.18247544958225045, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 20923 + }, + { + "epoch": 0.18248417086741903, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 20924 + }, + { + "epoch": 0.1824928921525876, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 20925 + }, + { + "epoch": 0.1825016134377562, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 20926 + }, + { + "epoch": 0.18251033472292477, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 20927 + }, + { + "epoch": 0.18251905600809334, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 20928 + }, + { + "epoch": 0.18252777729326194, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 20929 + }, + { + "epoch": 0.18253649857843052, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 20930 + }, + { + "epoch": 0.1825452198635991, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 20931 + }, + { + "epoch": 0.1825539411487677, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 20932 + }, + { + "epoch": 0.18256266243393626, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 20933 + }, + { + "epoch": 0.18257138371910483, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 20934 + }, + { + "epoch": 0.18258010500427344, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 20935 + }, + { + "epoch": 0.182588826289442, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 20936 + }, + { + "epoch": 0.1825975475746106, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 20937 + }, + { + "epoch": 0.18260626885977918, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 20938 + }, + { + "epoch": 0.18261499014494775, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 20939 + }, + { + "epoch": 0.18262371143011635, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 20940 + }, + { + "epoch": 0.18263243271528493, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 20941 + }, + { + "epoch": 0.1826411540004535, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 20942 + }, + { + "epoch": 0.1826498752856221, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 20943 + }, + { + "epoch": 0.18265859657079067, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 20944 + }, + { + "epoch": 0.18266731785595924, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 20945 + }, + { + "epoch": 0.18267603914112784, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 20946 + }, + { + "epoch": 0.18268476042629642, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 20947 + }, + { + "epoch": 0.182693481711465, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 20948 + }, + { + "epoch": 0.1827022029966336, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 20949 + }, + { + "epoch": 0.18271092428180216, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 20950 + }, + { + "epoch": 0.18271964556697076, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 20951 + }, + { + "epoch": 0.18272836685213933, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 20952 + }, + { + "epoch": 0.1827370881373079, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 20953 + }, + { + "epoch": 0.1827458094224765, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 20954 + }, + { + "epoch": 0.18275453070764508, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 20955 + }, + { + "epoch": 0.18276325199281365, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 20956 + }, + { + "epoch": 0.18277197327798225, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 20957 + }, + { + "epoch": 0.18278069456315083, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 20958 + }, + { + "epoch": 0.1827894158483194, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 20959 + }, + { + "epoch": 0.182798137133488, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 20960 + }, + { + "epoch": 0.18280685841865657, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 20961 + }, + { + "epoch": 0.18281557970382514, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 20962 + }, + { + "epoch": 0.18282430098899374, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 20963 + }, + { + "epoch": 0.18283302227416232, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 20964 + }, + { + "epoch": 0.18284174355933092, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 20965 + }, + { + "epoch": 0.1828504648444995, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 20966 + }, + { + "epoch": 0.18285918612966806, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 20967 + }, + { + "epoch": 0.18286790741483666, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 20968 + }, + { + "epoch": 0.18287662870000523, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 20969 + }, + { + "epoch": 0.1828853499851738, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 20970 + }, + { + "epoch": 0.1828940712703424, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 20971 + }, + { + "epoch": 0.18290279255551098, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 20972 + }, + { + "epoch": 0.18291151384067955, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 20973 + }, + { + "epoch": 0.18292023512584815, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 20974 + }, + { + "epoch": 0.18292895641101672, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 20975 + }, + { + "epoch": 0.1829376776961853, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 20976 + }, + { + "epoch": 0.1829463989813539, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 20977 + }, + { + "epoch": 0.18295512026652247, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 20978 + }, + { + "epoch": 0.18296384155169107, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 20979 + }, + { + "epoch": 0.18297256283685964, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 20980 + }, + { + "epoch": 0.18298128412202821, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 20981 + }, + { + "epoch": 0.18299000540719682, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 20982 + }, + { + "epoch": 0.1829987266923654, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 20983 + }, + { + "epoch": 0.18300744797753396, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 20984 + }, + { + "epoch": 0.18301616926270256, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 20985 + }, + { + "epoch": 0.18302489054787113, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 20986 + }, + { + "epoch": 0.1830336118330397, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 20987 + }, + { + "epoch": 0.1830423331182083, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 20988 + }, + { + "epoch": 0.18305105440337688, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 20989 + }, + { + "epoch": 0.18305977568854545, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 20990 + }, + { + "epoch": 0.18306849697371405, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 20991 + }, + { + "epoch": 0.18307721825888262, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 20992 + }, + { + "epoch": 0.18308593954405122, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 20993 + }, + { + "epoch": 0.1830946608292198, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 20994 + }, + { + "epoch": 0.18310338211438837, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 20995 + }, + { + "epoch": 0.18311210339955697, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 20996 + }, + { + "epoch": 0.18312082468472554, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 20997 + }, + { + "epoch": 0.18312954596989411, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 20998 + }, + { + "epoch": 0.18313826725506271, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 20999 + }, + { + "epoch": 0.1831469885402313, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 21000 + }, + { + "epoch": 0.18315570982539986, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 21001 + }, + { + "epoch": 0.18316443111056846, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 21002 + }, + { + "epoch": 0.18317315239573703, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 21003 + }, + { + "epoch": 0.1831818736809056, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 21004 + }, + { + "epoch": 0.1831905949660742, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 21005 + }, + { + "epoch": 0.18319931625124278, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 21006 + }, + { + "epoch": 0.18320803753641138, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 21007 + }, + { + "epoch": 0.18321675882157995, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 21008 + }, + { + "epoch": 0.18322548010674852, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 21009 + }, + { + "epoch": 0.18323420139191712, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 21010 + }, + { + "epoch": 0.1832429226770857, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 21011 + }, + { + "epoch": 0.18325164396225427, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 21012 + }, + { + "epoch": 0.18326036524742287, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 21013 + }, + { + "epoch": 0.18326908653259144, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 21014 + }, + { + "epoch": 0.18327780781776, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 21015 + }, + { + "epoch": 0.1832865291029286, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 21016 + }, + { + "epoch": 0.1832952503880972, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 21017 + }, + { + "epoch": 0.18330397167326576, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 21018 + }, + { + "epoch": 0.18331269295843436, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 21019 + }, + { + "epoch": 0.18332141424360293, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 21020 + }, + { + "epoch": 0.18333013552877153, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 21021 + }, + { + "epoch": 0.1833388568139401, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 21022 + }, + { + "epoch": 0.18334757809910868, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 21023 + }, + { + "epoch": 0.18335629938427728, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 21024 + }, + { + "epoch": 0.18336502066944585, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 21025 + }, + { + "epoch": 0.18337374195461442, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 21026 + }, + { + "epoch": 0.18338246323978302, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 21027 + }, + { + "epoch": 0.1833911845249516, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 21028 + }, + { + "epoch": 0.18339990581012017, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 21029 + }, + { + "epoch": 0.18340862709528877, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 21030 + }, + { + "epoch": 0.18341734838045734, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 21031 + }, + { + "epoch": 0.18342606966562594, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 21032 + }, + { + "epoch": 0.1834347909507945, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 21033 + }, + { + "epoch": 0.18344351223596309, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 21034 + }, + { + "epoch": 0.18345223352113169, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 21035 + }, + { + "epoch": 0.18346095480630026, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 21036 + }, + { + "epoch": 0.18346967609146883, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 21037 + }, + { + "epoch": 0.18347839737663743, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 21038 + }, + { + "epoch": 0.183487118661806, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 21039 + }, + { + "epoch": 0.18349583994697458, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 21040 + }, + { + "epoch": 0.18350456123214318, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 21041 + }, + { + "epoch": 0.18351328251731175, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 21042 + }, + { + "epoch": 0.18352200380248032, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 21043 + }, + { + "epoch": 0.18353072508764892, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 21044 + }, + { + "epoch": 0.1835394463728175, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 21045 + }, + { + "epoch": 0.1835481676579861, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 21046 + }, + { + "epoch": 0.18355688894315467, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 21047 + }, + { + "epoch": 0.18356561022832324, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 21048 + }, + { + "epoch": 0.18357433151349184, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 21049 + }, + { + "epoch": 0.1835830527986604, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 21050 + }, + { + "epoch": 0.18359177408382898, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 21051 + }, + { + "epoch": 0.18360049536899758, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 21052 + }, + { + "epoch": 0.18360921665416616, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 21053 + }, + { + "epoch": 0.18361793793933473, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 21054 + }, + { + "epoch": 0.18362665922450333, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 21055 + }, + { + "epoch": 0.1836353805096719, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 21056 + }, + { + "epoch": 0.18364410179484048, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 21057 + }, + { + "epoch": 0.18365282308000908, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 21058 + }, + { + "epoch": 0.18366154436517765, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 21059 + }, + { + "epoch": 0.18367026565034625, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 21060 + }, + { + "epoch": 0.18367898693551482, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 21061 + }, + { + "epoch": 0.1836877082206834, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 21062 + }, + { + "epoch": 0.183696429505852, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 21063 + }, + { + "epoch": 0.18370515079102057, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 21064 + }, + { + "epoch": 0.18371387207618914, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 21065 + }, + { + "epoch": 0.18372259336135774, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 21066 + }, + { + "epoch": 0.1837313146465263, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 21067 + }, + { + "epoch": 0.18374003593169488, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 21068 + }, + { + "epoch": 0.18374875721686348, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 21069 + }, + { + "epoch": 0.18375747850203206, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 21070 + }, + { + "epoch": 0.18376619978720063, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 21071 + }, + { + "epoch": 0.18377492107236923, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 21072 + }, + { + "epoch": 0.1837836423575378, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 21073 + }, + { + "epoch": 0.1837923636427064, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 21074 + }, + { + "epoch": 0.18380108492787497, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 21075 + }, + { + "epoch": 0.18380980621304355, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 21076 + }, + { + "epoch": 0.18381852749821215, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 21077 + }, + { + "epoch": 0.18382724878338072, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 21078 + }, + { + "epoch": 0.1838359700685493, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 21079 + }, + { + "epoch": 0.1838446913537179, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 21080 + }, + { + "epoch": 0.18385341263888647, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 21081 + }, + { + "epoch": 0.18386213392405504, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 21082 + }, + { + "epoch": 0.18387085520922364, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 21083 + }, + { + "epoch": 0.1838795764943922, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 21084 + }, + { + "epoch": 0.18388829777956078, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 21085 + }, + { + "epoch": 0.18389701906472938, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 21086 + }, + { + "epoch": 0.18390574034989796, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 21087 + }, + { + "epoch": 0.18391446163506656, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 21088 + }, + { + "epoch": 0.18392318292023513, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 21089 + }, + { + "epoch": 0.1839319042054037, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 21090 + }, + { + "epoch": 0.1839406254905723, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 21091 + }, + { + "epoch": 0.18394934677574087, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 21092 + }, + { + "epoch": 0.18395806806090945, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 21093 + }, + { + "epoch": 0.18396678934607805, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 21094 + }, + { + "epoch": 0.18397551063124662, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 21095 + }, + { + "epoch": 0.1839842319164152, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 21096 + }, + { + "epoch": 0.1839929532015838, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 21097 + }, + { + "epoch": 0.18400167448675236, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 21098 + }, + { + "epoch": 0.18401039577192094, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 21099 + }, + { + "epoch": 0.18401911705708954, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 21100 + }, + { + "epoch": 0.1840278383422581, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 21101 + }, + { + "epoch": 0.1840365596274267, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 21102 + }, + { + "epoch": 0.18404528091259528, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 21103 + }, + { + "epoch": 0.18405400219776386, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 21104 + }, + { + "epoch": 0.18406272348293246, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 21105 + }, + { + "epoch": 0.18407144476810103, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 21106 + }, + { + "epoch": 0.1840801660532696, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 21107 + }, + { + "epoch": 0.1840888873384382, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 21108 + }, + { + "epoch": 0.18409760862360677, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 21109 + }, + { + "epoch": 0.18410632990877535, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 21110 + }, + { + "epoch": 0.18411505119394395, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 21111 + }, + { + "epoch": 0.18412377247911252, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 21112 + }, + { + "epoch": 0.1841324937642811, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 21113 + }, + { + "epoch": 0.1841412150494497, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 21114 + }, + { + "epoch": 0.18414993633461826, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 21115 + }, + { + "epoch": 0.18415865761978686, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 21116 + }, + { + "epoch": 0.18416737890495544, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 21117 + }, + { + "epoch": 0.184176100190124, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 21118 + }, + { + "epoch": 0.1841848214752926, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 21119 + }, + { + "epoch": 0.18419354276046118, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 21120 + }, + { + "epoch": 0.18420226404562975, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 21121 + }, + { + "epoch": 0.18421098533079835, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 21122 + }, + { + "epoch": 0.18421970661596693, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 21123 + }, + { + "epoch": 0.1842284279011355, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 21124 + }, + { + "epoch": 0.1842371491863041, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 21125 + }, + { + "epoch": 0.18424587047147267, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 21126 + }, + { + "epoch": 0.18425459175664125, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 21127 + }, + { + "epoch": 0.18426331304180985, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 21128 + }, + { + "epoch": 0.18427203432697842, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 21129 + }, + { + "epoch": 0.18428075561214702, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 21130 + }, + { + "epoch": 0.1842894768973156, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 21131 + }, + { + "epoch": 0.18429819818248416, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 21132 + }, + { + "epoch": 0.18430691946765276, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 21133 + }, + { + "epoch": 0.18431564075282134, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 21134 + }, + { + "epoch": 0.1843243620379899, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 21135 + }, + { + "epoch": 0.1843330833231585, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 21136 + }, + { + "epoch": 0.18434180460832708, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 21137 + }, + { + "epoch": 0.18435052589349565, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 21138 + }, + { + "epoch": 0.18435924717866425, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 21139 + }, + { + "epoch": 0.18436796846383283, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 21140 + }, + { + "epoch": 0.18437668974900143, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 21141 + }, + { + "epoch": 0.18438541103417, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 21142 + }, + { + "epoch": 0.18439413231933857, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 21143 + }, + { + "epoch": 0.18440285360450717, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 21144 + }, + { + "epoch": 0.18441157488967574, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 21145 + }, + { + "epoch": 0.18442029617484432, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 21146 + }, + { + "epoch": 0.18442901746001292, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 21147 + }, + { + "epoch": 0.1844377387451815, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 21148 + }, + { + "epoch": 0.18444646003035006, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 21149 + }, + { + "epoch": 0.18445518131551866, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 21150 + }, + { + "epoch": 0.18446390260068724, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 21151 + }, + { + "epoch": 0.1844726238858558, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 21152 + }, + { + "epoch": 0.1844813451710244, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 21153 + }, + { + "epoch": 0.18449006645619298, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 21154 + }, + { + "epoch": 0.18449878774136158, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 21155 + }, + { + "epoch": 0.18450750902653015, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 21156 + }, + { + "epoch": 0.18451623031169873, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 21157 + }, + { + "epoch": 0.18452495159686733, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 21158 + }, + { + "epoch": 0.1845336728820359, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 21159 + }, + { + "epoch": 0.18454239416720447, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 21160 + }, + { + "epoch": 0.18455111545237307, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 21161 + }, + { + "epoch": 0.18455983673754164, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 21162 + }, + { + "epoch": 0.18456855802271022, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 21163 + }, + { + "epoch": 0.18457727930787882, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 21164 + }, + { + "epoch": 0.1845860005930474, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 21165 + }, + { + "epoch": 0.18459472187821596, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 21166 + }, + { + "epoch": 0.18460344316338456, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 21167 + }, + { + "epoch": 0.18461216444855313, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 21168 + }, + { + "epoch": 0.18462088573372173, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 21169 + }, + { + "epoch": 0.1846296070188903, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 21170 + }, + { + "epoch": 0.18463832830405888, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 21171 + }, + { + "epoch": 0.18464704958922748, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 21172 + }, + { + "epoch": 0.18465577087439605, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 21173 + }, + { + "epoch": 0.18466449215956462, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 21174 + }, + { + "epoch": 0.18467321344473323, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 21175 + }, + { + "epoch": 0.1846819347299018, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 21176 + }, + { + "epoch": 0.18469065601507037, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 21177 + }, + { + "epoch": 0.18469937730023897, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 21178 + }, + { + "epoch": 0.18470809858540754, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 21179 + }, + { + "epoch": 0.18471681987057612, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 21180 + }, + { + "epoch": 0.18472554115574472, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 21181 + }, + { + "epoch": 0.1847342624409133, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 21182 + }, + { + "epoch": 0.1847429837260819, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 21183 + }, + { + "epoch": 0.18475170501125046, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 21184 + }, + { + "epoch": 0.18476042629641903, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 21185 + }, + { + "epoch": 0.18476914758158763, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 21186 + }, + { + "epoch": 0.1847778688667562, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 21187 + }, + { + "epoch": 0.18478659015192478, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 21188 + }, + { + "epoch": 0.18479531143709338, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 21189 + }, + { + "epoch": 0.18480403272226195, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 21190 + }, + { + "epoch": 0.18481275400743052, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 21191 + }, + { + "epoch": 0.18482147529259912, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 21192 + }, + { + "epoch": 0.1848301965777677, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 21193 + }, + { + "epoch": 0.18483891786293627, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 21194 + }, + { + "epoch": 0.18484763914810487, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 21195 + }, + { + "epoch": 0.18485636043327344, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 21196 + }, + { + "epoch": 0.18486508171844204, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 21197 + }, + { + "epoch": 0.18487380300361062, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 21198 + }, + { + "epoch": 0.1848825242887792, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 21199 + }, + { + "epoch": 0.1848912455739478, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 21200 + }, + { + "epoch": 0.18489996685911636, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 21201 + }, + { + "epoch": 0.18490868814428493, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 21202 + }, + { + "epoch": 0.18491740942945353, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 21203 + }, + { + "epoch": 0.1849261307146221, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 21204 + }, + { + "epoch": 0.18493485199979068, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 21205 + }, + { + "epoch": 0.18494357328495928, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 21206 + }, + { + "epoch": 0.18495229457012785, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 21207 + }, + { + "epoch": 0.18496101585529642, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 21208 + }, + { + "epoch": 0.18496973714046502, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 21209 + }, + { + "epoch": 0.1849784584256336, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 21210 + }, + { + "epoch": 0.1849871797108022, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 21211 + }, + { + "epoch": 0.18499590099597077, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 21212 + }, + { + "epoch": 0.18500462228113934, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 21213 + }, + { + "epoch": 0.18501334356630794, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 21214 + }, + { + "epoch": 0.18502206485147651, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 21215 + }, + { + "epoch": 0.1850307861366451, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 21216 + }, + { + "epoch": 0.1850395074218137, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 21217 + }, + { + "epoch": 0.18504822870698226, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 21218 + }, + { + "epoch": 0.18505694999215083, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 21219 + }, + { + "epoch": 0.18506567127731943, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 21220 + }, + { + "epoch": 0.185074392562488, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 21221 + }, + { + "epoch": 0.18508311384765658, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 21222 + }, + { + "epoch": 0.18509183513282518, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 21223 + }, + { + "epoch": 0.18510055641799375, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 21224 + }, + { + "epoch": 0.18510927770316235, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 21225 + }, + { + "epoch": 0.18511799898833092, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 21226 + }, + { + "epoch": 0.1851267202734995, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 21227 + }, + { + "epoch": 0.1851354415586681, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 21228 + }, + { + "epoch": 0.18514416284383667, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 21229 + }, + { + "epoch": 0.18515288412900524, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 21230 + }, + { + "epoch": 0.18516160541417384, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 21231 + }, + { + "epoch": 0.1851703266993424, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 21232 + }, + { + "epoch": 0.18517904798451099, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 21233 + }, + { + "epoch": 0.1851877692696796, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 21234 + }, + { + "epoch": 0.18519649055484816, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 21235 + }, + { + "epoch": 0.18520521184001673, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 21236 + }, + { + "epoch": 0.18521393312518533, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 21237 + }, + { + "epoch": 0.1852226544103539, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 21238 + }, + { + "epoch": 0.1852313756955225, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 21239 + }, + { + "epoch": 0.18524009698069108, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 21240 + }, + { + "epoch": 0.18524881826585965, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 21241 + }, + { + "epoch": 0.18525753955102825, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 21242 + }, + { + "epoch": 0.18526626083619682, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 21243 + }, + { + "epoch": 0.1852749821213654, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 21244 + }, + { + "epoch": 0.185283703406534, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 21245 + }, + { + "epoch": 0.18529242469170257, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 21246 + }, + { + "epoch": 0.18530114597687114, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 21247 + }, + { + "epoch": 0.18530986726203974, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 21248 + }, + { + "epoch": 0.1853185885472083, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 21249 + }, + { + "epoch": 0.18532730983237689, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 21250 + }, + { + "epoch": 0.18533603111754549, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 21251 + }, + { + "epoch": 0.18534475240271406, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 21252 + }, + { + "epoch": 0.18535347368788266, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 21253 + }, + { + "epoch": 0.18536219497305123, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 21254 + }, + { + "epoch": 0.1853709162582198, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 21255 + }, + { + "epoch": 0.1853796375433884, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 21256 + }, + { + "epoch": 0.18538835882855698, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 21257 + }, + { + "epoch": 0.18539708011372555, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 21258 + }, + { + "epoch": 0.18540580139889415, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 21259 + }, + { + "epoch": 0.18541452268406272, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 21260 + }, + { + "epoch": 0.1854232439692313, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 21261 + }, + { + "epoch": 0.1854319652543999, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 21262 + }, + { + "epoch": 0.18544068653956847, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 21263 + }, + { + "epoch": 0.18544940782473707, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 21264 + }, + { + "epoch": 0.18545812910990564, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 21265 + }, + { + "epoch": 0.1854668503950742, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 21266 + }, + { + "epoch": 0.1854755716802428, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 21267 + }, + { + "epoch": 0.18548429296541138, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 21268 + }, + { + "epoch": 0.18549301425057996, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 21269 + }, + { + "epoch": 0.18550173553574856, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 21270 + }, + { + "epoch": 0.18551045682091713, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 21271 + }, + { + "epoch": 0.1855191781060857, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 21272 + }, + { + "epoch": 0.1855278993912543, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 21273 + }, + { + "epoch": 0.18553662067642288, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 21274 + }, + { + "epoch": 0.18554534196159145, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 21275 + }, + { + "epoch": 0.18555406324676005, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 21276 + }, + { + "epoch": 0.18556278453192862, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 21277 + }, + { + "epoch": 0.18557150581709722, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 21278 + }, + { + "epoch": 0.1855802271022658, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 21279 + }, + { + "epoch": 0.18558894838743437, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 21280 + }, + { + "epoch": 0.18559766967260297, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 21281 + }, + { + "epoch": 0.18560639095777154, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 21282 + }, + { + "epoch": 0.1856151122429401, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 21283 + }, + { + "epoch": 0.1856238335281087, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 21284 + }, + { + "epoch": 0.18563255481327728, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 21285 + }, + { + "epoch": 0.18564127609844586, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 21286 + }, + { + "epoch": 0.18564999738361446, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 21287 + }, + { + "epoch": 0.18565871866878303, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 21288 + }, + { + "epoch": 0.1856674399539516, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 21289 + }, + { + "epoch": 0.1856761612391202, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 21290 + }, + { + "epoch": 0.18568488252428877, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 21291 + }, + { + "epoch": 0.18569360380945737, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 21292 + }, + { + "epoch": 0.18570232509462595, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 21293 + }, + { + "epoch": 0.18571104637979452, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 21294 + }, + { + "epoch": 0.18571976766496312, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 21295 + }, + { + "epoch": 0.1857284889501317, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 21296 + }, + { + "epoch": 0.18573721023530027, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 21297 + }, + { + "epoch": 0.18574593152046887, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 21298 + }, + { + "epoch": 0.18575465280563744, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 21299 + }, + { + "epoch": 0.185763374090806, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 21300 + }, + { + "epoch": 0.1857720953759746, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 21301 + }, + { + "epoch": 0.18578081666114318, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 21302 + }, + { + "epoch": 0.18578953794631176, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 21303 + }, + { + "epoch": 0.18579825923148036, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 21304 + }, + { + "epoch": 0.18580698051664893, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 21305 + }, + { + "epoch": 0.18581570180181753, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 21306 + }, + { + "epoch": 0.1858244230869861, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 21307 + }, + { + "epoch": 0.18583314437215467, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 21308 + }, + { + "epoch": 0.18584186565732327, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 21309 + }, + { + "epoch": 0.18585058694249185, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 21310 + }, + { + "epoch": 0.18585930822766042, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 21311 + }, + { + "epoch": 0.18586802951282902, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 21312 + }, + { + "epoch": 0.1858767507979976, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 21313 + }, + { + "epoch": 0.18588547208316616, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 21314 + }, + { + "epoch": 0.18589419336833476, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 21315 + }, + { + "epoch": 0.18590291465350334, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 21316 + }, + { + "epoch": 0.1859116359386719, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 21317 + }, + { + "epoch": 0.1859203572238405, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 21318 + }, + { + "epoch": 0.18592907850900908, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 21319 + }, + { + "epoch": 0.18593779979417768, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 21320 + }, + { + "epoch": 0.18594652107934626, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 21321 + }, + { + "epoch": 0.18595524236451483, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 21322 + }, + { + "epoch": 0.18596396364968343, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 21323 + }, + { + "epoch": 0.185972684934852, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 21324 + }, + { + "epoch": 0.18598140622002057, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 21325 + }, + { + "epoch": 0.18599012750518917, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 21326 + }, + { + "epoch": 0.18599884879035775, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 21327 + }, + { + "epoch": 0.18600757007552632, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 21328 + }, + { + "epoch": 0.18601629136069492, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 21329 + }, + { + "epoch": 0.1860250126458635, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 21330 + }, + { + "epoch": 0.18603373393103206, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 21331 + }, + { + "epoch": 0.18604245521620066, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 21332 + }, + { + "epoch": 0.18605117650136924, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 21333 + }, + { + "epoch": 0.18605989778653784, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 21334 + }, + { + "epoch": 0.1860686190717064, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 21335 + }, + { + "epoch": 0.18607734035687498, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 21336 + }, + { + "epoch": 0.18608606164204358, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 21337 + }, + { + "epoch": 0.18609478292721215, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 21338 + }, + { + "epoch": 0.18610350421238073, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 21339 + }, + { + "epoch": 0.18611222549754933, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 21340 + }, + { + "epoch": 0.1861209467827179, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 21341 + }, + { + "epoch": 0.18612966806788647, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 21342 + }, + { + "epoch": 0.18613838935305507, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 21343 + }, + { + "epoch": 0.18614711063822365, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 21344 + }, + { + "epoch": 0.18615583192339222, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 21345 + }, + { + "epoch": 0.18616455320856082, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 21346 + }, + { + "epoch": 0.1861732744937294, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 21347 + }, + { + "epoch": 0.186181995778898, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 21348 + }, + { + "epoch": 0.18619071706406656, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 21349 + }, + { + "epoch": 0.18619943834923514, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 21350 + }, + { + "epoch": 0.18620815963440374, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 21351 + }, + { + "epoch": 0.1862168809195723, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 21352 + }, + { + "epoch": 0.18622560220474088, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 21353 + }, + { + "epoch": 0.18623432348990948, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 21354 + }, + { + "epoch": 0.18624304477507805, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 21355 + }, + { + "epoch": 0.18625176606024663, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 21356 + }, + { + "epoch": 0.18626048734541523, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 21357 + }, + { + "epoch": 0.1862692086305838, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 21358 + }, + { + "epoch": 0.18627792991575237, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 21359 + }, + { + "epoch": 0.18628665120092097, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 21360 + }, + { + "epoch": 0.18629537248608954, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 21361 + }, + { + "epoch": 0.18630409377125814, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 21362 + }, + { + "epoch": 0.18631281505642672, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 21363 + }, + { + "epoch": 0.1863215363415953, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 21364 + }, + { + "epoch": 0.1863302576267639, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 21365 + }, + { + "epoch": 0.18633897891193246, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 21366 + }, + { + "epoch": 0.18634770019710103, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 21367 + }, + { + "epoch": 0.18635642148226964, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 21368 + }, + { + "epoch": 0.1863651427674382, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 21369 + }, + { + "epoch": 0.18637386405260678, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 21370 + }, + { + "epoch": 0.18638258533777538, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 21371 + }, + { + "epoch": 0.18639130662294395, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 21372 + }, + { + "epoch": 0.18640002790811255, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 21373 + }, + { + "epoch": 0.18640874919328113, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 21374 + }, + { + "epoch": 0.1864174704784497, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 21375 + }, + { + "epoch": 0.1864261917636183, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 21376 + }, + { + "epoch": 0.18643491304878687, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 21377 + }, + { + "epoch": 0.18644363433395544, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 21378 + }, + { + "epoch": 0.18645235561912404, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 21379 + }, + { + "epoch": 0.18646107690429262, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 21380 + }, + { + "epoch": 0.1864697981894612, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 21381 + }, + { + "epoch": 0.1864785194746298, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 21382 + }, + { + "epoch": 0.18648724075979836, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 21383 + }, + { + "epoch": 0.18649596204496693, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 21384 + }, + { + "epoch": 0.18650468333013553, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 21385 + }, + { + "epoch": 0.1865134046153041, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 21386 + }, + { + "epoch": 0.1865221259004727, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 21387 + }, + { + "epoch": 0.18653084718564128, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 21388 + }, + { + "epoch": 0.18653956847080985, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 21389 + }, + { + "epoch": 0.18654828975597845, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 21390 + }, + { + "epoch": 0.18655701104114703, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 21391 + }, + { + "epoch": 0.1865657323263156, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 21392 + }, + { + "epoch": 0.1865744536114842, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 21393 + }, + { + "epoch": 0.18658317489665277, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 21394 + }, + { + "epoch": 0.18659189618182134, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 21395 + }, + { + "epoch": 0.18660061746698994, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 21396 + }, + { + "epoch": 0.18660933875215852, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 21397 + }, + { + "epoch": 0.1866180600373271, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 21398 + }, + { + "epoch": 0.1866267813224957, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 21399 + }, + { + "epoch": 0.18663550260766426, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 21400 + }, + { + "epoch": 0.18664422389283286, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 21401 + }, + { + "epoch": 0.18665294517800143, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 21402 + }, + { + "epoch": 0.18666166646317, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 21403 + }, + { + "epoch": 0.1866703877483386, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 21404 + }, + { + "epoch": 0.18667910903350718, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 21405 + }, + { + "epoch": 0.18668783031867575, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 21406 + }, + { + "epoch": 0.18669655160384435, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 21407 + }, + { + "epoch": 0.18670527288901292, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 21408 + }, + { + "epoch": 0.1867139941741815, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 21409 + }, + { + "epoch": 0.1867227154593501, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 21410 + }, + { + "epoch": 0.18673143674451867, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 21411 + }, + { + "epoch": 0.18674015802968724, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 21412 + }, + { + "epoch": 0.18674887931485584, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 21413 + }, + { + "epoch": 0.18675760060002441, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 21414 + }, + { + "epoch": 0.18676632188519302, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 21415 + }, + { + "epoch": 0.1867750431703616, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 21416 + }, + { + "epoch": 0.18678376445553016, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 21417 + }, + { + "epoch": 0.18679248574069876, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 21418 + }, + { + "epoch": 0.18680120702586733, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 21419 + }, + { + "epoch": 0.1868099283110359, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 21420 + }, + { + "epoch": 0.1868186495962045, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 21421 + }, + { + "epoch": 0.18682737088137308, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 21422 + }, + { + "epoch": 0.18683609216654165, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 21423 + }, + { + "epoch": 0.18684481345171025, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 21424 + }, + { + "epoch": 0.18685353473687882, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 21425 + }, + { + "epoch": 0.1868622560220474, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 21426 + }, + { + "epoch": 0.186870977307216, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 21427 + }, + { + "epoch": 0.18687969859238457, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 21428 + }, + { + "epoch": 0.18688841987755317, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 21429 + }, + { + "epoch": 0.18689714116272174, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 21430 + }, + { + "epoch": 0.18690586244789031, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 21431 + }, + { + "epoch": 0.18691458373305891, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 21432 + }, + { + "epoch": 0.1869233050182275, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 21433 + }, + { + "epoch": 0.18693202630339606, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 21434 + }, + { + "epoch": 0.18694074758856466, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 21435 + }, + { + "epoch": 0.18694946887373323, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 21436 + }, + { + "epoch": 0.1869581901589018, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 21437 + }, + { + "epoch": 0.1869669114440704, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 21438 + }, + { + "epoch": 0.18697563272923898, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 21439 + }, + { + "epoch": 0.18698435401440755, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 21440 + }, + { + "epoch": 0.18699307529957615, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 21441 + }, + { + "epoch": 0.18700179658474472, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 21442 + }, + { + "epoch": 0.18701051786991332, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 21443 + }, + { + "epoch": 0.1870192391550819, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 21444 + }, + { + "epoch": 0.18702796044025047, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 21445 + }, + { + "epoch": 0.18703668172541907, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 21446 + }, + { + "epoch": 0.18704540301058764, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 21447 + }, + { + "epoch": 0.1870541242957562, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 21448 + }, + { + "epoch": 0.1870628455809248, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 21449 + }, + { + "epoch": 0.1870715668660934, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 21450 + }, + { + "epoch": 0.18708028815126196, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 21451 + }, + { + "epoch": 0.18708900943643056, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 21452 + }, + { + "epoch": 0.18709773072159913, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 21453 + }, + { + "epoch": 0.1871064520067677, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 21454 + }, + { + "epoch": 0.1871151732919363, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 21455 + }, + { + "epoch": 0.18712389457710488, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 21456 + }, + { + "epoch": 0.18713261586227348, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 21457 + }, + { + "epoch": 0.18714133714744205, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 21458 + }, + { + "epoch": 0.18715005843261062, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 21459 + }, + { + "epoch": 0.18715877971777922, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 21460 + }, + { + "epoch": 0.1871675010029478, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 21461 + }, + { + "epoch": 0.18717622228811637, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 21462 + }, + { + "epoch": 0.18718494357328497, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 21463 + }, + { + "epoch": 0.18719366485845354, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 21464 + }, + { + "epoch": 0.1872023861436221, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 21465 + }, + { + "epoch": 0.1872111074287907, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 21466 + }, + { + "epoch": 0.18721982871395929, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 21467 + }, + { + "epoch": 0.18722854999912786, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 21468 + }, + { + "epoch": 0.18723727128429646, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 21469 + }, + { + "epoch": 0.18724599256946503, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 21470 + }, + { + "epoch": 0.18725471385463363, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 21471 + }, + { + "epoch": 0.1872634351398022, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 21472 + }, + { + "epoch": 0.18727215642497078, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 21473 + }, + { + "epoch": 0.18728087771013938, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 21474 + }, + { + "epoch": 0.18728959899530795, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 21475 + }, + { + "epoch": 0.18729832028047652, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 21476 + }, + { + "epoch": 0.18730704156564512, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 21477 + }, + { + "epoch": 0.1873157628508137, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 21478 + }, + { + "epoch": 0.18732448413598227, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 21479 + }, + { + "epoch": 0.18733320542115087, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 21480 + }, + { + "epoch": 0.18734192670631944, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 21481 + }, + { + "epoch": 0.187350647991488, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 21482 + }, + { + "epoch": 0.1873593692766566, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 21483 + }, + { + "epoch": 0.18736809056182518, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 21484 + }, + { + "epoch": 0.18737681184699378, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 21485 + }, + { + "epoch": 0.18738553313216236, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 21486 + }, + { + "epoch": 0.18739425441733093, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 21487 + }, + { + "epoch": 0.18740297570249953, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 21488 + }, + { + "epoch": 0.1874116969876681, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 21489 + }, + { + "epoch": 0.18742041827283668, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 21490 + }, + { + "epoch": 0.18742913955800528, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 21491 + }, + { + "epoch": 0.18743786084317385, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 21492 + }, + { + "epoch": 0.18744658212834242, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 21493 + }, + { + "epoch": 0.18745530341351102, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 21494 + }, + { + "epoch": 0.1874640246986796, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 21495 + }, + { + "epoch": 0.1874727459838482, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 21496 + }, + { + "epoch": 0.18748146726901677, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 21497 + }, + { + "epoch": 0.18749018855418534, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 21498 + }, + { + "epoch": 0.18749890983935394, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 21499 + }, + { + "epoch": 0.1875076311245225, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 21500 + }, + { + "epoch": 0.18751635240969108, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 21501 + }, + { + "epoch": 0.18752507369485968, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 21502 + }, + { + "epoch": 0.18753379498002826, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 21503 + }, + { + "epoch": 0.18754251626519683, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 21504 + }, + { + "epoch": 0.18755123755036543, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 21505 + }, + { + "epoch": 0.187559958835534, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 21506 + }, + { + "epoch": 0.18756868012070257, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 21507 + }, + { + "epoch": 0.18757740140587117, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 21508 + }, + { + "epoch": 0.18758612269103975, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 21509 + }, + { + "epoch": 0.18759484397620835, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 21510 + }, + { + "epoch": 0.18760356526137692, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 21511 + }, + { + "epoch": 0.1876122865465455, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 21512 + }, + { + "epoch": 0.1876210078317141, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 21513 + }, + { + "epoch": 0.18762972911688267, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 21514 + }, + { + "epoch": 0.18763845040205124, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 21515 + }, + { + "epoch": 0.18764717168721984, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 21516 + }, + { + "epoch": 0.1876558929723884, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 21517 + }, + { + "epoch": 0.18766461425755698, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 21518 + }, + { + "epoch": 0.18767333554272558, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 21519 + }, + { + "epoch": 0.18768205682789416, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 21520 + }, + { + "epoch": 0.18769077811306273, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 21521 + }, + { + "epoch": 0.18769949939823133, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 21522 + }, + { + "epoch": 0.1877082206833999, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 21523 + }, + { + "epoch": 0.1877169419685685, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 21524 + }, + { + "epoch": 0.18772566325373707, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 21525 + }, + { + "epoch": 0.18773438453890565, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 21526 + }, + { + "epoch": 0.18774310582407425, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 21527 + }, + { + "epoch": 0.18775182710924282, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 21528 + }, + { + "epoch": 0.1877605483944114, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 21529 + }, + { + "epoch": 0.18776926967958, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 21530 + }, + { + "epoch": 0.18777799096474856, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 21531 + }, + { + "epoch": 0.18778671224991714, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 21532 + }, + { + "epoch": 0.18779543353508574, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 21533 + }, + { + "epoch": 0.1878041548202543, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 21534 + }, + { + "epoch": 0.18781287610542288, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 21535 + }, + { + "epoch": 0.18782159739059148, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 21536 + }, + { + "epoch": 0.18783031867576006, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 21537 + }, + { + "epoch": 0.18783903996092866, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 21538 + }, + { + "epoch": 0.18784776124609723, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 21539 + }, + { + "epoch": 0.1878564825312658, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 21540 + }, + { + "epoch": 0.1878652038164344, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 21541 + }, + { + "epoch": 0.18787392510160297, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 21542 + }, + { + "epoch": 0.18788264638677155, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 21543 + }, + { + "epoch": 0.18789136767194015, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 21544 + }, + { + "epoch": 0.18790008895710872, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 21545 + }, + { + "epoch": 0.1879088102422773, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 21546 + }, + { + "epoch": 0.1879175315274459, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 21547 + }, + { + "epoch": 0.18792625281261446, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 21548 + }, + { + "epoch": 0.18793497409778304, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 21549 + }, + { + "epoch": 0.18794369538295164, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 21550 + }, + { + "epoch": 0.1879524166681202, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 21551 + }, + { + "epoch": 0.1879611379532888, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 21552 + }, + { + "epoch": 0.18796985923845738, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 21553 + }, + { + "epoch": 0.18797858052362595, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 21554 + }, + { + "epoch": 0.18798730180879455, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 21555 + }, + { + "epoch": 0.18799602309396313, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 21556 + }, + { + "epoch": 0.1880047443791317, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 21557 + }, + { + "epoch": 0.1880134656643003, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 21558 + }, + { + "epoch": 0.18802218694946887, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 21559 + }, + { + "epoch": 0.18803090823463744, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 21560 + }, + { + "epoch": 0.18803962951980605, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 21561 + }, + { + "epoch": 0.18804835080497462, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 21562 + }, + { + "epoch": 0.1880570720901432, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 21563 + }, + { + "epoch": 0.1880657933753118, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 21564 + }, + { + "epoch": 0.18807451466048036, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 21565 + }, + { + "epoch": 0.18808323594564896, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 21566 + }, + { + "epoch": 0.18809195723081754, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 21567 + }, + { + "epoch": 0.1881006785159861, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 21568 + }, + { + "epoch": 0.1881093998011547, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 21569 + }, + { + "epoch": 0.18811812108632328, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 21570 + }, + { + "epoch": 0.18812684237149185, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 21571 + }, + { + "epoch": 0.18813556365666045, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 21572 + }, + { + "epoch": 0.18814428494182903, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 21573 + }, + { + "epoch": 0.1881530062269976, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 21574 + }, + { + "epoch": 0.1881617275121662, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 21575 + }, + { + "epoch": 0.18817044879733477, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 21576 + }, + { + "epoch": 0.18817917008250334, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 21577 + }, + { + "epoch": 0.18818789136767194, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 21578 + }, + { + "epoch": 0.18819661265284052, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 21579 + }, + { + "epoch": 0.18820533393800912, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 21580 + }, + { + "epoch": 0.1882140552231777, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 21581 + }, + { + "epoch": 0.18822277650834626, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 21582 + }, + { + "epoch": 0.18823149779351486, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 21583 + }, + { + "epoch": 0.18824021907868344, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 21584 + }, + { + "epoch": 0.188248940363852, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 21585 + }, + { + "epoch": 0.1882576616490206, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 21586 + }, + { + "epoch": 0.18826638293418918, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 21587 + }, + { + "epoch": 0.18827510421935775, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 21588 + }, + { + "epoch": 0.18828382550452635, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 21589 + }, + { + "epoch": 0.18829254678969493, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 21590 + }, + { + "epoch": 0.1883012680748635, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 21591 + }, + { + "epoch": 0.1883099893600321, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 21592 + }, + { + "epoch": 0.18831871064520067, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 21593 + }, + { + "epoch": 0.18832743193036927, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 21594 + }, + { + "epoch": 0.18833615321553784, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 21595 + }, + { + "epoch": 0.18834487450070642, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 21596 + }, + { + "epoch": 0.18835359578587502, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 21597 + }, + { + "epoch": 0.1883623170710436, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 21598 + }, + { + "epoch": 0.18837103835621216, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 21599 + }, + { + "epoch": 0.18837975964138076, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 21600 + }, + { + "epoch": 0.18838848092654933, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 21601 + }, + { + "epoch": 0.1883972022117179, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 21602 + }, + { + "epoch": 0.1884059234968865, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 21603 + }, + { + "epoch": 0.18841464478205508, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 21604 + }, + { + "epoch": 0.18842336606722368, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 21605 + }, + { + "epoch": 0.18843208735239225, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 21606 + }, + { + "epoch": 0.18844080863756082, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 21607 + }, + { + "epoch": 0.18844952992272943, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 21608 + }, + { + "epoch": 0.188458251207898, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 21609 + }, + { + "epoch": 0.18846697249306657, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 21610 + }, + { + "epoch": 0.18847569377823517, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 21611 + }, + { + "epoch": 0.18848441506340374, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 21612 + }, + { + "epoch": 0.18849313634857232, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 21613 + }, + { + "epoch": 0.18850185763374092, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 21614 + }, + { + "epoch": 0.1885105789189095, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 21615 + }, + { + "epoch": 0.18851930020407806, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 21616 + }, + { + "epoch": 0.18852802148924666, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 21617 + }, + { + "epoch": 0.18853674277441523, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 21618 + }, + { + "epoch": 0.18854546405958383, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 21619 + }, + { + "epoch": 0.1885541853447524, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 21620 + }, + { + "epoch": 0.18856290662992098, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 21621 + }, + { + "epoch": 0.18857162791508958, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 21622 + }, + { + "epoch": 0.18858034920025815, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 21623 + }, + { + "epoch": 0.18858907048542672, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 21624 + }, + { + "epoch": 0.18859779177059532, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 21625 + }, + { + "epoch": 0.1886065130557639, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 21626 + }, + { + "epoch": 0.18861523434093247, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 21627 + }, + { + "epoch": 0.18862395562610107, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 21628 + }, + { + "epoch": 0.18863267691126964, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 21629 + }, + { + "epoch": 0.18864139819643821, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 21630 + }, + { + "epoch": 0.18865011948160681, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 21631 + }, + { + "epoch": 0.1886588407667754, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 21632 + }, + { + "epoch": 0.188667562051944, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 21633 + }, + { + "epoch": 0.18867628333711256, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 21634 + }, + { + "epoch": 0.18868500462228113, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 21635 + }, + { + "epoch": 0.18869372590744973, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 21636 + }, + { + "epoch": 0.1887024471926183, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 21637 + }, + { + "epoch": 0.18871116847778688, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 21638 + }, + { + "epoch": 0.18871988976295548, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 21639 + }, + { + "epoch": 0.18872861104812405, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 21640 + }, + { + "epoch": 0.18873733233329262, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 21641 + }, + { + "epoch": 0.18874605361846122, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 21642 + }, + { + "epoch": 0.1887547749036298, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 21643 + }, + { + "epoch": 0.18876349618879837, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 21644 + }, + { + "epoch": 0.18877221747396697, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 21645 + }, + { + "epoch": 0.18878093875913554, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 21646 + }, + { + "epoch": 0.18878966004430414, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 21647 + }, + { + "epoch": 0.18879838132947271, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 21648 + }, + { + "epoch": 0.1888071026146413, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 21649 + }, + { + "epoch": 0.1888158238998099, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 21650 + }, + { + "epoch": 0.18882454518497846, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 21651 + }, + { + "epoch": 0.18883326647014703, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 21652 + }, + { + "epoch": 0.18884198775531563, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 21653 + }, + { + "epoch": 0.1888507090404842, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 21654 + }, + { + "epoch": 0.18885943032565278, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 21655 + }, + { + "epoch": 0.18886815161082138, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 21656 + }, + { + "epoch": 0.18887687289598995, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 21657 + }, + { + "epoch": 0.18888559418115852, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 21658 + }, + { + "epoch": 0.18889431546632712, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 21659 + }, + { + "epoch": 0.1889030367514957, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 21660 + }, + { + "epoch": 0.1889117580366643, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 21661 + }, + { + "epoch": 0.18892047932183287, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 21662 + }, + { + "epoch": 0.18892920060700144, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 21663 + }, + { + "epoch": 0.18893792189217004, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 21664 + }, + { + "epoch": 0.1889466431773386, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 21665 + }, + { + "epoch": 0.18895536446250719, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 21666 + }, + { + "epoch": 0.1889640857476758, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 21667 + }, + { + "epoch": 0.18897280703284436, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 21668 + }, + { + "epoch": 0.18898152831801293, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 21669 + }, + { + "epoch": 0.18899024960318153, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 21670 + }, + { + "epoch": 0.1889989708883501, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 21671 + }, + { + "epoch": 0.18900769217351868, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 21672 + }, + { + "epoch": 0.18901641345868728, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 21673 + }, + { + "epoch": 0.18902513474385585, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 21674 + }, + { + "epoch": 0.18903385602902445, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 21675 + }, + { + "epoch": 0.18904257731419302, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 21676 + }, + { + "epoch": 0.1890512985993616, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 21677 + }, + { + "epoch": 0.1890600198845302, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 21678 + }, + { + "epoch": 0.18906874116969877, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 21679 + }, + { + "epoch": 0.18907746245486734, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9861, + "step": 21680 + }, + { + "epoch": 0.18908618374003594, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 21681 + }, + { + "epoch": 0.1890949050252045, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 21682 + }, + { + "epoch": 0.18910362631037309, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 21683 + }, + { + "epoch": 0.18911234759554169, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 21684 + }, + { + "epoch": 0.18912106888071026, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 21685 + }, + { + "epoch": 0.18912979016587883, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 21686 + }, + { + "epoch": 0.18913851145104743, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 21687 + }, + { + "epoch": 0.189147232736216, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 21688 + }, + { + "epoch": 0.1891559540213846, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 21689 + }, + { + "epoch": 0.18916467530655318, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 21690 + }, + { + "epoch": 0.18917339659172175, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 21691 + }, + { + "epoch": 0.18918211787689035, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 21692 + }, + { + "epoch": 0.18919083916205892, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 21693 + }, + { + "epoch": 0.1891995604472275, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 21694 + }, + { + "epoch": 0.1892082817323961, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 21695 + }, + { + "epoch": 0.18921700301756467, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 21696 + }, + { + "epoch": 0.18922572430273324, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 21697 + }, + { + "epoch": 0.18923444558790184, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 21698 + }, + { + "epoch": 0.1892431668730704, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 21699 + }, + { + "epoch": 0.18925188815823898, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 21700 + }, + { + "epoch": 0.18926060944340758, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 21701 + }, + { + "epoch": 0.18926933072857616, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 21702 + }, + { + "epoch": 0.18927805201374476, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 21703 + }, + { + "epoch": 0.18928677329891333, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 21704 + }, + { + "epoch": 0.1892954945840819, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 21705 + }, + { + "epoch": 0.1893042158692505, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 21706 + }, + { + "epoch": 0.18931293715441908, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 21707 + }, + { + "epoch": 0.18932165843958765, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 21708 + }, + { + "epoch": 0.18933037972475625, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 21709 + }, + { + "epoch": 0.18933910100992482, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 21710 + }, + { + "epoch": 0.1893478222950934, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 21711 + }, + { + "epoch": 0.189356543580262, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 21712 + }, + { + "epoch": 0.18936526486543057, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 21713 + }, + { + "epoch": 0.18937398615059914, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 21714 + }, + { + "epoch": 0.18938270743576774, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 21715 + }, + { + "epoch": 0.1893914287209363, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 21716 + }, + { + "epoch": 0.1894001500061049, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 21717 + }, + { + "epoch": 0.18940887129127348, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 21718 + }, + { + "epoch": 0.18941759257644206, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 21719 + }, + { + "epoch": 0.18942631386161066, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 21720 + }, + { + "epoch": 0.18943503514677923, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 21721 + }, + { + "epoch": 0.1894437564319478, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 21722 + }, + { + "epoch": 0.1894524777171164, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 21723 + }, + { + "epoch": 0.18946119900228497, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 21724 + }, + { + "epoch": 0.18946992028745355, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 21725 + }, + { + "epoch": 0.18947864157262215, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 21726 + }, + { + "epoch": 0.18948736285779072, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 21727 + }, + { + "epoch": 0.18949608414295932, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 21728 + }, + { + "epoch": 0.1895048054281279, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 21729 + }, + { + "epoch": 0.18951352671329647, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 21730 + }, + { + "epoch": 0.18952224799846507, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 21731 + }, + { + "epoch": 0.18953096928363364, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 21732 + }, + { + "epoch": 0.1895396905688022, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 21733 + }, + { + "epoch": 0.1895484118539708, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 21734 + }, + { + "epoch": 0.18955713313913938, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 21735 + }, + { + "epoch": 0.18956585442430796, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 21736 + }, + { + "epoch": 0.18957457570947656, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 21737 + }, + { + "epoch": 0.18958329699464513, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 21738 + }, + { + "epoch": 0.1895920182798137, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 21739 + }, + { + "epoch": 0.1896007395649823, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 21740 + }, + { + "epoch": 0.18960946085015087, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 21741 + }, + { + "epoch": 0.18961818213531947, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 21742 + }, + { + "epoch": 0.18962690342048805, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 21743 + }, + { + "epoch": 0.18963562470565662, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 21744 + }, + { + "epoch": 0.18964434599082522, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 21745 + }, + { + "epoch": 0.1896530672759938, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 21746 + }, + { + "epoch": 0.18966178856116236, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 21747 + }, + { + "epoch": 0.18967050984633096, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 21748 + }, + { + "epoch": 0.18967923113149954, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 21749 + }, + { + "epoch": 0.1896879524166681, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 21750 + }, + { + "epoch": 0.1896966737018367, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 21751 + }, + { + "epoch": 0.18970539498700528, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 21752 + }, + { + "epoch": 0.18971411627217385, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 21753 + }, + { + "epoch": 0.18972283755734246, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 21754 + }, + { + "epoch": 0.18973155884251103, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 21755 + }, + { + "epoch": 0.18974028012767963, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 21756 + }, + { + "epoch": 0.1897490014128482, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 21757 + }, + { + "epoch": 0.18975772269801677, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 21758 + }, + { + "epoch": 0.18976644398318537, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 21759 + }, + { + "epoch": 0.18977516526835395, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 21760 + }, + { + "epoch": 0.18978388655352252, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 21761 + }, + { + "epoch": 0.18979260783869112, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 21762 + }, + { + "epoch": 0.1898013291238597, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 21763 + }, + { + "epoch": 0.18981005040902826, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 21764 + }, + { + "epoch": 0.18981877169419686, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 21765 + }, + { + "epoch": 0.18982749297936544, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 21766 + }, + { + "epoch": 0.189836214264534, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 21767 + }, + { + "epoch": 0.1898449355497026, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 21768 + }, + { + "epoch": 0.18985365683487118, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 21769 + }, + { + "epoch": 0.18986237812003978, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 21770 + }, + { + "epoch": 0.18987109940520835, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 21771 + }, + { + "epoch": 0.18987982069037693, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 21772 + }, + { + "epoch": 0.18988854197554553, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 21773 + }, + { + "epoch": 0.1898972632607141, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 21774 + }, + { + "epoch": 0.18990598454588267, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 21775 + }, + { + "epoch": 0.18991470583105127, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 21776 + }, + { + "epoch": 0.18992342711621985, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 21777 + }, + { + "epoch": 0.18993214840138842, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 21778 + }, + { + "epoch": 0.18994086968655702, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 21779 + }, + { + "epoch": 0.1899495909717256, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 21780 + }, + { + "epoch": 0.18995831225689416, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 21781 + }, + { + "epoch": 0.18996703354206276, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 21782 + }, + { + "epoch": 0.18997575482723134, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 21783 + }, + { + "epoch": 0.18998447611239994, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 21784 + }, + { + "epoch": 0.1899931973975685, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 21785 + }, + { + "epoch": 0.19000191868273708, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 21786 + }, + { + "epoch": 0.19001063996790568, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 21787 + }, + { + "epoch": 0.19001936125307425, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 21788 + }, + { + "epoch": 0.19002808253824283, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 21789 + }, + { + "epoch": 0.19003680382341143, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 21790 + }, + { + "epoch": 0.19004552510858, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 21791 + }, + { + "epoch": 0.19005424639374857, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 21792 + }, + { + "epoch": 0.19006296767891717, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 21793 + }, + { + "epoch": 0.19007168896408574, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 21794 + }, + { + "epoch": 0.19008041024925432, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 21795 + }, + { + "epoch": 0.19008913153442292, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 21796 + }, + { + "epoch": 0.1900978528195915, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 21797 + }, + { + "epoch": 0.1901065741047601, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 21798 + }, + { + "epoch": 0.19011529538992866, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 21799 + }, + { + "epoch": 0.19012401667509723, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 21800 + }, + { + "epoch": 0.19013273796026584, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 21801 + }, + { + "epoch": 0.1901414592454344, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 21802 + }, + { + "epoch": 0.19015018053060298, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 21803 + }, + { + "epoch": 0.19015890181577158, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 21804 + }, + { + "epoch": 0.19016762310094015, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 21805 + }, + { + "epoch": 0.19017634438610873, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 21806 + }, + { + "epoch": 0.19018506567127733, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 21807 + }, + { + "epoch": 0.1901937869564459, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 21808 + }, + { + "epoch": 0.19020250824161447, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 21809 + }, + { + "epoch": 0.19021122952678307, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 21810 + }, + { + "epoch": 0.19021995081195164, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 21811 + }, + { + "epoch": 0.19022867209712024, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 21812 + }, + { + "epoch": 0.19023739338228882, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 21813 + }, + { + "epoch": 0.1902461146674574, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 21814 + }, + { + "epoch": 0.190254835952626, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 21815 + }, + { + "epoch": 0.19026355723779456, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 21816 + }, + { + "epoch": 0.19027227852296313, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 21817 + }, + { + "epoch": 0.19028099980813173, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 21818 + }, + { + "epoch": 0.1902897210933003, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 21819 + }, + { + "epoch": 0.19029844237846888, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 21820 + }, + { + "epoch": 0.19030716366363748, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 21821 + }, + { + "epoch": 0.19031588494880605, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 21822 + }, + { + "epoch": 0.19032460623397462, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 21823 + }, + { + "epoch": 0.19033332751914323, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 21824 + }, + { + "epoch": 0.1903420488043118, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 21825 + }, + { + "epoch": 0.1903507700894804, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 21826 + }, + { + "epoch": 0.19035949137464897, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 21827 + }, + { + "epoch": 0.19036821265981754, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 21828 + }, + { + "epoch": 0.19037693394498614, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 21829 + }, + { + "epoch": 0.19038565523015472, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 21830 + }, + { + "epoch": 0.1903943765153233, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 21831 + }, + { + "epoch": 0.1904030978004919, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 21832 + }, + { + "epoch": 0.19041181908566046, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 21833 + }, + { + "epoch": 0.19042054037082903, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 21834 + }, + { + "epoch": 0.19042926165599763, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 21835 + }, + { + "epoch": 0.1904379829411662, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 21836 + }, + { + "epoch": 0.1904467042263348, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 21837 + }, + { + "epoch": 0.19045542551150338, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 21838 + }, + { + "epoch": 0.19046414679667195, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 21839 + }, + { + "epoch": 0.19047286808184055, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 21840 + }, + { + "epoch": 0.19048158936700912, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 21841 + }, + { + "epoch": 0.1904903106521777, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 21842 + }, + { + "epoch": 0.1904990319373463, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 21843 + }, + { + "epoch": 0.19050775322251487, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 21844 + }, + { + "epoch": 0.19051647450768344, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 21845 + }, + { + "epoch": 0.19052519579285204, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 21846 + }, + { + "epoch": 0.19053391707802061, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 21847 + }, + { + "epoch": 0.1905426383631892, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 21848 + }, + { + "epoch": 0.1905513596483578, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 21849 + }, + { + "epoch": 0.19056008093352636, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 21850 + }, + { + "epoch": 0.19056880221869496, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 21851 + }, + { + "epoch": 0.19057752350386353, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 21852 + }, + { + "epoch": 0.1905862447890321, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 21853 + }, + { + "epoch": 0.1905949660742007, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 21854 + }, + { + "epoch": 0.19060368735936928, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 21855 + }, + { + "epoch": 0.19061240864453785, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 21856 + }, + { + "epoch": 0.19062112992970645, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 21857 + }, + { + "epoch": 0.19062985121487502, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 21858 + }, + { + "epoch": 0.1906385725000436, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 21859 + }, + { + "epoch": 0.1906472937852122, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 21860 + }, + { + "epoch": 0.19065601507038077, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 21861 + }, + { + "epoch": 0.19066473635554934, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 21862 + }, + { + "epoch": 0.19067345764071794, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 21863 + }, + { + "epoch": 0.19068217892588651, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 21864 + }, + { + "epoch": 0.19069090021105511, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 21865 + }, + { + "epoch": 0.1906996214962237, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 21866 + }, + { + "epoch": 0.19070834278139226, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 21867 + }, + { + "epoch": 0.19071706406656086, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 21868 + }, + { + "epoch": 0.19072578535172943, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 21869 + }, + { + "epoch": 0.190734506636898, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 21870 + }, + { + "epoch": 0.1907432279220666, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 21871 + }, + { + "epoch": 0.19075194920723518, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 21872 + }, + { + "epoch": 0.19076067049240375, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 21873 + }, + { + "epoch": 0.19076939177757235, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 21874 + }, + { + "epoch": 0.19077811306274092, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 21875 + }, + { + "epoch": 0.1907868343479095, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 21876 + }, + { + "epoch": 0.1907955556330781, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 21877 + }, + { + "epoch": 0.19080427691824667, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 21878 + }, + { + "epoch": 0.19081299820341527, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 21879 + }, + { + "epoch": 0.19082171948858384, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 21880 + }, + { + "epoch": 0.1908304407737524, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 21881 + }, + { + "epoch": 0.190839162058921, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 21882 + }, + { + "epoch": 0.19084788334408959, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 21883 + }, + { + "epoch": 0.19085660462925816, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 21884 + }, + { + "epoch": 0.19086532591442676, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 21885 + }, + { + "epoch": 0.19087404719959533, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 21886 + }, + { + "epoch": 0.1908827684847639, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 21887 + }, + { + "epoch": 0.1908914897699325, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 21888 + }, + { + "epoch": 0.19090021105510108, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 21889 + }, + { + "epoch": 0.19090893234026965, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 21890 + }, + { + "epoch": 0.19091765362543825, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 21891 + }, + { + "epoch": 0.19092637491060682, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 21892 + }, + { + "epoch": 0.19093509619577542, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 21893 + }, + { + "epoch": 0.190943817480944, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 21894 + }, + { + "epoch": 0.19095253876611257, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 21895 + }, + { + "epoch": 0.19096126005128117, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 21896 + }, + { + "epoch": 0.19096998133644974, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 21897 + }, + { + "epoch": 0.1909787026216183, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 21898 + }, + { + "epoch": 0.1909874239067869, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 21899 + }, + { + "epoch": 0.19099614519195549, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 21900 + }, + { + "epoch": 0.19100486647712406, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 21901 + }, + { + "epoch": 0.19101358776229266, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 21902 + }, + { + "epoch": 0.19102230904746123, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 21903 + }, + { + "epoch": 0.1910310303326298, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 21904 + }, + { + "epoch": 0.1910397516177984, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 21905 + }, + { + "epoch": 0.19104847290296698, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 21906 + }, + { + "epoch": 0.19105719418813558, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 21907 + }, + { + "epoch": 0.19106591547330415, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 21908 + }, + { + "epoch": 0.19107463675847272, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 21909 + }, + { + "epoch": 0.19108335804364132, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 21910 + }, + { + "epoch": 0.1910920793288099, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 21911 + }, + { + "epoch": 0.19110080061397847, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 21912 + }, + { + "epoch": 0.19110952189914707, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 21913 + }, + { + "epoch": 0.19111824318431564, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 21914 + }, + { + "epoch": 0.1911269644694842, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 21915 + }, + { + "epoch": 0.1911356857546528, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 21916 + }, + { + "epoch": 0.19114440703982138, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 21917 + }, + { + "epoch": 0.19115312832498996, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 21918 + }, + { + "epoch": 0.19116184961015856, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 21919 + }, + { + "epoch": 0.19117057089532713, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 21920 + }, + { + "epoch": 0.19117929218049573, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 21921 + }, + { + "epoch": 0.1911880134656643, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 21922 + }, + { + "epoch": 0.19119673475083288, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 21923 + }, + { + "epoch": 0.19120545603600148, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 21924 + }, + { + "epoch": 0.19121417732117005, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 21925 + }, + { + "epoch": 0.19122289860633862, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 21926 + }, + { + "epoch": 0.19123161989150722, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 21927 + }, + { + "epoch": 0.1912403411766758, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 21928 + }, + { + "epoch": 0.19124906246184437, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 21929 + }, + { + "epoch": 0.19125778374701297, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 21930 + }, + { + "epoch": 0.19126650503218154, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 21931 + }, + { + "epoch": 0.1912752263173501, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 21932 + }, + { + "epoch": 0.1912839476025187, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 21933 + }, + { + "epoch": 0.19129266888768728, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 21934 + }, + { + "epoch": 0.19130139017285588, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 21935 + }, + { + "epoch": 0.19131011145802446, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 21936 + }, + { + "epoch": 0.19131883274319303, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 21937 + }, + { + "epoch": 0.19132755402836163, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 21938 + }, + { + "epoch": 0.1913362753135302, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 21939 + }, + { + "epoch": 0.19134499659869877, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 21940 + }, + { + "epoch": 0.19135371788386737, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 21941 + }, + { + "epoch": 0.19136243916903595, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 21942 + }, + { + "epoch": 0.19137116045420452, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 21943 + }, + { + "epoch": 0.19137988173937312, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 21944 + }, + { + "epoch": 0.1913886030245417, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 21945 + }, + { + "epoch": 0.1913973243097103, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 21946 + }, + { + "epoch": 0.19140604559487887, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 21947 + }, + { + "epoch": 0.19141476688004744, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 21948 + }, + { + "epoch": 0.19142348816521604, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 21949 + }, + { + "epoch": 0.1914322094503846, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 21950 + }, + { + "epoch": 0.19144093073555318, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 21951 + }, + { + "epoch": 0.19144965202072178, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 21952 + }, + { + "epoch": 0.19145837330589036, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 21953 + }, + { + "epoch": 0.19146709459105893, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 21954 + }, + { + "epoch": 0.19147581587622753, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 21955 + }, + { + "epoch": 0.1914845371613961, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 21956 + }, + { + "epoch": 0.19149325844656467, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 21957 + }, + { + "epoch": 0.19150197973173327, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 21958 + }, + { + "epoch": 0.19151070101690185, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 21959 + }, + { + "epoch": 0.19151942230207045, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 21960 + }, + { + "epoch": 0.19152814358723902, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 21961 + }, + { + "epoch": 0.1915368648724076, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 21962 + }, + { + "epoch": 0.1915455861575762, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 21963 + }, + { + "epoch": 0.19155430744274476, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 21964 + }, + { + "epoch": 0.19156302872791334, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 21965 + }, + { + "epoch": 0.19157175001308194, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 21966 + }, + { + "epoch": 0.1915804712982505, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 21967 + }, + { + "epoch": 0.19158919258341908, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 21968 + }, + { + "epoch": 0.19159791386858768, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 21969 + }, + { + "epoch": 0.19160663515375626, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 21970 + }, + { + "epoch": 0.19161535643892483, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 21971 + }, + { + "epoch": 0.19162407772409343, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 21972 + }, + { + "epoch": 0.191632799009262, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 21973 + }, + { + "epoch": 0.1916415202944306, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 21974 + }, + { + "epoch": 0.19165024157959917, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 21975 + }, + { + "epoch": 0.19165896286476775, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 21976 + }, + { + "epoch": 0.19166768414993635, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 21977 + }, + { + "epoch": 0.19167640543510492, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 21978 + }, + { + "epoch": 0.1916851267202735, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 21979 + }, + { + "epoch": 0.1916938480054421, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 21980 + }, + { + "epoch": 0.19170256929061066, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 21981 + }, + { + "epoch": 0.19171129057577924, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 21982 + }, + { + "epoch": 0.19172001186094784, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 21983 + }, + { + "epoch": 0.1917287331461164, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 21984 + }, + { + "epoch": 0.19173745443128498, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 21985 + }, + { + "epoch": 0.19174617571645358, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 21986 + }, + { + "epoch": 0.19175489700162215, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 21987 + }, + { + "epoch": 0.19176361828679075, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 21988 + }, + { + "epoch": 0.19177233957195933, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 21989 + }, + { + "epoch": 0.1917810608571279, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 21990 + }, + { + "epoch": 0.1917897821422965, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 21991 + }, + { + "epoch": 0.19179850342746507, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 21992 + }, + { + "epoch": 0.19180722471263364, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 21993 + }, + { + "epoch": 0.19181594599780225, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 21994 + }, + { + "epoch": 0.19182466728297082, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 21995 + }, + { + "epoch": 0.1918333885681394, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 21996 + }, + { + "epoch": 0.191842109853308, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 21997 + }, + { + "epoch": 0.19185083113847656, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 21998 + }, + { + "epoch": 0.19185955242364514, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 21999 + }, + { + "epoch": 0.19186827370881374, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 22000 + }, + { + "epoch": 0.1918769949939823, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 22001 + }, + { + "epoch": 0.1918857162791509, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 22002 + }, + { + "epoch": 0.19189443756431948, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 22003 + }, + { + "epoch": 0.19190315884948805, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 22004 + }, + { + "epoch": 0.19191188013465665, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 22005 + }, + { + "epoch": 0.19192060141982523, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 22006 + }, + { + "epoch": 0.1919293227049938, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 22007 + }, + { + "epoch": 0.1919380439901624, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 22008 + }, + { + "epoch": 0.19194676527533097, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 22009 + }, + { + "epoch": 0.19195548656049954, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 22010 + }, + { + "epoch": 0.19196420784566814, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 22011 + }, + { + "epoch": 0.19197292913083672, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 22012 + }, + { + "epoch": 0.1919816504160053, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 22013 + }, + { + "epoch": 0.1919903717011739, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 22014 + }, + { + "epoch": 0.19199909298634246, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 22015 + }, + { + "epoch": 0.19200781427151106, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 22016 + }, + { + "epoch": 0.19201653555667964, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 22017 + }, + { + "epoch": 0.1920252568418482, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 22018 + }, + { + "epoch": 0.1920339781270168, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 22019 + }, + { + "epoch": 0.19204269941218538, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 22020 + }, + { + "epoch": 0.19205142069735395, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 22021 + }, + { + "epoch": 0.19206014198252255, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 22022 + }, + { + "epoch": 0.19206886326769113, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 22023 + }, + { + "epoch": 0.1920775845528597, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 22024 + }, + { + "epoch": 0.1920863058380283, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 22025 + }, + { + "epoch": 0.19209502712319687, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 22026 + }, + { + "epoch": 0.19210374840836544, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 22027 + }, + { + "epoch": 0.19211246969353404, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 22028 + }, + { + "epoch": 0.19212119097870262, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 22029 + }, + { + "epoch": 0.19212991226387122, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 22030 + }, + { + "epoch": 0.1921386335490398, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 22031 + }, + { + "epoch": 0.19214735483420836, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 22032 + }, + { + "epoch": 0.19215607611937696, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 22033 + }, + { + "epoch": 0.19216479740454553, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 22034 + }, + { + "epoch": 0.1921735186897141, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 22035 + }, + { + "epoch": 0.1921822399748827, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 22036 + }, + { + "epoch": 0.19219096126005128, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 22037 + }, + { + "epoch": 0.19219968254521985, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 22038 + }, + { + "epoch": 0.19220840383038845, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 22039 + }, + { + "epoch": 0.19221712511555702, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 22040 + }, + { + "epoch": 0.1922258464007256, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 22041 + }, + { + "epoch": 0.1922345676858942, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 22042 + }, + { + "epoch": 0.19224328897106277, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 22043 + }, + { + "epoch": 0.19225201025623137, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 22044 + }, + { + "epoch": 0.19226073154139994, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 22045 + }, + { + "epoch": 0.19226945282656852, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 22046 + }, + { + "epoch": 0.19227817411173712, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 22047 + }, + { + "epoch": 0.1922868953969057, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 22048 + }, + { + "epoch": 0.19229561668207426, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 22049 + }, + { + "epoch": 0.19230433796724286, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 22050 + }, + { + "epoch": 0.19231305925241143, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 22051 + }, + { + "epoch": 0.19232178053758, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 22052 + }, + { + "epoch": 0.1923305018227486, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 22053 + }, + { + "epoch": 0.19233922310791718, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 22054 + }, + { + "epoch": 0.19234794439308575, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 22055 + }, + { + "epoch": 0.19235666567825435, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 22056 + }, + { + "epoch": 0.19236538696342292, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 22057 + }, + { + "epoch": 0.19237410824859152, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 22058 + }, + { + "epoch": 0.1923828295337601, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 22059 + }, + { + "epoch": 0.19239155081892867, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 22060 + }, + { + "epoch": 0.19240027210409727, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 22061 + }, + { + "epoch": 0.19240899338926584, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 22062 + }, + { + "epoch": 0.19241771467443441, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 22063 + }, + { + "epoch": 0.19242643595960301, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 22064 + }, + { + "epoch": 0.1924351572447716, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 22065 + }, + { + "epoch": 0.19244387852994016, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 22066 + }, + { + "epoch": 0.19245259981510876, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 22067 + }, + { + "epoch": 0.19246132110027733, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 22068 + }, + { + "epoch": 0.19247004238544593, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 22069 + }, + { + "epoch": 0.1924787636706145, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 22070 + }, + { + "epoch": 0.19248748495578308, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 22071 + }, + { + "epoch": 0.19249620624095168, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 22072 + }, + { + "epoch": 0.19250492752612025, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 22073 + }, + { + "epoch": 0.19251364881128882, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 22074 + }, + { + "epoch": 0.19252237009645742, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 22075 + }, + { + "epoch": 0.192531091381626, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 22076 + }, + { + "epoch": 0.19253981266679457, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 22077 + }, + { + "epoch": 0.19254853395196317, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 22078 + }, + { + "epoch": 0.19255725523713174, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 22079 + }, + { + "epoch": 0.19256597652230031, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 22080 + }, + { + "epoch": 0.19257469780746891, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 22081 + }, + { + "epoch": 0.1925834190926375, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 22082 + }, + { + "epoch": 0.1925921403778061, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 22083 + }, + { + "epoch": 0.19260086166297466, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 22084 + }, + { + "epoch": 0.19260958294814323, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 22085 + }, + { + "epoch": 0.19261830423331183, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 22086 + }, + { + "epoch": 0.1926270255184804, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 22087 + }, + { + "epoch": 0.19263574680364898, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 22088 + }, + { + "epoch": 0.19264446808881758, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 22089 + }, + { + "epoch": 0.19265318937398615, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 22090 + }, + { + "epoch": 0.19266191065915472, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 22091 + }, + { + "epoch": 0.19267063194432332, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 22092 + }, + { + "epoch": 0.1926793532294919, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 22093 + }, + { + "epoch": 0.19268807451466047, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 22094 + }, + { + "epoch": 0.19269679579982907, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 22095 + }, + { + "epoch": 0.19270551708499764, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 22096 + }, + { + "epoch": 0.19271423837016624, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 22097 + }, + { + "epoch": 0.1927229596553348, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 22098 + }, + { + "epoch": 0.19273168094050339, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 22099 + }, + { + "epoch": 0.192740402225672, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 22100 + }, + { + "epoch": 0.19274912351084056, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 22101 + }, + { + "epoch": 0.19275784479600913, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 22102 + }, + { + "epoch": 0.19276656608117773, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 22103 + }, + { + "epoch": 0.1927752873663463, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 22104 + }, + { + "epoch": 0.19278400865151488, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 22105 + }, + { + "epoch": 0.19279272993668348, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 22106 + }, + { + "epoch": 0.19280145122185205, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 22107 + }, + { + "epoch": 0.19281017250702062, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 22108 + }, + { + "epoch": 0.19281889379218922, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 22109 + }, + { + "epoch": 0.1928276150773578, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 22110 + }, + { + "epoch": 0.1928363363625264, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 22111 + }, + { + "epoch": 0.19284505764769497, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 22112 + }, + { + "epoch": 0.19285377893286354, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 22113 + }, + { + "epoch": 0.19286250021803214, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 22114 + }, + { + "epoch": 0.1928712215032007, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 22115 + }, + { + "epoch": 0.19287994278836929, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 22116 + }, + { + "epoch": 0.19288866407353789, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 22117 + }, + { + "epoch": 0.19289738535870646, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 22118 + }, + { + "epoch": 0.19290610664387503, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 22119 + }, + { + "epoch": 0.19291482792904363, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 22120 + }, + { + "epoch": 0.1929235492142122, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 22121 + }, + { + "epoch": 0.19293227049938078, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 22122 + }, + { + "epoch": 0.19294099178454938, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 22123 + }, + { + "epoch": 0.19294971306971795, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 22124 + }, + { + "epoch": 0.19295843435488655, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 22125 + }, + { + "epoch": 0.19296715564005512, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 22126 + }, + { + "epoch": 0.1929758769252237, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 22127 + }, + { + "epoch": 0.1929845982103923, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 22128 + }, + { + "epoch": 0.19299331949556087, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 22129 + }, + { + "epoch": 0.19300204078072944, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 22130 + }, + { + "epoch": 0.19301076206589804, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 22131 + }, + { + "epoch": 0.1930194833510666, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 22132 + }, + { + "epoch": 0.19302820463623518, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 22133 + }, + { + "epoch": 0.19303692592140378, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 22134 + }, + { + "epoch": 0.19304564720657236, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 22135 + }, + { + "epoch": 0.19305436849174093, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 22136 + }, + { + "epoch": 0.19306308977690953, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 22137 + }, + { + "epoch": 0.1930718110620781, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 22138 + }, + { + "epoch": 0.1930805323472467, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 22139 + }, + { + "epoch": 0.19308925363241528, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 22140 + }, + { + "epoch": 0.19309797491758385, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 22141 + }, + { + "epoch": 0.19310669620275245, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 22142 + }, + { + "epoch": 0.19311541748792102, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 22143 + }, + { + "epoch": 0.1931241387730896, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 22144 + }, + { + "epoch": 0.1931328600582582, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 22145 + }, + { + "epoch": 0.19314158134342677, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 22146 + }, + { + "epoch": 0.19315030262859534, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 22147 + }, + { + "epoch": 0.19315902391376394, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 22148 + }, + { + "epoch": 0.1931677451989325, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 22149 + }, + { + "epoch": 0.19317646648410108, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 22150 + }, + { + "epoch": 0.19318518776926968, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 22151 + }, + { + "epoch": 0.19319390905443826, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 22152 + }, + { + "epoch": 0.19320263033960686, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 22153 + }, + { + "epoch": 0.19321135162477543, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 22154 + }, + { + "epoch": 0.193220072909944, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 22155 + }, + { + "epoch": 0.1932287941951126, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 22156 + }, + { + "epoch": 0.19323751548028117, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 22157 + }, + { + "epoch": 0.19324623676544975, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 22158 + }, + { + "epoch": 0.19325495805061835, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 22159 + }, + { + "epoch": 0.19326367933578692, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 22160 + }, + { + "epoch": 0.1932724006209555, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 22161 + }, + { + "epoch": 0.1932811219061241, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 22162 + }, + { + "epoch": 0.19328984319129267, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 22163 + }, + { + "epoch": 0.19329856447646124, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 22164 + }, + { + "epoch": 0.19330728576162984, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 22165 + }, + { + "epoch": 0.1933160070467984, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 22166 + }, + { + "epoch": 0.193324728331967, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 22167 + }, + { + "epoch": 0.19333344961713558, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 22168 + }, + { + "epoch": 0.19334217090230416, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 22169 + }, + { + "epoch": 0.19335089218747276, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 22170 + }, + { + "epoch": 0.19335961347264133, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 22171 + }, + { + "epoch": 0.1933683347578099, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 22172 + }, + { + "epoch": 0.1933770560429785, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 22173 + }, + { + "epoch": 0.19338577732814707, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 22174 + }, + { + "epoch": 0.19339449861331565, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 22175 + }, + { + "epoch": 0.19340321989848425, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 22176 + }, + { + "epoch": 0.19341194118365282, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 22177 + }, + { + "epoch": 0.19342066246882142, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 22178 + }, + { + "epoch": 0.19342938375399, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 22179 + }, + { + "epoch": 0.19343810503915856, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 22180 + }, + { + "epoch": 0.19344682632432716, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 22181 + }, + { + "epoch": 0.19345554760949574, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 22182 + }, + { + "epoch": 0.1934642688946643, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 22183 + }, + { + "epoch": 0.1934729901798329, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 22184 + }, + { + "epoch": 0.19348171146500148, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 22185 + }, + { + "epoch": 0.19349043275017005, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 22186 + }, + { + "epoch": 0.19349915403533866, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 22187 + }, + { + "epoch": 0.19350787532050723, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 22188 + }, + { + "epoch": 0.1935165966056758, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 22189 + }, + { + "epoch": 0.1935253178908444, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 22190 + }, + { + "epoch": 0.19353403917601297, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 22191 + }, + { + "epoch": 0.19354276046118157, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 22192 + }, + { + "epoch": 0.19355148174635015, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 22193 + }, + { + "epoch": 0.19356020303151872, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 22194 + }, + { + "epoch": 0.19356892431668732, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 22195 + }, + { + "epoch": 0.1935776456018559, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 22196 + }, + { + "epoch": 0.19358636688702446, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 22197 + }, + { + "epoch": 0.19359508817219306, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 22198 + }, + { + "epoch": 0.19360380945736164, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 22199 + }, + { + "epoch": 0.1936125307425302, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 22200 + }, + { + "epoch": 0.1936212520276988, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 22201 + }, + { + "epoch": 0.19362997331286738, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 22202 + }, + { + "epoch": 0.19363869459803595, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 22203 + }, + { + "epoch": 0.19364741588320455, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 22204 + }, + { + "epoch": 0.19365613716837313, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 22205 + }, + { + "epoch": 0.19366485845354173, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 22206 + }, + { + "epoch": 0.1936735797387103, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 22207 + }, + { + "epoch": 0.19368230102387887, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 22208 + }, + { + "epoch": 0.19369102230904747, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 22209 + }, + { + "epoch": 0.19369974359421605, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 22210 + }, + { + "epoch": 0.19370846487938462, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 22211 + }, + { + "epoch": 0.19371718616455322, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 22212 + }, + { + "epoch": 0.1937259074497218, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 22213 + }, + { + "epoch": 0.19373462873489036, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 22214 + }, + { + "epoch": 0.19374335002005896, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 22215 + }, + { + "epoch": 0.19375207130522754, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 22216 + }, + { + "epoch": 0.1937607925903961, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 22217 + }, + { + "epoch": 0.1937695138755647, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 22218 + }, + { + "epoch": 0.19377823516073328, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 22219 + }, + { + "epoch": 0.19378695644590188, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 22220 + }, + { + "epoch": 0.19379567773107045, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 22221 + }, + { + "epoch": 0.19380439901623903, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 22222 + }, + { + "epoch": 0.19381312030140763, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 22223 + }, + { + "epoch": 0.1938218415865762, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 22224 + }, + { + "epoch": 0.19383056287174477, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 22225 + }, + { + "epoch": 0.19383928415691337, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 22226 + }, + { + "epoch": 0.19384800544208194, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 22227 + }, + { + "epoch": 0.19385672672725052, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 22228 + }, + { + "epoch": 0.19386544801241912, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 22229 + }, + { + "epoch": 0.1938741692975877, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 22230 + }, + { + "epoch": 0.19388289058275626, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 22231 + }, + { + "epoch": 0.19389161186792486, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 22232 + }, + { + "epoch": 0.19390033315309343, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 22233 + }, + { + "epoch": 0.19390905443826204, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 22234 + }, + { + "epoch": 0.1939177757234306, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 22235 + }, + { + "epoch": 0.19392649700859918, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 22236 + }, + { + "epoch": 0.19393521829376778, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 22237 + }, + { + "epoch": 0.19394393957893635, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 22238 + }, + { + "epoch": 0.19395266086410493, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 22239 + }, + { + "epoch": 0.19396138214927353, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 22240 + }, + { + "epoch": 0.1939701034344421, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 22241 + }, + { + "epoch": 0.19397882471961067, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 22242 + }, + { + "epoch": 0.19398754600477927, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 22243 + }, + { + "epoch": 0.19399626728994784, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 22244 + }, + { + "epoch": 0.19400498857511642, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 22245 + }, + { + "epoch": 0.19401370986028502, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 22246 + }, + { + "epoch": 0.1940224311454536, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 22247 + }, + { + "epoch": 0.1940311524306222, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 22248 + }, + { + "epoch": 0.19403987371579076, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 22249 + }, + { + "epoch": 0.19404859500095933, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 22250 + }, + { + "epoch": 0.19405731628612793, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 22251 + }, + { + "epoch": 0.1940660375712965, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 22252 + }, + { + "epoch": 0.19407475885646508, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 22253 + }, + { + "epoch": 0.19408348014163368, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 22254 + }, + { + "epoch": 0.19409220142680225, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 22255 + }, + { + "epoch": 0.19410092271197082, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 22256 + }, + { + "epoch": 0.19410964399713942, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 22257 + }, + { + "epoch": 0.194118365282308, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 22258 + }, + { + "epoch": 0.19412708656747657, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 22259 + }, + { + "epoch": 0.19413580785264517, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 22260 + }, + { + "epoch": 0.19414452913781374, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 22261 + }, + { + "epoch": 0.19415325042298234, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 22262 + }, + { + "epoch": 0.19416197170815092, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 22263 + }, + { + "epoch": 0.1941706929933195, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 22264 + }, + { + "epoch": 0.1941794142784881, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 22265 + }, + { + "epoch": 0.19418813556365666, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 22266 + }, + { + "epoch": 0.19419685684882523, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 22267 + }, + { + "epoch": 0.19420557813399383, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 22268 + }, + { + "epoch": 0.1942142994191624, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 22269 + }, + { + "epoch": 0.19422302070433098, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 22270 + }, + { + "epoch": 0.19423174198949958, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 22271 + }, + { + "epoch": 0.19424046327466815, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 22272 + }, + { + "epoch": 0.19424918455983672, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 22273 + }, + { + "epoch": 0.19425790584500532, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 22274 + }, + { + "epoch": 0.1942666271301739, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 22275 + }, + { + "epoch": 0.1942753484153425, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 22276 + }, + { + "epoch": 0.19428406970051107, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 22277 + }, + { + "epoch": 0.19429279098567964, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 22278 + }, + { + "epoch": 0.19430151227084824, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 22279 + }, + { + "epoch": 0.19431023355601681, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 22280 + }, + { + "epoch": 0.1943189548411854, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 22281 + }, + { + "epoch": 0.194327676126354, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 22282 + }, + { + "epoch": 0.19433639741152256, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 22283 + }, + { + "epoch": 0.19434511869669113, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 22284 + }, + { + "epoch": 0.19435383998185973, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 22285 + }, + { + "epoch": 0.1943625612670283, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 22286 + }, + { + "epoch": 0.19437128255219688, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 22287 + }, + { + "epoch": 0.19438000383736548, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 22288 + }, + { + "epoch": 0.19438872512253405, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 22289 + }, + { + "epoch": 0.19439744640770265, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 22290 + }, + { + "epoch": 0.19440616769287122, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 22291 + }, + { + "epoch": 0.1944148889780398, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 22292 + }, + { + "epoch": 0.1944236102632084, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 22293 + }, + { + "epoch": 0.19443233154837697, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 22294 + }, + { + "epoch": 0.19444105283354554, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 22295 + }, + { + "epoch": 0.19444977411871414, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 22296 + }, + { + "epoch": 0.19445849540388271, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 22297 + }, + { + "epoch": 0.1944672166890513, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 22298 + }, + { + "epoch": 0.1944759379742199, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 22299 + }, + { + "epoch": 0.19448465925938846, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 22300 + }, + { + "epoch": 0.19449338054455706, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 22301 + }, + { + "epoch": 0.19450210182972563, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 22302 + }, + { + "epoch": 0.1945108231148942, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 22303 + }, + { + "epoch": 0.1945195444000628, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 22304 + }, + { + "epoch": 0.19452826568523138, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 22305 + }, + { + "epoch": 0.19453698697039995, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 22306 + }, + { + "epoch": 0.19454570825556855, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 22307 + }, + { + "epoch": 0.19455442954073712, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 22308 + }, + { + "epoch": 0.1945631508259057, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 22309 + }, + { + "epoch": 0.1945718721110743, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 22310 + }, + { + "epoch": 0.19458059339624287, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 22311 + }, + { + "epoch": 0.19458931468141144, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 22312 + }, + { + "epoch": 0.19459803596658004, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 22313 + }, + { + "epoch": 0.1946067572517486, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 22314 + }, + { + "epoch": 0.1946154785369172, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 22315 + }, + { + "epoch": 0.19462419982208579, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 22316 + }, + { + "epoch": 0.19463292110725436, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 22317 + }, + { + "epoch": 0.19464164239242296, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 22318 + }, + { + "epoch": 0.19465036367759153, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 22319 + }, + { + "epoch": 0.1946590849627601, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 22320 + }, + { + "epoch": 0.1946678062479287, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 22321 + }, + { + "epoch": 0.19467652753309728, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 22322 + }, + { + "epoch": 0.19468524881826585, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 22323 + }, + { + "epoch": 0.19469397010343445, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 22324 + }, + { + "epoch": 0.19470269138860302, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 22325 + }, + { + "epoch": 0.1947114126737716, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 22326 + }, + { + "epoch": 0.1947201339589402, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 22327 + }, + { + "epoch": 0.19472885524410877, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 22328 + }, + { + "epoch": 0.19473757652927737, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 22329 + }, + { + "epoch": 0.19474629781444594, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 22330 + }, + { + "epoch": 0.1947550190996145, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 22331 + }, + { + "epoch": 0.1947637403847831, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 22332 + }, + { + "epoch": 0.19477246166995169, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 22333 + }, + { + "epoch": 0.19478118295512026, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 22334 + }, + { + "epoch": 0.19478990424028886, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 22335 + }, + { + "epoch": 0.19479862552545743, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 22336 + }, + { + "epoch": 0.194807346810626, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 22337 + }, + { + "epoch": 0.1948160680957946, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 22338 + }, + { + "epoch": 0.19482478938096318, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 22339 + }, + { + "epoch": 0.19483351066613175, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 22340 + }, + { + "epoch": 0.19484223195130035, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 22341 + }, + { + "epoch": 0.19485095323646892, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 22342 + }, + { + "epoch": 0.19485967452163752, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 22343 + }, + { + "epoch": 0.1948683958068061, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 22344 + }, + { + "epoch": 0.19487711709197467, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 22345 + }, + { + "epoch": 0.19488583837714327, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 22346 + }, + { + "epoch": 0.19489455966231184, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 22347 + }, + { + "epoch": 0.1949032809474804, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 22348 + }, + { + "epoch": 0.194912002232649, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 22349 + }, + { + "epoch": 0.19492072351781758, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 22350 + }, + { + "epoch": 0.19492944480298616, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 22351 + }, + { + "epoch": 0.19493816608815476, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 22352 + }, + { + "epoch": 0.19494688737332333, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 22353 + }, + { + "epoch": 0.1949556086584919, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 22354 + }, + { + "epoch": 0.1949643299436605, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 22355 + }, + { + "epoch": 0.19497305122882908, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 22356 + }, + { + "epoch": 0.19498177251399768, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 22357 + }, + { + "epoch": 0.19499049379916625, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 22358 + }, + { + "epoch": 0.19499921508433482, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 22359 + }, + { + "epoch": 0.19500793636950342, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 22360 + }, + { + "epoch": 0.195016657654672, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 22361 + }, + { + "epoch": 0.19502537893984057, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 22362 + }, + { + "epoch": 0.19503410022500917, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 22363 + }, + { + "epoch": 0.19504282151017774, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 22364 + }, + { + "epoch": 0.1950515427953463, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 22365 + }, + { + "epoch": 0.1950602640805149, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 22366 + }, + { + "epoch": 0.19506898536568348, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 22367 + }, + { + "epoch": 0.19507770665085206, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 22368 + }, + { + "epoch": 0.19508642793602066, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 22369 + }, + { + "epoch": 0.19509514922118923, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 22370 + }, + { + "epoch": 0.19510387050635783, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 22371 + }, + { + "epoch": 0.1951125917915264, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 22372 + }, + { + "epoch": 0.19512131307669497, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 22373 + }, + { + "epoch": 0.19513003436186357, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 22374 + }, + { + "epoch": 0.19513875564703215, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 22375 + }, + { + "epoch": 0.19514747693220072, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 22376 + }, + { + "epoch": 0.19515619821736932, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 22377 + }, + { + "epoch": 0.1951649195025379, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 22378 + }, + { + "epoch": 0.19517364078770646, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 22379 + }, + { + "epoch": 0.19518236207287507, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 22380 + }, + { + "epoch": 0.19519108335804364, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 22381 + }, + { + "epoch": 0.1951998046432122, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 22382 + }, + { + "epoch": 0.1952085259283808, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 22383 + }, + { + "epoch": 0.19521724721354938, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 22384 + }, + { + "epoch": 0.19522596849871798, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 22385 + }, + { + "epoch": 0.19523468978388656, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 22386 + }, + { + "epoch": 0.19524341106905513, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 22387 + }, + { + "epoch": 0.19525213235422373, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 22388 + }, + { + "epoch": 0.1952608536393923, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 22389 + }, + { + "epoch": 0.19526957492456087, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 22390 + }, + { + "epoch": 0.19527829620972947, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 22391 + }, + { + "epoch": 0.19528701749489805, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 22392 + }, + { + "epoch": 0.19529573878006662, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 22393 + }, + { + "epoch": 0.19530446006523522, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 22394 + }, + { + "epoch": 0.1953131813504038, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 22395 + }, + { + "epoch": 0.19532190263557236, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 22396 + }, + { + "epoch": 0.19533062392074096, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 22397 + }, + { + "epoch": 0.19533934520590954, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 22398 + }, + { + "epoch": 0.19534806649107814, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 22399 + }, + { + "epoch": 0.1953567877762467, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 22400 + }, + { + "epoch": 0.19536550906141528, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 22401 + }, + { + "epoch": 0.19537423034658388, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 22402 + }, + { + "epoch": 0.19538295163175246, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 22403 + }, + { + "epoch": 0.19539167291692103, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 22404 + }, + { + "epoch": 0.19540039420208963, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 22405 + }, + { + "epoch": 0.1954091154872582, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 22406 + }, + { + "epoch": 0.19541783677242677, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 22407 + }, + { + "epoch": 0.19542655805759537, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 22408 + }, + { + "epoch": 0.19543527934276395, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 22409 + }, + { + "epoch": 0.19544400062793255, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 22410 + }, + { + "epoch": 0.19545272191310112, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 22411 + }, + { + "epoch": 0.1954614431982697, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 22412 + }, + { + "epoch": 0.1954701644834383, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 22413 + }, + { + "epoch": 0.19547888576860686, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 22414 + }, + { + "epoch": 0.19548760705377544, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 22415 + }, + { + "epoch": 0.19549632833894404, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 22416 + }, + { + "epoch": 0.1955050496241126, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 22417 + }, + { + "epoch": 0.19551377090928118, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 22418 + }, + { + "epoch": 0.19552249219444978, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 22419 + }, + { + "epoch": 0.19553121347961835, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 22420 + }, + { + "epoch": 0.19553993476478693, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 22421 + }, + { + "epoch": 0.19554865604995553, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 22422 + }, + { + "epoch": 0.1955573773351241, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 22423 + }, + { + "epoch": 0.1955660986202927, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 22424 + }, + { + "epoch": 0.19557481990546127, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 22425 + }, + { + "epoch": 0.19558354119062984, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 22426 + }, + { + "epoch": 0.19559226247579845, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 22427 + }, + { + "epoch": 0.19560098376096702, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 22428 + }, + { + "epoch": 0.1956097050461356, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 22429 + }, + { + "epoch": 0.1956184263313042, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 22430 + }, + { + "epoch": 0.19562714761647276, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 22431 + }, + { + "epoch": 0.19563586890164134, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 22432 + }, + { + "epoch": 0.19564459018680994, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 22433 + }, + { + "epoch": 0.1956533114719785, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 22434 + }, + { + "epoch": 0.19566203275714708, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 22435 + }, + { + "epoch": 0.19567075404231568, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 22436 + }, + { + "epoch": 0.19567947532748425, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 22437 + }, + { + "epoch": 0.19568819661265285, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 22438 + }, + { + "epoch": 0.19569691789782143, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 22439 + }, + { + "epoch": 0.19570563918299, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 22440 + }, + { + "epoch": 0.1957143604681586, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 22441 + }, + { + "epoch": 0.19572308175332717, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 22442 + }, + { + "epoch": 0.19573180303849574, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 22443 + }, + { + "epoch": 0.19574052432366434, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 22444 + }, + { + "epoch": 0.19574924560883292, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 22445 + }, + { + "epoch": 0.1957579668940015, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 22446 + }, + { + "epoch": 0.1957666881791701, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 22447 + }, + { + "epoch": 0.19577540946433866, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 22448 + }, + { + "epoch": 0.19578413074950723, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 22449 + }, + { + "epoch": 0.19579285203467583, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 22450 + }, + { + "epoch": 0.1958015733198444, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 22451 + }, + { + "epoch": 0.195810294605013, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 22452 + }, + { + "epoch": 0.19581901589018158, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 22453 + }, + { + "epoch": 0.19582773717535015, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 22454 + }, + { + "epoch": 0.19583645846051875, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 22455 + }, + { + "epoch": 0.19584517974568733, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 22456 + }, + { + "epoch": 0.1958539010308559, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 22457 + }, + { + "epoch": 0.1958626223160245, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 22458 + }, + { + "epoch": 0.19587134360119307, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 22459 + }, + { + "epoch": 0.19588006488636164, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 22460 + }, + { + "epoch": 0.19588878617153024, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 22461 + }, + { + "epoch": 0.19589750745669882, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 22462 + }, + { + "epoch": 0.1959062287418674, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 22463 + }, + { + "epoch": 0.195914950027036, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 22464 + }, + { + "epoch": 0.19592367131220456, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 22465 + }, + { + "epoch": 0.19593239259737316, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 22466 + }, + { + "epoch": 0.19594111388254173, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 22467 + }, + { + "epoch": 0.1959498351677103, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 22468 + }, + { + "epoch": 0.1959585564528789, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 22469 + }, + { + "epoch": 0.19596727773804748, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 22470 + }, + { + "epoch": 0.19597599902321605, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 22471 + }, + { + "epoch": 0.19598472030838465, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 22472 + }, + { + "epoch": 0.19599344159355322, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 22473 + }, + { + "epoch": 0.1960021628787218, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 22474 + }, + { + "epoch": 0.1960108841638904, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 22475 + }, + { + "epoch": 0.19601960544905897, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 22476 + }, + { + "epoch": 0.19602832673422754, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 22477 + }, + { + "epoch": 0.19603704801939614, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 22478 + }, + { + "epoch": 0.19604576930456472, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 22479 + }, + { + "epoch": 0.19605449058973332, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 22480 + }, + { + "epoch": 0.1960632118749019, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 22481 + }, + { + "epoch": 0.19607193316007046, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 22482 + }, + { + "epoch": 0.19608065444523906, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 22483 + }, + { + "epoch": 0.19608937573040763, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 22484 + }, + { + "epoch": 0.1960980970155762, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 22485 + }, + { + "epoch": 0.1961068183007448, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 22486 + }, + { + "epoch": 0.19611553958591338, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 22487 + }, + { + "epoch": 0.19612426087108195, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 22488 + }, + { + "epoch": 0.19613298215625055, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 22489 + }, + { + "epoch": 0.19614170344141912, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 22490 + }, + { + "epoch": 0.1961504247265877, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 22491 + }, + { + "epoch": 0.1961591460117563, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 22492 + }, + { + "epoch": 0.19616786729692487, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 22493 + }, + { + "epoch": 0.19617658858209347, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 22494 + }, + { + "epoch": 0.19618530986726204, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 22495 + }, + { + "epoch": 0.19619403115243061, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 22496 + }, + { + "epoch": 0.19620275243759921, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 22497 + }, + { + "epoch": 0.1962114737227678, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 22498 + }, + { + "epoch": 0.19622019500793636, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 22499 + }, + { + "epoch": 0.19622891629310496, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 22500 + }, + { + "epoch": 0.19623763757827353, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 22501 + }, + { + "epoch": 0.1962463588634421, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 22502 + }, + { + "epoch": 0.1962550801486107, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 22503 + }, + { + "epoch": 0.19626380143377928, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 22504 + }, + { + "epoch": 0.19627252271894785, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 22505 + }, + { + "epoch": 0.19628124400411645, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 22506 + }, + { + "epoch": 0.19628996528928502, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 22507 + }, + { + "epoch": 0.19629868657445362, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 22508 + }, + { + "epoch": 0.1963074078596222, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 22509 + }, + { + "epoch": 0.19631612914479077, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 22510 + }, + { + "epoch": 0.19632485042995937, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 22511 + }, + { + "epoch": 0.19633357171512794, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 22512 + }, + { + "epoch": 0.19634229300029651, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 22513 + }, + { + "epoch": 0.19635101428546511, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 22514 + }, + { + "epoch": 0.1963597355706337, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 22515 + }, + { + "epoch": 0.19636845685580226, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 22516 + }, + { + "epoch": 0.19637717814097086, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 22517 + }, + { + "epoch": 0.19638589942613943, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 22518 + }, + { + "epoch": 0.19639462071130803, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 22519 + }, + { + "epoch": 0.1964033419964766, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 22520 + }, + { + "epoch": 0.19641206328164518, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 22521 + }, + { + "epoch": 0.19642078456681378, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 22522 + }, + { + "epoch": 0.19642950585198235, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 22523 + }, + { + "epoch": 0.19643822713715092, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 22524 + }, + { + "epoch": 0.19644694842231952, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 22525 + }, + { + "epoch": 0.1964556697074881, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 22526 + }, + { + "epoch": 0.19646439099265667, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 22527 + }, + { + "epoch": 0.19647311227782527, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 22528 + }, + { + "epoch": 0.19648183356299384, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 22529 + }, + { + "epoch": 0.1964905548481624, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 22530 + }, + { + "epoch": 0.196499276133331, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 22531 + }, + { + "epoch": 0.19650799741849959, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 22532 + }, + { + "epoch": 0.1965167187036682, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 22533 + }, + { + "epoch": 0.19652543998883676, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 22534 + }, + { + "epoch": 0.19653416127400533, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 22535 + }, + { + "epoch": 0.19654288255917393, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 22536 + }, + { + "epoch": 0.1965516038443425, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 22537 + }, + { + "epoch": 0.19656032512951108, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 22538 + }, + { + "epoch": 0.19656904641467968, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 22539 + }, + { + "epoch": 0.19657776769984825, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 22540 + }, + { + "epoch": 0.19658648898501682, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 22541 + }, + { + "epoch": 0.19659521027018542, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 22542 + }, + { + "epoch": 0.196603931555354, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 22543 + }, + { + "epoch": 0.19661265284052257, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 22544 + }, + { + "epoch": 0.19662137412569117, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 22545 + }, + { + "epoch": 0.19663009541085974, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 22546 + }, + { + "epoch": 0.19663881669602834, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 22547 + }, + { + "epoch": 0.1966475379811969, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 22548 + }, + { + "epoch": 0.19665625926636549, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 22549 + }, + { + "epoch": 0.19666498055153409, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 22550 + }, + { + "epoch": 0.19667370183670266, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 22551 + }, + { + "epoch": 0.19668242312187123, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 22552 + }, + { + "epoch": 0.19669114440703983, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 22553 + }, + { + "epoch": 0.1966998656922084, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 22554 + }, + { + "epoch": 0.19670858697737698, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 22555 + }, + { + "epoch": 0.19671730826254558, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 22556 + }, + { + "epoch": 0.19672602954771415, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 22557 + }, + { + "epoch": 0.19673475083288272, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 22558 + }, + { + "epoch": 0.19674347211805132, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 22559 + }, + { + "epoch": 0.1967521934032199, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 22560 + }, + { + "epoch": 0.1967609146883885, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 22561 + }, + { + "epoch": 0.19676963597355707, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 22562 + }, + { + "epoch": 0.19677835725872564, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 22563 + }, + { + "epoch": 0.19678707854389424, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 22564 + }, + { + "epoch": 0.1967957998290628, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 22565 + }, + { + "epoch": 0.19680452111423138, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 22566 + }, + { + "epoch": 0.19681324239939998, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 22567 + }, + { + "epoch": 0.19682196368456856, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 22568 + }, + { + "epoch": 0.19683068496973713, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 22569 + }, + { + "epoch": 0.19683940625490573, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 22570 + }, + { + "epoch": 0.1968481275400743, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 22571 + }, + { + "epoch": 0.19685684882524287, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 22572 + }, + { + "epoch": 0.19686557011041148, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 22573 + }, + { + "epoch": 0.19687429139558005, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 22574 + }, + { + "epoch": 0.19688301268074865, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 22575 + }, + { + "epoch": 0.19689173396591722, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 22576 + }, + { + "epoch": 0.1969004552510858, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 22577 + }, + { + "epoch": 0.1969091765362544, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 22578 + }, + { + "epoch": 0.19691789782142297, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 22579 + }, + { + "epoch": 0.19692661910659154, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 22580 + }, + { + "epoch": 0.19693534039176014, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 22581 + }, + { + "epoch": 0.1969440616769287, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 22582 + }, + { + "epoch": 0.19695278296209728, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 22583 + }, + { + "epoch": 0.19696150424726588, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 22584 + }, + { + "epoch": 0.19697022553243446, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 22585 + }, + { + "epoch": 0.19697894681760303, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 22586 + }, + { + "epoch": 0.19698766810277163, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 22587 + }, + { + "epoch": 0.1969963893879402, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 22588 + }, + { + "epoch": 0.1970051106731088, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 22589 + }, + { + "epoch": 0.19701383195827737, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 22590 + }, + { + "epoch": 0.19702255324344595, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 22591 + }, + { + "epoch": 0.19703127452861455, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 22592 + }, + { + "epoch": 0.19703999581378312, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 22593 + }, + { + "epoch": 0.1970487170989517, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 22594 + }, + { + "epoch": 0.1970574383841203, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 22595 + }, + { + "epoch": 0.19706615966928887, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 22596 + }, + { + "epoch": 0.19707488095445744, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 22597 + }, + { + "epoch": 0.19708360223962604, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 22598 + }, + { + "epoch": 0.1970923235247946, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 22599 + }, + { + "epoch": 0.19710104480996318, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 22600 + }, + { + "epoch": 0.19710976609513178, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 22601 + }, + { + "epoch": 0.19711848738030036, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 22602 + }, + { + "epoch": 0.19712720866546896, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 22603 + }, + { + "epoch": 0.19713592995063753, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 22604 + }, + { + "epoch": 0.1971446512358061, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 22605 + }, + { + "epoch": 0.1971533725209747, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 22606 + }, + { + "epoch": 0.19716209380614327, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 22607 + }, + { + "epoch": 0.19717081509131185, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 22608 + }, + { + "epoch": 0.19717953637648045, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 22609 + }, + { + "epoch": 0.19718825766164902, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 22610 + }, + { + "epoch": 0.1971969789468176, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 22611 + }, + { + "epoch": 0.1972057002319862, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 22612 + }, + { + "epoch": 0.19721442151715476, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 22613 + }, + { + "epoch": 0.19722314280232334, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 22614 + }, + { + "epoch": 0.19723186408749194, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 22615 + }, + { + "epoch": 0.1972405853726605, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 22616 + }, + { + "epoch": 0.1972493066578291, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 22617 + }, + { + "epoch": 0.19725802794299768, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 22618 + }, + { + "epoch": 0.19726674922816625, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 22619 + }, + { + "epoch": 0.19727547051333486, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 22620 + }, + { + "epoch": 0.19728419179850343, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 22621 + }, + { + "epoch": 0.197292913083672, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 22622 + }, + { + "epoch": 0.1973016343688406, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 22623 + }, + { + "epoch": 0.19731035565400917, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 22624 + }, + { + "epoch": 0.19731907693917775, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 22625 + }, + { + "epoch": 0.19732779822434635, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 22626 + }, + { + "epoch": 0.19733651950951492, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 22627 + }, + { + "epoch": 0.1973452407946835, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 22628 + }, + { + "epoch": 0.1973539620798521, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 22629 + }, + { + "epoch": 0.19736268336502066, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 22630 + }, + { + "epoch": 0.19737140465018926, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 22631 + }, + { + "epoch": 0.19738012593535784, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 22632 + }, + { + "epoch": 0.1973888472205264, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 22633 + }, + { + "epoch": 0.197397568505695, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 22634 + }, + { + "epoch": 0.19740628979086358, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 22635 + }, + { + "epoch": 0.19741501107603215, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 22636 + }, + { + "epoch": 0.19742373236120075, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 22637 + }, + { + "epoch": 0.19743245364636933, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 22638 + }, + { + "epoch": 0.1974411749315379, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 22639 + }, + { + "epoch": 0.1974498962167065, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 22640 + }, + { + "epoch": 0.19745861750187507, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 22641 + }, + { + "epoch": 0.19746733878704367, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 22642 + }, + { + "epoch": 0.19747606007221225, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 22643 + }, + { + "epoch": 0.19748478135738082, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 22644 + }, + { + "epoch": 0.19749350264254942, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 22645 + }, + { + "epoch": 0.197502223927718, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 22646 + }, + { + "epoch": 0.19751094521288656, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 22647 + }, + { + "epoch": 0.19751966649805516, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 22648 + }, + { + "epoch": 0.19752838778322374, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 22649 + }, + { + "epoch": 0.1975371090683923, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 22650 + }, + { + "epoch": 0.1975458303535609, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 22651 + }, + { + "epoch": 0.19755455163872948, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 22652 + }, + { + "epoch": 0.19756327292389805, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 22653 + }, + { + "epoch": 0.19757199420906665, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 22654 + }, + { + "epoch": 0.19758071549423523, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 22655 + }, + { + "epoch": 0.19758943677940383, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 22656 + }, + { + "epoch": 0.1975981580645724, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 22657 + }, + { + "epoch": 0.19760687934974097, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 22658 + }, + { + "epoch": 0.19761560063490957, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 22659 + }, + { + "epoch": 0.19762432192007814, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 22660 + }, + { + "epoch": 0.19763304320524672, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 22661 + }, + { + "epoch": 0.19764176449041532, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 22662 + }, + { + "epoch": 0.1976504857755839, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 22663 + }, + { + "epoch": 0.19765920706075246, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 22664 + }, + { + "epoch": 0.19766792834592106, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 22665 + }, + { + "epoch": 0.19767664963108963, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 22666 + }, + { + "epoch": 0.1976853709162582, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 22667 + }, + { + "epoch": 0.1976940922014268, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 22668 + }, + { + "epoch": 0.19770281348659538, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 22669 + }, + { + "epoch": 0.19771153477176398, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 22670 + }, + { + "epoch": 0.19772025605693255, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 22671 + }, + { + "epoch": 0.19772897734210113, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 22672 + }, + { + "epoch": 0.19773769862726973, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 22673 + }, + { + "epoch": 0.1977464199124383, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 22674 + }, + { + "epoch": 0.19775514119760687, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 22675 + }, + { + "epoch": 0.19776386248277547, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 22676 + }, + { + "epoch": 0.19777258376794404, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 22677 + }, + { + "epoch": 0.19778130505311262, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 22678 + }, + { + "epoch": 0.19779002633828122, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 22679 + }, + { + "epoch": 0.1977987476234498, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 22680 + }, + { + "epoch": 0.19780746890861836, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 22681 + }, + { + "epoch": 0.19781619019378696, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 22682 + }, + { + "epoch": 0.19782491147895553, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 22683 + }, + { + "epoch": 0.19783363276412413, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 22684 + }, + { + "epoch": 0.1978423540492927, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 22685 + }, + { + "epoch": 0.19785107533446128, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 22686 + }, + { + "epoch": 0.19785979661962988, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 22687 + }, + { + "epoch": 0.19786851790479845, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 22688 + }, + { + "epoch": 0.19787723918996702, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 22689 + }, + { + "epoch": 0.19788596047513562, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 22690 + }, + { + "epoch": 0.1978946817603042, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 22691 + }, + { + "epoch": 0.19790340304547277, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 22692 + }, + { + "epoch": 0.19791212433064137, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 22693 + }, + { + "epoch": 0.19792084561580994, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 22694 + }, + { + "epoch": 0.19792956690097852, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 22695 + }, + { + "epoch": 0.19793828818614712, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 22696 + }, + { + "epoch": 0.1979470094713157, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 22697 + }, + { + "epoch": 0.1979557307564843, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 22698 + }, + { + "epoch": 0.19796445204165286, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 22699 + }, + { + "epoch": 0.19797317332682143, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 22700 + }, + { + "epoch": 0.19798189461199003, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 22701 + }, + { + "epoch": 0.1979906158971586, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 22702 + }, + { + "epoch": 0.19799933718232718, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 22703 + }, + { + "epoch": 0.19800805846749578, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 22704 + }, + { + "epoch": 0.19801677975266435, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 22705 + }, + { + "epoch": 0.19802550103783292, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 22706 + }, + { + "epoch": 0.19803422232300152, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 22707 + }, + { + "epoch": 0.1980429436081701, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 22708 + }, + { + "epoch": 0.19805166489333867, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 22709 + }, + { + "epoch": 0.19806038617850727, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 22710 + }, + { + "epoch": 0.19806910746367584, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 22711 + }, + { + "epoch": 0.19807782874884444, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 22712 + }, + { + "epoch": 0.19808655003401301, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 22713 + }, + { + "epoch": 0.1980952713191816, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 22714 + }, + { + "epoch": 0.1981039926043502, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 22715 + }, + { + "epoch": 0.19811271388951876, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 22716 + }, + { + "epoch": 0.19812143517468733, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 22717 + }, + { + "epoch": 0.19813015645985593, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 22718 + }, + { + "epoch": 0.1981388777450245, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 22719 + }, + { + "epoch": 0.19814759903019308, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 22720 + }, + { + "epoch": 0.19815632031536168, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 22721 + }, + { + "epoch": 0.19816504160053025, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 22722 + }, + { + "epoch": 0.19817376288569882, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 22723 + }, + { + "epoch": 0.19818248417086742, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 22724 + }, + { + "epoch": 0.198191205456036, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 22725 + }, + { + "epoch": 0.1981999267412046, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 22726 + }, + { + "epoch": 0.19820864802637317, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 22727 + }, + { + "epoch": 0.19821736931154174, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 22728 + }, + { + "epoch": 0.19822609059671034, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 22729 + }, + { + "epoch": 0.19823481188187891, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 22730 + }, + { + "epoch": 0.1982435331670475, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 22731 + }, + { + "epoch": 0.1982522544522161, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 22732 + }, + { + "epoch": 0.19826097573738466, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 22733 + }, + { + "epoch": 0.19826969702255323, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 22734 + }, + { + "epoch": 0.19827841830772183, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 22735 + }, + { + "epoch": 0.1982871395928904, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 22736 + }, + { + "epoch": 0.19829586087805898, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 22737 + }, + { + "epoch": 0.19830458216322758, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 22738 + }, + { + "epoch": 0.19831330344839615, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 22739 + }, + { + "epoch": 0.19832202473356475, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 22740 + }, + { + "epoch": 0.19833074601873332, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 22741 + }, + { + "epoch": 0.1983394673039019, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 22742 + }, + { + "epoch": 0.1983481885890705, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 22743 + }, + { + "epoch": 0.19835690987423907, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 22744 + }, + { + "epoch": 0.19836563115940764, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 22745 + }, + { + "epoch": 0.19837435244457624, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 22746 + }, + { + "epoch": 0.1983830737297448, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 22747 + }, + { + "epoch": 0.19839179501491339, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 22748 + }, + { + "epoch": 0.19840051630008199, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 22749 + }, + { + "epoch": 0.19840923758525056, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 22750 + }, + { + "epoch": 0.19841795887041916, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 22751 + }, + { + "epoch": 0.19842668015558773, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 22752 + }, + { + "epoch": 0.1984354014407563, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 22753 + }, + { + "epoch": 0.1984441227259249, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 22754 + }, + { + "epoch": 0.19845284401109348, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 22755 + }, + { + "epoch": 0.19846156529626205, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 22756 + }, + { + "epoch": 0.19847028658143065, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 22757 + }, + { + "epoch": 0.19847900786659922, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 22758 + }, + { + "epoch": 0.1984877291517678, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 22759 + }, + { + "epoch": 0.1984964504369364, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 22760 + }, + { + "epoch": 0.19850517172210497, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 22761 + }, + { + "epoch": 0.19851389300727354, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 22762 + }, + { + "epoch": 0.19852261429244214, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 22763 + }, + { + "epoch": 0.1985313355776107, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 22764 + }, + { + "epoch": 0.1985400568627793, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 22765 + }, + { + "epoch": 0.19854877814794789, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 22766 + }, + { + "epoch": 0.19855749943311646, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 22767 + }, + { + "epoch": 0.19856622071828506, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 22768 + }, + { + "epoch": 0.19857494200345363, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 22769 + }, + { + "epoch": 0.1985836632886222, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 22770 + }, + { + "epoch": 0.1985923845737908, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 22771 + }, + { + "epoch": 0.19860110585895938, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 22772 + }, + { + "epoch": 0.19860982714412795, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 22773 + }, + { + "epoch": 0.19861854842929655, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 22774 + }, + { + "epoch": 0.19862726971446512, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 22775 + }, + { + "epoch": 0.1986359909996337, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 22776 + }, + { + "epoch": 0.1986447122848023, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 22777 + }, + { + "epoch": 0.19865343356997087, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 22778 + }, + { + "epoch": 0.19866215485513947, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 22779 + }, + { + "epoch": 0.19867087614030804, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 22780 + }, + { + "epoch": 0.1986795974254766, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 22781 + }, + { + "epoch": 0.1986883187106452, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 22782 + }, + { + "epoch": 0.19869703999581378, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 22783 + }, + { + "epoch": 0.19870576128098236, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 22784 + }, + { + "epoch": 0.19871448256615096, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 22785 + }, + { + "epoch": 0.19872320385131953, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 22786 + }, + { + "epoch": 0.1987319251364881, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 22787 + }, + { + "epoch": 0.1987406464216567, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 22788 + }, + { + "epoch": 0.19874936770682528, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 22789 + }, + { + "epoch": 0.19875808899199385, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 22790 + }, + { + "epoch": 0.19876681027716245, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 22791 + }, + { + "epoch": 0.19877553156233102, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 22792 + }, + { + "epoch": 0.19878425284749962, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 22793 + }, + { + "epoch": 0.1987929741326682, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 22794 + }, + { + "epoch": 0.19880169541783677, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 22795 + }, + { + "epoch": 0.19881041670300537, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 22796 + }, + { + "epoch": 0.19881913798817394, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 22797 + }, + { + "epoch": 0.1988278592733425, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 22798 + }, + { + "epoch": 0.1988365805585111, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 22799 + }, + { + "epoch": 0.19884530184367968, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 22800 + }, + { + "epoch": 0.19885402312884826, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 22801 + }, + { + "epoch": 0.19886274441401686, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 22802 + }, + { + "epoch": 0.19887146569918543, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 22803 + }, + { + "epoch": 0.198880186984354, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 22804 + }, + { + "epoch": 0.1988889082695226, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 22805 + }, + { + "epoch": 0.19889762955469117, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 22806 + }, + { + "epoch": 0.19890635083985977, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 22807 + }, + { + "epoch": 0.19891507212502835, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 22808 + }, + { + "epoch": 0.19892379341019692, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 22809 + }, + { + "epoch": 0.19893251469536552, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 22810 + }, + { + "epoch": 0.1989412359805341, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 22811 + }, + { + "epoch": 0.19894995726570266, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 22812 + }, + { + "epoch": 0.19895867855087127, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 22813 + }, + { + "epoch": 0.19896739983603984, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 22814 + }, + { + "epoch": 0.1989761211212084, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 22815 + }, + { + "epoch": 0.198984842406377, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 22816 + }, + { + "epoch": 0.19899356369154558, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 22817 + }, + { + "epoch": 0.19900228497671416, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 22818 + }, + { + "epoch": 0.19901100626188276, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 22819 + }, + { + "epoch": 0.19901972754705133, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 22820 + }, + { + "epoch": 0.19902844883221993, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 22821 + }, + { + "epoch": 0.1990371701173885, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 22822 + }, + { + "epoch": 0.19904589140255707, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 22823 + }, + { + "epoch": 0.19905461268772567, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 22824 + }, + { + "epoch": 0.19906333397289425, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 22825 + }, + { + "epoch": 0.19907205525806282, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 22826 + }, + { + "epoch": 0.19908077654323142, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 22827 + }, + { + "epoch": 0.1990894978284, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 22828 + }, + { + "epoch": 0.19909821911356856, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 22829 + }, + { + "epoch": 0.19910694039873716, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 22830 + }, + { + "epoch": 0.19911566168390574, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 22831 + }, + { + "epoch": 0.1991243829690743, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 22832 + }, + { + "epoch": 0.1991331042542429, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 22833 + }, + { + "epoch": 0.19914182553941148, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 22834 + }, + { + "epoch": 0.19915054682458008, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 22835 + }, + { + "epoch": 0.19915926810974866, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 22836 + }, + { + "epoch": 0.19916798939491723, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 22837 + }, + { + "epoch": 0.19917671068008583, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 22838 + }, + { + "epoch": 0.1991854319652544, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 22839 + }, + { + "epoch": 0.19919415325042297, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 22840 + }, + { + "epoch": 0.19920287453559157, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 22841 + }, + { + "epoch": 0.19921159582076015, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 22842 + }, + { + "epoch": 0.19922031710592872, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 22843 + }, + { + "epoch": 0.19922903839109732, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 22844 + }, + { + "epoch": 0.1992377596762659, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 22845 + }, + { + "epoch": 0.19924648096143446, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 22846 + }, + { + "epoch": 0.19925520224660306, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 22847 + }, + { + "epoch": 0.19926392353177164, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 22848 + }, + { + "epoch": 0.19927264481694024, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 22849 + }, + { + "epoch": 0.1992813661021088, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 22850 + }, + { + "epoch": 0.19929008738727738, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 22851 + }, + { + "epoch": 0.19929880867244598, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 22852 + }, + { + "epoch": 0.19930752995761455, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 22853 + }, + { + "epoch": 0.19931625124278313, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 22854 + }, + { + "epoch": 0.19932497252795173, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 22855 + }, + { + "epoch": 0.1993336938131203, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 22856 + }, + { + "epoch": 0.19934241509828887, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 22857 + }, + { + "epoch": 0.19935113638345747, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 22858 + }, + { + "epoch": 0.19935985766862604, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 22859 + }, + { + "epoch": 0.19936857895379462, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 22860 + }, + { + "epoch": 0.19937730023896322, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 22861 + }, + { + "epoch": 0.1993860215241318, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 22862 + }, + { + "epoch": 0.1993947428093004, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 22863 + }, + { + "epoch": 0.19940346409446896, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 22864 + }, + { + "epoch": 0.19941218537963754, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 22865 + }, + { + "epoch": 0.19942090666480614, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 22866 + }, + { + "epoch": 0.1994296279499747, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 22867 + }, + { + "epoch": 0.19943834923514328, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 22868 + }, + { + "epoch": 0.19944707052031188, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 22869 + }, + { + "epoch": 0.19945579180548045, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 22870 + }, + { + "epoch": 0.19946451309064903, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 22871 + }, + { + "epoch": 0.19947323437581763, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 22872 + }, + { + "epoch": 0.1994819556609862, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 22873 + }, + { + "epoch": 0.1994906769461548, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 22874 + }, + { + "epoch": 0.19949939823132337, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 22875 + }, + { + "epoch": 0.19950811951649194, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 22876 + }, + { + "epoch": 0.19951684080166054, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 22877 + }, + { + "epoch": 0.19952556208682912, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 22878 + }, + { + "epoch": 0.1995342833719977, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 22879 + }, + { + "epoch": 0.1995430046571663, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 22880 + }, + { + "epoch": 0.19955172594233486, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 22881 + }, + { + "epoch": 0.19956044722750343, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 22882 + }, + { + "epoch": 0.19956916851267203, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 22883 + }, + { + "epoch": 0.1995778897978406, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 22884 + }, + { + "epoch": 0.19958661108300918, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 22885 + }, + { + "epoch": 0.19959533236817778, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 22886 + }, + { + "epoch": 0.19960405365334635, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 22887 + }, + { + "epoch": 0.19961277493851495, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 22888 + }, + { + "epoch": 0.19962149622368353, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 22889 + }, + { + "epoch": 0.1996302175088521, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 22890 + }, + { + "epoch": 0.1996389387940207, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 22891 + }, + { + "epoch": 0.19964766007918927, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 22892 + }, + { + "epoch": 0.19965638136435784, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 22893 + }, + { + "epoch": 0.19966510264952644, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 22894 + }, + { + "epoch": 0.19967382393469502, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 22895 + }, + { + "epoch": 0.1996825452198636, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 22896 + }, + { + "epoch": 0.1996912665050322, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 22897 + }, + { + "epoch": 0.19969998779020076, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 22898 + }, + { + "epoch": 0.19970870907536933, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 22899 + }, + { + "epoch": 0.19971743036053793, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 22900 + }, + { + "epoch": 0.1997261516457065, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 22901 + }, + { + "epoch": 0.1997348729308751, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 22902 + }, + { + "epoch": 0.19974359421604368, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 22903 + }, + { + "epoch": 0.19975231550121225, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 22904 + }, + { + "epoch": 0.19976103678638085, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 22905 + }, + { + "epoch": 0.19976975807154942, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 22906 + }, + { + "epoch": 0.199778479356718, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 22907 + }, + { + "epoch": 0.1997872006418866, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 22908 + }, + { + "epoch": 0.19979592192705517, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 22909 + }, + { + "epoch": 0.19980464321222374, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 22910 + }, + { + "epoch": 0.19981336449739234, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 22911 + }, + { + "epoch": 0.19982208578256092, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 22912 + }, + { + "epoch": 0.1998308070677295, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 22913 + }, + { + "epoch": 0.1998395283528981, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 22914 + }, + { + "epoch": 0.19984824963806666, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 22915 + }, + { + "epoch": 0.19985697092323526, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 22916 + }, + { + "epoch": 0.19986569220840383, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 22917 + }, + { + "epoch": 0.1998744134935724, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 22918 + }, + { + "epoch": 0.199883134778741, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 22919 + }, + { + "epoch": 0.19989185606390958, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 22920 + }, + { + "epoch": 0.19990057734907815, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 22921 + }, + { + "epoch": 0.19990929863424675, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 22922 + }, + { + "epoch": 0.19991801991941532, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 22923 + }, + { + "epoch": 0.1999267412045839, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 22924 + }, + { + "epoch": 0.1999354624897525, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 22925 + }, + { + "epoch": 0.19994418377492107, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 22926 + }, + { + "epoch": 0.19995290506008964, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 22927 + }, + { + "epoch": 0.19996162634525824, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 22928 + }, + { + "epoch": 0.19997034763042681, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 22929 + }, + { + "epoch": 0.19997906891559541, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 22930 + }, + { + "epoch": 0.199987790200764, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 22931 + }, + { + "epoch": 0.19999651148593256, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 22932 + }, + { + "epoch": 0.20000523277110116, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 22933 + }, + { + "epoch": 0.20001395405626973, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 22934 + }, + { + "epoch": 0.2000226753414383, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 22935 + }, + { + "epoch": 0.2000313966266069, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 22936 + }, + { + "epoch": 0.20004011791177548, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 22937 + }, + { + "epoch": 0.20004883919694405, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 22938 + }, + { + "epoch": 0.20005756048211265, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 22939 + }, + { + "epoch": 0.20006628176728122, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 22940 + }, + { + "epoch": 0.2000750030524498, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 22941 + }, + { + "epoch": 0.2000837243376184, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 22942 + }, + { + "epoch": 0.20009244562278697, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 22943 + }, + { + "epoch": 0.20010116690795557, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 22944 + }, + { + "epoch": 0.20010988819312414, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 22945 + }, + { + "epoch": 0.2001186094782927, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 22946 + }, + { + "epoch": 0.20012733076346131, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 22947 + }, + { + "epoch": 0.2001360520486299, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 22948 + }, + { + "epoch": 0.20014477333379846, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 22949 + }, + { + "epoch": 0.20015349461896706, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 22950 + }, + { + "epoch": 0.20016221590413563, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 22951 + }, + { + "epoch": 0.2001709371893042, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 22952 + }, + { + "epoch": 0.2001796584744728, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 22953 + }, + { + "epoch": 0.20018837975964138, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 22954 + }, + { + "epoch": 0.20019710104480995, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 22955 + }, + { + "epoch": 0.20020582232997855, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 22956 + }, + { + "epoch": 0.20021454361514712, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 22957 + }, + { + "epoch": 0.20022326490031572, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 22958 + }, + { + "epoch": 0.2002319861854843, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 22959 + }, + { + "epoch": 0.20024070747065287, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 22960 + }, + { + "epoch": 0.20024942875582147, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 22961 + }, + { + "epoch": 0.20025815004099004, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 22962 + }, + { + "epoch": 0.2002668713261586, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 22963 + }, + { + "epoch": 0.2002755926113272, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 22964 + }, + { + "epoch": 0.20028431389649579, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 22965 + }, + { + "epoch": 0.20029303518166436, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 22966 + }, + { + "epoch": 0.20030175646683296, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 22967 + }, + { + "epoch": 0.20031047775200153, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 22968 + }, + { + "epoch": 0.2003191990371701, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 22969 + }, + { + "epoch": 0.2003279203223387, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 22970 + }, + { + "epoch": 0.20033664160750728, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 22971 + }, + { + "epoch": 0.20034536289267588, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 22972 + }, + { + "epoch": 0.20035408417784445, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 22973 + }, + { + "epoch": 0.20036280546301302, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 22974 + }, + { + "epoch": 0.20037152674818162, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 22975 + }, + { + "epoch": 0.2003802480333502, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 22976 + }, + { + "epoch": 0.20038896931851877, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 22977 + }, + { + "epoch": 0.20039769060368737, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 22978 + }, + { + "epoch": 0.20040641188885594, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 22979 + }, + { + "epoch": 0.2004151331740245, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 22980 + }, + { + "epoch": 0.2004238544591931, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 22981 + }, + { + "epoch": 0.20043257574436169, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 22982 + }, + { + "epoch": 0.20044129702953029, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 22983 + }, + { + "epoch": 0.20045001831469886, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 22984 + }, + { + "epoch": 0.20045873959986743, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 22985 + }, + { + "epoch": 0.20046746088503603, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 22986 + }, + { + "epoch": 0.2004761821702046, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 22987 + }, + { + "epoch": 0.20048490345537318, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 22988 + }, + { + "epoch": 0.20049362474054178, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 22989 + }, + { + "epoch": 0.20050234602571035, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 22990 + }, + { + "epoch": 0.20051106731087892, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 22991 + }, + { + "epoch": 0.20051978859604752, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 22992 + }, + { + "epoch": 0.2005285098812161, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 22993 + }, + { + "epoch": 0.20053723116638467, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 22994 + }, + { + "epoch": 0.20054595245155327, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 22995 + }, + { + "epoch": 0.20055467373672184, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 22996 + }, + { + "epoch": 0.20056339502189044, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 22997 + }, + { + "epoch": 0.200572116307059, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 22998 + }, + { + "epoch": 0.20058083759222758, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 22999 + }, + { + "epoch": 0.20058955887739618, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 23000 + }, + { + "epoch": 0.20059828016256476, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 23001 + }, + { + "epoch": 0.20060700144773333, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 23002 + }, + { + "epoch": 0.20061572273290193, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 23003 + }, + { + "epoch": 0.2006244440180705, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 23004 + }, + { + "epoch": 0.20063316530323907, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 23005 + }, + { + "epoch": 0.20064188658840768, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 23006 + }, + { + "epoch": 0.20065060787357625, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 23007 + }, + { + "epoch": 0.20065932915874482, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 23008 + }, + { + "epoch": 0.20066805044391342, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 23009 + }, + { + "epoch": 0.200676771729082, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 23010 + }, + { + "epoch": 0.2006854930142506, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 23011 + }, + { + "epoch": 0.20069421429941917, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 23012 + }, + { + "epoch": 0.20070293558458774, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 23013 + }, + { + "epoch": 0.20071165686975634, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 23014 + }, + { + "epoch": 0.2007203781549249, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 23015 + }, + { + "epoch": 0.20072909944009348, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 23016 + }, + { + "epoch": 0.20073782072526208, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 23017 + }, + { + "epoch": 0.20074654201043066, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 23018 + }, + { + "epoch": 0.20075526329559923, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 23019 + }, + { + "epoch": 0.20076398458076783, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 23020 + }, + { + "epoch": 0.2007727058659364, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 23021 + }, + { + "epoch": 0.20078142715110497, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 23022 + }, + { + "epoch": 0.20079014843627357, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 23023 + }, + { + "epoch": 0.20079886972144215, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 23024 + }, + { + "epoch": 0.20080759100661075, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 23025 + }, + { + "epoch": 0.20081631229177932, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 23026 + }, + { + "epoch": 0.2008250335769479, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 23027 + }, + { + "epoch": 0.2008337548621165, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 23028 + }, + { + "epoch": 0.20084247614728507, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 23029 + }, + { + "epoch": 0.20085119743245364, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 23030 + }, + { + "epoch": 0.20085991871762224, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 23031 + }, + { + "epoch": 0.2008686400027908, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 23032 + }, + { + "epoch": 0.20087736128795938, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 23033 + }, + { + "epoch": 0.20088608257312798, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 23034 + }, + { + "epoch": 0.20089480385829656, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 23035 + }, + { + "epoch": 0.20090352514346513, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 23036 + }, + { + "epoch": 0.20091224642863373, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 23037 + }, + { + "epoch": 0.2009209677138023, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 23038 + }, + { + "epoch": 0.2009296889989709, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 23039 + }, + { + "epoch": 0.20093841028413947, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 23040 + }, + { + "epoch": 0.20094713156930805, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 23041 + }, + { + "epoch": 0.20095585285447665, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 23042 + }, + { + "epoch": 0.20096457413964522, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 23043 + }, + { + "epoch": 0.2009732954248138, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 23044 + }, + { + "epoch": 0.2009820167099824, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 23045 + }, + { + "epoch": 0.20099073799515096, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 23046 + }, + { + "epoch": 0.20099945928031954, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 23047 + }, + { + "epoch": 0.20100818056548814, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 23048 + }, + { + "epoch": 0.2010169018506567, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 23049 + }, + { + "epoch": 0.20102562313582528, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 23050 + }, + { + "epoch": 0.20103434442099388, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 23051 + }, + { + "epoch": 0.20104306570616245, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 23052 + }, + { + "epoch": 0.20105178699133106, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 23053 + }, + { + "epoch": 0.20106050827649963, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 23054 + }, + { + "epoch": 0.2010692295616682, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 23055 + }, + { + "epoch": 0.2010779508468368, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 23056 + }, + { + "epoch": 0.20108667213200537, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 23057 + }, + { + "epoch": 0.20109539341717395, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 23058 + }, + { + "epoch": 0.20110411470234255, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 23059 + }, + { + "epoch": 0.20111283598751112, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 23060 + }, + { + "epoch": 0.2011215572726797, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 23061 + }, + { + "epoch": 0.2011302785578483, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 23062 + }, + { + "epoch": 0.20113899984301686, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 23063 + }, + { + "epoch": 0.20114772112818544, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 23064 + }, + { + "epoch": 0.20115644241335404, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 23065 + }, + { + "epoch": 0.2011651636985226, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 23066 + }, + { + "epoch": 0.2011738849836912, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 23067 + }, + { + "epoch": 0.20118260626885978, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 23068 + }, + { + "epoch": 0.20119132755402835, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 23069 + }, + { + "epoch": 0.20120004883919695, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 23070 + }, + { + "epoch": 0.20120877012436553, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 23071 + }, + { + "epoch": 0.2012174914095341, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 23072 + }, + { + "epoch": 0.2012262126947027, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 23073 + }, + { + "epoch": 0.20123493397987127, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 23074 + }, + { + "epoch": 0.20124365526503984, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 23075 + }, + { + "epoch": 0.20125237655020844, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 23076 + }, + { + "epoch": 0.20126109783537702, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 23077 + }, + { + "epoch": 0.2012698191205456, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 23078 + }, + { + "epoch": 0.2012785404057142, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 23079 + }, + { + "epoch": 0.20128726169088276, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 23080 + }, + { + "epoch": 0.20129598297605136, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 23081 + }, + { + "epoch": 0.20130470426121994, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 23082 + }, + { + "epoch": 0.2013134255463885, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 23083 + }, + { + "epoch": 0.2013221468315571, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 23084 + }, + { + "epoch": 0.20133086811672568, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 23085 + }, + { + "epoch": 0.20133958940189425, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 23086 + }, + { + "epoch": 0.20134831068706285, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 23087 + }, + { + "epoch": 0.20135703197223143, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 23088 + }, + { + "epoch": 0.2013657532574, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 23089 + }, + { + "epoch": 0.2013744745425686, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 23090 + }, + { + "epoch": 0.20138319582773717, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 23091 + }, + { + "epoch": 0.20139191711290574, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 23092 + }, + { + "epoch": 0.20140063839807434, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 23093 + }, + { + "epoch": 0.20140935968324292, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 23094 + }, + { + "epoch": 0.20141808096841152, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 23095 + }, + { + "epoch": 0.2014268022535801, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 23096 + }, + { + "epoch": 0.20143552353874866, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 23097 + }, + { + "epoch": 0.20144424482391726, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 23098 + }, + { + "epoch": 0.20145296610908583, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 23099 + }, + { + "epoch": 0.2014616873942544, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 23100 + }, + { + "epoch": 0.201470408679423, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 23101 + }, + { + "epoch": 0.20147912996459158, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 23102 + }, + { + "epoch": 0.20148785124976015, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 23103 + }, + { + "epoch": 0.20149657253492875, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 23104 + }, + { + "epoch": 0.20150529382009733, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 23105 + }, + { + "epoch": 0.20151401510526593, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 23106 + }, + { + "epoch": 0.2015227363904345, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 23107 + }, + { + "epoch": 0.20153145767560307, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 23108 + }, + { + "epoch": 0.20154017896077167, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 23109 + }, + { + "epoch": 0.20154890024594024, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 23110 + }, + { + "epoch": 0.20155762153110882, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 23111 + }, + { + "epoch": 0.20156634281627742, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 23112 + }, + { + "epoch": 0.201575064101446, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 23113 + }, + { + "epoch": 0.20158378538661456, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 23114 + }, + { + "epoch": 0.20159250667178316, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 23115 + }, + { + "epoch": 0.20160122795695173, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 23116 + }, + { + "epoch": 0.2016099492421203, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 23117 + }, + { + "epoch": 0.2016186705272889, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 23118 + }, + { + "epoch": 0.20162739181245748, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 23119 + }, + { + "epoch": 0.20163611309762608, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 23120 + }, + { + "epoch": 0.20164483438279465, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 23121 + }, + { + "epoch": 0.20165355566796322, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 23122 + }, + { + "epoch": 0.20166227695313182, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 23123 + }, + { + "epoch": 0.2016709982383004, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 23124 + }, + { + "epoch": 0.20167971952346897, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 23125 + }, + { + "epoch": 0.20168844080863757, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 23126 + }, + { + "epoch": 0.20169716209380614, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 23127 + }, + { + "epoch": 0.20170588337897472, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 23128 + }, + { + "epoch": 0.20171460466414332, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 23129 + }, + { + "epoch": 0.2017233259493119, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 23130 + }, + { + "epoch": 0.20173204723448046, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 23131 + }, + { + "epoch": 0.20174076851964906, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 23132 + }, + { + "epoch": 0.20174948980481763, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 23133 + }, + { + "epoch": 0.20175821108998623, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 23134 + }, + { + "epoch": 0.2017669323751548, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 23135 + }, + { + "epoch": 0.20177565366032338, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 23136 + }, + { + "epoch": 0.20178437494549198, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 23137 + }, + { + "epoch": 0.20179309623066055, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 23138 + }, + { + "epoch": 0.20180181751582912, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 23139 + }, + { + "epoch": 0.20181053880099772, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 23140 + }, + { + "epoch": 0.2018192600861663, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 23141 + }, + { + "epoch": 0.20182798137133487, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 23142 + }, + { + "epoch": 0.20183670265650347, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 23143 + }, + { + "epoch": 0.20184542394167204, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 23144 + }, + { + "epoch": 0.20185414522684061, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 23145 + }, + { + "epoch": 0.20186286651200921, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 23146 + }, + { + "epoch": 0.2018715877971778, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 23147 + }, + { + "epoch": 0.2018803090823464, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 23148 + }, + { + "epoch": 0.20188903036751496, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 23149 + }, + { + "epoch": 0.20189775165268353, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 23150 + }, + { + "epoch": 0.20190647293785213, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 23151 + }, + { + "epoch": 0.2019151942230207, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 23152 + }, + { + "epoch": 0.20192391550818928, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 23153 + }, + { + "epoch": 0.20193263679335788, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 23154 + }, + { + "epoch": 0.20194135807852645, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 23155 + }, + { + "epoch": 0.20195007936369502, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 23156 + }, + { + "epoch": 0.20195880064886362, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 23157 + }, + { + "epoch": 0.2019675219340322, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 23158 + }, + { + "epoch": 0.20197624321920077, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 23159 + }, + { + "epoch": 0.20198496450436937, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 23160 + }, + { + "epoch": 0.20199368578953794, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 23161 + }, + { + "epoch": 0.20200240707470654, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 23162 + }, + { + "epoch": 0.20201112835987511, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 23163 + }, + { + "epoch": 0.2020198496450437, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 23164 + }, + { + "epoch": 0.2020285709302123, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 23165 + }, + { + "epoch": 0.20203729221538086, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 23166 + }, + { + "epoch": 0.20204601350054943, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 23167 + }, + { + "epoch": 0.20205473478571803, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 23168 + }, + { + "epoch": 0.2020634560708866, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 23169 + }, + { + "epoch": 0.20207217735605518, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 23170 + }, + { + "epoch": 0.20208089864122378, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 23171 + }, + { + "epoch": 0.20208961992639235, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 23172 + }, + { + "epoch": 0.20209834121156092, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 23173 + }, + { + "epoch": 0.20210706249672952, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 23174 + }, + { + "epoch": 0.2021157837818981, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 23175 + }, + { + "epoch": 0.2021245050670667, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 23176 + }, + { + "epoch": 0.20213322635223527, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 23177 + }, + { + "epoch": 0.20214194763740384, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 23178 + }, + { + "epoch": 0.20215066892257244, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 23179 + }, + { + "epoch": 0.202159390207741, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 23180 + }, + { + "epoch": 0.20216811149290959, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 23181 + }, + { + "epoch": 0.20217683277807819, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 23182 + }, + { + "epoch": 0.20218555406324676, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 23183 + }, + { + "epoch": 0.20219427534841533, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 23184 + }, + { + "epoch": 0.20220299663358393, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 23185 + }, + { + "epoch": 0.2022117179187525, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 23186 + }, + { + "epoch": 0.20222043920392108, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 23187 + }, + { + "epoch": 0.20222916048908968, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 23188 + }, + { + "epoch": 0.20223788177425825, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 23189 + }, + { + "epoch": 0.20224660305942685, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 23190 + }, + { + "epoch": 0.20225532434459542, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 23191 + }, + { + "epoch": 0.202264045629764, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 23192 + }, + { + "epoch": 0.2022727669149326, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 23193 + }, + { + "epoch": 0.20228148820010117, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 23194 + }, + { + "epoch": 0.20229020948526974, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 23195 + }, + { + "epoch": 0.20229893077043834, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 23196 + }, + { + "epoch": 0.2023076520556069, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 23197 + }, + { + "epoch": 0.20231637334077548, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 23198 + }, + { + "epoch": 0.20232509462594409, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 23199 + }, + { + "epoch": 0.20233381591111266, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 23200 + }, + { + "epoch": 0.20234253719628123, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 23201 + }, + { + "epoch": 0.20235125848144983, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 23202 + }, + { + "epoch": 0.2023599797666184, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 23203 + }, + { + "epoch": 0.202368701051787, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 23204 + }, + { + "epoch": 0.20237742233695558, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 23205 + }, + { + "epoch": 0.20238614362212415, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 23206 + }, + { + "epoch": 0.20239486490729275, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 23207 + }, + { + "epoch": 0.20240358619246132, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 23208 + }, + { + "epoch": 0.2024123074776299, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 23209 + }, + { + "epoch": 0.2024210287627985, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 23210 + }, + { + "epoch": 0.20242975004796707, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 23211 + }, + { + "epoch": 0.20243847133313564, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 23212 + }, + { + "epoch": 0.20244719261830424, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 23213 + }, + { + "epoch": 0.2024559139034728, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 23214 + }, + { + "epoch": 0.2024646351886414, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 23215 + }, + { + "epoch": 0.20247335647380998, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 23216 + }, + { + "epoch": 0.20248207775897856, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 23217 + }, + { + "epoch": 0.20249079904414716, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 23218 + }, + { + "epoch": 0.20249952032931573, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 23219 + }, + { + "epoch": 0.2025082416144843, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 23220 + }, + { + "epoch": 0.2025169628996529, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 23221 + }, + { + "epoch": 0.20252568418482148, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 23222 + }, + { + "epoch": 0.20253440546999005, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 23223 + }, + { + "epoch": 0.20254312675515865, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 23224 + }, + { + "epoch": 0.20255184804032722, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 23225 + }, + { + "epoch": 0.2025605693254958, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 23226 + }, + { + "epoch": 0.2025692906106644, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 23227 + }, + { + "epoch": 0.20257801189583297, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 23228 + }, + { + "epoch": 0.20258673318100157, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 23229 + }, + { + "epoch": 0.20259545446617014, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 23230 + }, + { + "epoch": 0.2026041757513387, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 23231 + }, + { + "epoch": 0.2026128970365073, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 23232 + }, + { + "epoch": 0.20262161832167588, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 23233 + }, + { + "epoch": 0.20263033960684446, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 23234 + }, + { + "epoch": 0.20263906089201306, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 23235 + }, + { + "epoch": 0.20264778217718163, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 23236 + }, + { + "epoch": 0.2026565034623502, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 23237 + }, + { + "epoch": 0.2026652247475188, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 23238 + }, + { + "epoch": 0.20267394603268737, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 23239 + }, + { + "epoch": 0.20268266731785595, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 23240 + }, + { + "epoch": 0.20269138860302455, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 23241 + }, + { + "epoch": 0.20270010988819312, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 23242 + }, + { + "epoch": 0.20270883117336172, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9804, + "step": 23243 + }, + { + "epoch": 0.2027175524585303, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 23244 + }, + { + "epoch": 0.20272627374369886, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 23245 + }, + { + "epoch": 0.20273499502886747, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 23246 + }, + { + "epoch": 0.20274371631403604, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 23247 + }, + { + "epoch": 0.2027524375992046, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 23248 + }, + { + "epoch": 0.2027611588843732, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 23249 + }, + { + "epoch": 0.20276988016954178, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 23250 + }, + { + "epoch": 0.20277860145471036, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 23251 + }, + { + "epoch": 0.20278732273987896, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 23252 + }, + { + "epoch": 0.20279604402504753, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 23253 + }, + { + "epoch": 0.2028047653102161, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 23254 + }, + { + "epoch": 0.2028134865953847, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 23255 + }, + { + "epoch": 0.20282220788055327, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 23256 + }, + { + "epoch": 0.20283092916572187, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 23257 + }, + { + "epoch": 0.20283965045089045, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 23258 + }, + { + "epoch": 0.20284837173605902, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 23259 + }, + { + "epoch": 0.20285709302122762, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 23260 + }, + { + "epoch": 0.2028658143063962, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 23261 + }, + { + "epoch": 0.20287453559156476, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 23262 + }, + { + "epoch": 0.20288325687673336, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 23263 + }, + { + "epoch": 0.20289197816190194, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 23264 + }, + { + "epoch": 0.2029006994470705, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 23265 + }, + { + "epoch": 0.2029094207322391, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 23266 + }, + { + "epoch": 0.20291814201740768, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 23267 + }, + { + "epoch": 0.20292686330257625, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 23268 + }, + { + "epoch": 0.20293558458774485, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 23269 + }, + { + "epoch": 0.20294430587291343, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 23270 + }, + { + "epoch": 0.20295302715808203, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 23271 + }, + { + "epoch": 0.2029617484432506, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 23272 + }, + { + "epoch": 0.20297046972841917, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 23273 + }, + { + "epoch": 0.20297919101358777, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 23274 + }, + { + "epoch": 0.20298791229875635, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 23275 + }, + { + "epoch": 0.20299663358392492, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 23276 + }, + { + "epoch": 0.20300535486909352, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 23277 + }, + { + "epoch": 0.2030140761542621, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 23278 + }, + { + "epoch": 0.20302279743943066, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 23279 + }, + { + "epoch": 0.20303151872459926, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 23280 + }, + { + "epoch": 0.20304024000976784, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 23281 + }, + { + "epoch": 0.2030489612949364, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 23282 + }, + { + "epoch": 0.203057682580105, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 23283 + }, + { + "epoch": 0.20306640386527358, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 23284 + }, + { + "epoch": 0.20307512515044218, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 23285 + }, + { + "epoch": 0.20308384643561075, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 23286 + }, + { + "epoch": 0.20309256772077933, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 23287 + }, + { + "epoch": 0.20310128900594793, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 23288 + }, + { + "epoch": 0.2031100102911165, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 23289 + }, + { + "epoch": 0.20311873157628507, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 23290 + }, + { + "epoch": 0.20312745286145367, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 23291 + }, + { + "epoch": 0.20313617414662224, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 23292 + }, + { + "epoch": 0.20314489543179082, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 23293 + }, + { + "epoch": 0.20315361671695942, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 23294 + }, + { + "epoch": 0.203162338002128, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 23295 + }, + { + "epoch": 0.20317105928729656, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 23296 + }, + { + "epoch": 0.20317978057246516, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 23297 + }, + { + "epoch": 0.20318850185763374, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 23298 + }, + { + "epoch": 0.20319722314280234, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 23299 + }, + { + "epoch": 0.2032059444279709, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 23300 + }, + { + "epoch": 0.20321466571313948, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 23301 + }, + { + "epoch": 0.20322338699830808, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 23302 + }, + { + "epoch": 0.20323210828347665, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 23303 + }, + { + "epoch": 0.20324082956864523, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 23304 + }, + { + "epoch": 0.20324955085381383, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 23305 + }, + { + "epoch": 0.2032582721389824, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 23306 + }, + { + "epoch": 0.20326699342415097, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 23307 + }, + { + "epoch": 0.20327571470931957, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 23308 + }, + { + "epoch": 0.20328443599448814, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 23309 + }, + { + "epoch": 0.20329315727965672, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 23310 + }, + { + "epoch": 0.20330187856482532, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 23311 + }, + { + "epoch": 0.2033105998499939, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 23312 + }, + { + "epoch": 0.2033193211351625, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 23313 + }, + { + "epoch": 0.20332804242033106, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 23314 + }, + { + "epoch": 0.20333676370549963, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 23315 + }, + { + "epoch": 0.20334548499066823, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 23316 + }, + { + "epoch": 0.2033542062758368, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 23317 + }, + { + "epoch": 0.20336292756100538, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 23318 + }, + { + "epoch": 0.20337164884617398, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 23319 + }, + { + "epoch": 0.20338037013134255, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 23320 + }, + { + "epoch": 0.20338909141651113, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 23321 + }, + { + "epoch": 0.20339781270167973, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 23322 + }, + { + "epoch": 0.2034065339868483, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 23323 + }, + { + "epoch": 0.2034152552720169, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 23324 + }, + { + "epoch": 0.20342397655718547, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 23325 + }, + { + "epoch": 0.20343269784235404, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 23326 + }, + { + "epoch": 0.20344141912752264, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 23327 + }, + { + "epoch": 0.20345014041269122, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 23328 + }, + { + "epoch": 0.2034588616978598, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 23329 + }, + { + "epoch": 0.2034675829830284, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 23330 + }, + { + "epoch": 0.20347630426819696, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 23331 + }, + { + "epoch": 0.20348502555336553, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 23332 + }, + { + "epoch": 0.20349374683853413, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 23333 + }, + { + "epoch": 0.2035024681237027, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 23334 + }, + { + "epoch": 0.20351118940887128, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 23335 + }, + { + "epoch": 0.20351991069403988, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 23336 + }, + { + "epoch": 0.20352863197920845, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 23337 + }, + { + "epoch": 0.20353735326437705, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 23338 + }, + { + "epoch": 0.20354607454954562, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 23339 + }, + { + "epoch": 0.2035547958347142, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 23340 + }, + { + "epoch": 0.2035635171198828, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 23341 + }, + { + "epoch": 0.20357223840505137, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 23342 + }, + { + "epoch": 0.20358095969021994, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 23343 + }, + { + "epoch": 0.20358968097538854, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 23344 + }, + { + "epoch": 0.20359840226055712, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 23345 + }, + { + "epoch": 0.2036071235457257, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 23346 + }, + { + "epoch": 0.2036158448308943, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 23347 + }, + { + "epoch": 0.20362456611606286, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 23348 + }, + { + "epoch": 0.20363328740123143, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 23349 + }, + { + "epoch": 0.20364200868640003, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 23350 + }, + { + "epoch": 0.2036507299715686, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 23351 + }, + { + "epoch": 0.2036594512567372, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 23352 + }, + { + "epoch": 0.20366817254190578, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 23353 + }, + { + "epoch": 0.20367689382707435, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 23354 + }, + { + "epoch": 0.20368561511224295, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 23355 + }, + { + "epoch": 0.20369433639741152, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 23356 + }, + { + "epoch": 0.2037030576825801, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 23357 + }, + { + "epoch": 0.2037117789677487, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 23358 + }, + { + "epoch": 0.20372050025291727, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 23359 + }, + { + "epoch": 0.20372922153808584, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 23360 + }, + { + "epoch": 0.20373794282325444, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 23361 + }, + { + "epoch": 0.20374666410842301, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 23362 + }, + { + "epoch": 0.2037553853935916, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 23363 + }, + { + "epoch": 0.2037641066787602, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 23364 + }, + { + "epoch": 0.20377282796392876, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 23365 + }, + { + "epoch": 0.20378154924909736, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 23366 + }, + { + "epoch": 0.20379027053426593, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 23367 + }, + { + "epoch": 0.2037989918194345, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 23368 + }, + { + "epoch": 0.2038077131046031, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 23369 + }, + { + "epoch": 0.20381643438977168, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 23370 + }, + { + "epoch": 0.20382515567494025, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 23371 + }, + { + "epoch": 0.20383387696010885, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 23372 + }, + { + "epoch": 0.20384259824527742, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 23373 + }, + { + "epoch": 0.203851319530446, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 23374 + }, + { + "epoch": 0.2038600408156146, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 23375 + }, + { + "epoch": 0.20386876210078317, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 23376 + }, + { + "epoch": 0.20387748338595174, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 23377 + }, + { + "epoch": 0.20388620467112034, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 23378 + }, + { + "epoch": 0.2038949259562889, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 23379 + }, + { + "epoch": 0.20390364724145751, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 23380 + }, + { + "epoch": 0.2039123685266261, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 23381 + }, + { + "epoch": 0.20392108981179466, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 23382 + }, + { + "epoch": 0.20392981109696326, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 23383 + }, + { + "epoch": 0.20393853238213183, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 23384 + }, + { + "epoch": 0.2039472536673004, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 23385 + }, + { + "epoch": 0.203955974952469, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 23386 + }, + { + "epoch": 0.20396469623763758, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 23387 + }, + { + "epoch": 0.20397341752280615, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 23388 + }, + { + "epoch": 0.20398213880797475, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 23389 + }, + { + "epoch": 0.20399086009314332, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 23390 + }, + { + "epoch": 0.2039995813783119, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 23391 + }, + { + "epoch": 0.2040083026634805, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 23392 + }, + { + "epoch": 0.20401702394864907, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 23393 + }, + { + "epoch": 0.20402574523381767, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 23394 + }, + { + "epoch": 0.20403446651898624, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 23395 + }, + { + "epoch": 0.2040431878041548, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 23396 + }, + { + "epoch": 0.2040519090893234, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 23397 + }, + { + "epoch": 0.20406063037449199, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 23398 + }, + { + "epoch": 0.20406935165966056, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 23399 + }, + { + "epoch": 0.20407807294482916, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 23400 + }, + { + "epoch": 0.20408679422999773, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 23401 + }, + { + "epoch": 0.2040955155151663, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 23402 + }, + { + "epoch": 0.2041042368003349, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 23403 + }, + { + "epoch": 0.20411295808550348, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 23404 + }, + { + "epoch": 0.20412167937067205, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 23405 + }, + { + "epoch": 0.20413040065584065, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 23406 + }, + { + "epoch": 0.20413912194100922, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 23407 + }, + { + "epoch": 0.20414784322617782, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 23408 + }, + { + "epoch": 0.2041565645113464, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 23409 + }, + { + "epoch": 0.20416528579651497, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 23410 + }, + { + "epoch": 0.20417400708168357, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 23411 + }, + { + "epoch": 0.20418272836685214, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 23412 + }, + { + "epoch": 0.2041914496520207, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 23413 + }, + { + "epoch": 0.2042001709371893, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 23414 + }, + { + "epoch": 0.20420889222235789, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 23415 + }, + { + "epoch": 0.20421761350752646, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 23416 + }, + { + "epoch": 0.20422633479269506, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 23417 + }, + { + "epoch": 0.20423505607786363, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 23418 + }, + { + "epoch": 0.2042437773630322, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 23419 + }, + { + "epoch": 0.2042524986482008, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 23420 + }, + { + "epoch": 0.20426121993336938, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 23421 + }, + { + "epoch": 0.20426994121853798, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 23422 + }, + { + "epoch": 0.20427866250370655, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 23423 + }, + { + "epoch": 0.20428738378887512, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 23424 + }, + { + "epoch": 0.20429610507404372, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 23425 + }, + { + "epoch": 0.2043048263592123, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 23426 + }, + { + "epoch": 0.20431354764438087, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 23427 + }, + { + "epoch": 0.20432226892954947, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 23428 + }, + { + "epoch": 0.20433099021471804, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 23429 + }, + { + "epoch": 0.2043397114998866, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 23430 + }, + { + "epoch": 0.2043484327850552, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 23431 + }, + { + "epoch": 0.20435715407022378, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 23432 + }, + { + "epoch": 0.20436587535539236, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 23433 + }, + { + "epoch": 0.20437459664056096, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 23434 + }, + { + "epoch": 0.20438331792572953, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 23435 + }, + { + "epoch": 0.20439203921089813, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 23436 + }, + { + "epoch": 0.2044007604960667, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 23437 + }, + { + "epoch": 0.20440948178123527, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 23438 + }, + { + "epoch": 0.20441820306640388, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 23439 + }, + { + "epoch": 0.20442692435157245, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 23440 + }, + { + "epoch": 0.20443564563674102, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 23441 + }, + { + "epoch": 0.20444436692190962, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 23442 + }, + { + "epoch": 0.2044530882070782, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 23443 + }, + { + "epoch": 0.20446180949224677, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 23444 + }, + { + "epoch": 0.20447053077741537, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 23445 + }, + { + "epoch": 0.20447925206258394, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 23446 + }, + { + "epoch": 0.20448797334775254, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 23447 + }, + { + "epoch": 0.2044966946329211, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 23448 + }, + { + "epoch": 0.20450541591808968, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 23449 + }, + { + "epoch": 0.20451413720325828, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 23450 + }, + { + "epoch": 0.20452285848842686, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 23451 + }, + { + "epoch": 0.20453157977359543, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 23452 + }, + { + "epoch": 0.20454030105876403, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 23453 + }, + { + "epoch": 0.2045490223439326, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 23454 + }, + { + "epoch": 0.20455774362910117, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 23455 + }, + { + "epoch": 0.20456646491426977, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 23456 + }, + { + "epoch": 0.20457518619943835, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 23457 + }, + { + "epoch": 0.20458390748460692, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 23458 + }, + { + "epoch": 0.20459262876977552, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 23459 + }, + { + "epoch": 0.2046013500549441, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 23460 + }, + { + "epoch": 0.2046100713401127, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 23461 + }, + { + "epoch": 0.20461879262528126, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 23462 + }, + { + "epoch": 0.20462751391044984, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 23463 + }, + { + "epoch": 0.20463623519561844, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 23464 + }, + { + "epoch": 0.204644956480787, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 23465 + }, + { + "epoch": 0.20465367776595558, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 23466 + }, + { + "epoch": 0.20466239905112418, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 23467 + }, + { + "epoch": 0.20467112033629276, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 23468 + }, + { + "epoch": 0.20467984162146133, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 23469 + }, + { + "epoch": 0.20468856290662993, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 23470 + }, + { + "epoch": 0.2046972841917985, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 23471 + }, + { + "epoch": 0.20470600547696707, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 23472 + }, + { + "epoch": 0.20471472676213567, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 23473 + }, + { + "epoch": 0.20472344804730425, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 23474 + }, + { + "epoch": 0.20473216933247285, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 23475 + }, + { + "epoch": 0.20474089061764142, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 23476 + }, + { + "epoch": 0.20474961190281, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 23477 + }, + { + "epoch": 0.2047583331879786, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 23478 + }, + { + "epoch": 0.20476705447314716, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 23479 + }, + { + "epoch": 0.20477577575831574, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 23480 + }, + { + "epoch": 0.20478449704348434, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 23481 + }, + { + "epoch": 0.2047932183286529, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 23482 + }, + { + "epoch": 0.20480193961382148, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 23483 + }, + { + "epoch": 0.20481066089899008, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 23484 + }, + { + "epoch": 0.20481938218415865, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 23485 + }, + { + "epoch": 0.20482810346932723, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 23486 + }, + { + "epoch": 0.20483682475449583, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 23487 + }, + { + "epoch": 0.2048455460396644, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 23488 + }, + { + "epoch": 0.204854267324833, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 23489 + }, + { + "epoch": 0.20486298861000157, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 23490 + }, + { + "epoch": 0.20487170989517015, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 23491 + }, + { + "epoch": 0.20488043118033875, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 23492 + }, + { + "epoch": 0.20488915246550732, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 23493 + }, + { + "epoch": 0.2048978737506759, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 23494 + }, + { + "epoch": 0.2049065950358445, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 23495 + }, + { + "epoch": 0.20491531632101306, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 23496 + }, + { + "epoch": 0.20492403760618164, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 23497 + }, + { + "epoch": 0.20493275889135024, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 23498 + }, + { + "epoch": 0.2049414801765188, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 23499 + }, + { + "epoch": 0.20495020146168738, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 23500 + }, + { + "epoch": 0.20495892274685598, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 23501 + }, + { + "epoch": 0.20496764403202455, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 23502 + }, + { + "epoch": 0.20497636531719315, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 23503 + }, + { + "epoch": 0.20498508660236173, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 23504 + }, + { + "epoch": 0.2049938078875303, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 23505 + }, + { + "epoch": 0.2050025291726989, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 23506 + }, + { + "epoch": 0.20501125045786747, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 23507 + }, + { + "epoch": 0.20501997174303604, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 23508 + }, + { + "epoch": 0.20502869302820464, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 23509 + }, + { + "epoch": 0.20503741431337322, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 23510 + }, + { + "epoch": 0.2050461355985418, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 23511 + }, + { + "epoch": 0.2050548568837104, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 23512 + }, + { + "epoch": 0.20506357816887896, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 23513 + }, + { + "epoch": 0.20507229945404754, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 23514 + }, + { + "epoch": 0.20508102073921614, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 23515 + }, + { + "epoch": 0.2050897420243847, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 23516 + }, + { + "epoch": 0.2050984633095533, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 23517 + }, + { + "epoch": 0.20510718459472188, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 23518 + }, + { + "epoch": 0.20511590587989045, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 23519 + }, + { + "epoch": 0.20512462716505905, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 23520 + }, + { + "epoch": 0.20513334845022763, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 23521 + }, + { + "epoch": 0.2051420697353962, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 23522 + }, + { + "epoch": 0.2051507910205648, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 23523 + }, + { + "epoch": 0.20515951230573337, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 23524 + }, + { + "epoch": 0.20516823359090194, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 23525 + }, + { + "epoch": 0.20517695487607054, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 23526 + }, + { + "epoch": 0.20518567616123912, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 23527 + }, + { + "epoch": 0.2051943974464077, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 23528 + }, + { + "epoch": 0.2052031187315763, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 23529 + }, + { + "epoch": 0.20521184001674486, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 23530 + }, + { + "epoch": 0.20522056130191346, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 23531 + }, + { + "epoch": 0.20522928258708203, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 23532 + }, + { + "epoch": 0.2052380038722506, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 23533 + }, + { + "epoch": 0.2052467251574192, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 23534 + }, + { + "epoch": 0.20525544644258778, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 23535 + }, + { + "epoch": 0.20526416772775635, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 23536 + }, + { + "epoch": 0.20527288901292495, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 23537 + }, + { + "epoch": 0.20528161029809353, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 23538 + }, + { + "epoch": 0.2052903315832621, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 23539 + }, + { + "epoch": 0.2052990528684307, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 23540 + }, + { + "epoch": 0.20530777415359927, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 23541 + }, + { + "epoch": 0.20531649543876784, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 23542 + }, + { + "epoch": 0.20532521672393644, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 23543 + }, + { + "epoch": 0.20533393800910502, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 23544 + }, + { + "epoch": 0.20534265929427362, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 23545 + }, + { + "epoch": 0.2053513805794422, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 23546 + }, + { + "epoch": 0.20536010186461076, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 23547 + }, + { + "epoch": 0.20536882314977936, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 23548 + }, + { + "epoch": 0.20537754443494793, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 23549 + }, + { + "epoch": 0.2053862657201165, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 23550 + }, + { + "epoch": 0.2053949870052851, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 23551 + }, + { + "epoch": 0.20540370829045368, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 23552 + }, + { + "epoch": 0.20541242957562225, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 23553 + }, + { + "epoch": 0.20542115086079085, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 23554 + }, + { + "epoch": 0.20542987214595942, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 23555 + }, + { + "epoch": 0.20543859343112802, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 23556 + }, + { + "epoch": 0.2054473147162966, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 23557 + }, + { + "epoch": 0.20545603600146517, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 23558 + }, + { + "epoch": 0.20546475728663377, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 23559 + }, + { + "epoch": 0.20547347857180234, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 23560 + }, + { + "epoch": 0.20548219985697092, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 23561 + }, + { + "epoch": 0.20549092114213952, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 23562 + }, + { + "epoch": 0.2054996424273081, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 23563 + }, + { + "epoch": 0.20550836371247666, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 23564 + }, + { + "epoch": 0.20551708499764526, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 23565 + }, + { + "epoch": 0.20552580628281383, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 23566 + }, + { + "epoch": 0.2055345275679824, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 23567 + }, + { + "epoch": 0.205543248853151, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 23568 + }, + { + "epoch": 0.20555197013831958, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 23569 + }, + { + "epoch": 0.20556069142348818, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 23570 + }, + { + "epoch": 0.20556941270865675, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 23571 + }, + { + "epoch": 0.20557813399382532, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 23572 + }, + { + "epoch": 0.20558685527899392, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 23573 + }, + { + "epoch": 0.2055955765641625, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 23574 + }, + { + "epoch": 0.20560429784933107, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 23575 + }, + { + "epoch": 0.20561301913449967, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 23576 + }, + { + "epoch": 0.20562174041966824, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 23577 + }, + { + "epoch": 0.20563046170483681, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 23578 + }, + { + "epoch": 0.20563918299000541, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 23579 + }, + { + "epoch": 0.205647904275174, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 23580 + }, + { + "epoch": 0.20565662556034256, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 23581 + }, + { + "epoch": 0.20566534684551116, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 23582 + }, + { + "epoch": 0.20567406813067973, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 23583 + }, + { + "epoch": 0.20568278941584833, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 23584 + }, + { + "epoch": 0.2056915107010169, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 23585 + }, + { + "epoch": 0.20570023198618548, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 23586 + }, + { + "epoch": 0.20570895327135408, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 23587 + }, + { + "epoch": 0.20571767455652265, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 23588 + }, + { + "epoch": 0.20572639584169122, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 23589 + }, + { + "epoch": 0.20573511712685982, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 23590 + }, + { + "epoch": 0.2057438384120284, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 23591 + }, + { + "epoch": 0.20575255969719697, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 23592 + }, + { + "epoch": 0.20576128098236557, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 23593 + }, + { + "epoch": 0.20577000226753414, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 23594 + }, + { + "epoch": 0.2057787235527027, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 23595 + }, + { + "epoch": 0.20578744483787131, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 23596 + }, + { + "epoch": 0.2057961661230399, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 23597 + }, + { + "epoch": 0.2058048874082085, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 23598 + }, + { + "epoch": 0.20581360869337706, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 23599 + }, + { + "epoch": 0.20582232997854563, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 23600 + }, + { + "epoch": 0.20583105126371423, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 23601 + }, + { + "epoch": 0.2058397725488828, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 23602 + }, + { + "epoch": 0.20584849383405138, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 23603 + }, + { + "epoch": 0.20585721511921998, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 23604 + }, + { + "epoch": 0.20586593640438855, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 23605 + }, + { + "epoch": 0.20587465768955712, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 23606 + }, + { + "epoch": 0.20588337897472572, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 23607 + }, + { + "epoch": 0.2058921002598943, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 23608 + }, + { + "epoch": 0.20590082154506287, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 23609 + }, + { + "epoch": 0.20590954283023147, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 23610 + }, + { + "epoch": 0.20591826411540004, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 23611 + }, + { + "epoch": 0.20592698540056864, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 23612 + }, + { + "epoch": 0.2059357066857372, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 23613 + }, + { + "epoch": 0.20594442797090579, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 23614 + }, + { + "epoch": 0.20595314925607439, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 23615 + }, + { + "epoch": 0.20596187054124296, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 23616 + }, + { + "epoch": 0.20597059182641153, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 23617 + }, + { + "epoch": 0.20597931311158013, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 23618 + }, + { + "epoch": 0.2059880343967487, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 23619 + }, + { + "epoch": 0.20599675568191728, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 23620 + }, + { + "epoch": 0.20600547696708588, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 23621 + }, + { + "epoch": 0.20601419825225445, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 23622 + }, + { + "epoch": 0.20602291953742302, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 23623 + }, + { + "epoch": 0.20603164082259162, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 23624 + }, + { + "epoch": 0.2060403621077602, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 23625 + }, + { + "epoch": 0.2060490833929288, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 23626 + }, + { + "epoch": 0.20605780467809737, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 23627 + }, + { + "epoch": 0.20606652596326594, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 23628 + }, + { + "epoch": 0.20607524724843454, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 23629 + }, + { + "epoch": 0.2060839685336031, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 23630 + }, + { + "epoch": 0.20609268981877168, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 23631 + }, + { + "epoch": 0.20610141110394029, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 23632 + }, + { + "epoch": 0.20611013238910886, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 23633 + }, + { + "epoch": 0.20611885367427743, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 23634 + }, + { + "epoch": 0.20612757495944603, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 23635 + }, + { + "epoch": 0.2061362962446146, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 23636 + }, + { + "epoch": 0.20614501752978318, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 23637 + }, + { + "epoch": 0.20615373881495178, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 23638 + }, + { + "epoch": 0.20616246010012035, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 23639 + }, + { + "epoch": 0.20617118138528895, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 23640 + }, + { + "epoch": 0.20617990267045752, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 23641 + }, + { + "epoch": 0.2061886239556261, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 23642 + }, + { + "epoch": 0.2061973452407947, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 23643 + }, + { + "epoch": 0.20620606652596327, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 23644 + }, + { + "epoch": 0.20621478781113184, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 23645 + }, + { + "epoch": 0.20622350909630044, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 23646 + }, + { + "epoch": 0.206232230381469, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 23647 + }, + { + "epoch": 0.20624095166663758, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 23648 + }, + { + "epoch": 0.20624967295180618, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 23649 + }, + { + "epoch": 0.20625839423697476, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 23650 + }, + { + "epoch": 0.20626711552214333, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 23651 + }, + { + "epoch": 0.20627583680731193, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 23652 + }, + { + "epoch": 0.2062845580924805, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 23653 + }, + { + "epoch": 0.2062932793776491, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 23654 + }, + { + "epoch": 0.20630200066281768, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 23655 + }, + { + "epoch": 0.20631072194798625, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 23656 + }, + { + "epoch": 0.20631944323315485, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 23657 + }, + { + "epoch": 0.20632816451832342, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 23658 + }, + { + "epoch": 0.206336885803492, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 23659 + }, + { + "epoch": 0.2063456070886606, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 23660 + }, + { + "epoch": 0.20635432837382917, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 23661 + }, + { + "epoch": 0.20636304965899774, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 23662 + }, + { + "epoch": 0.20637177094416634, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 23663 + }, + { + "epoch": 0.2063804922293349, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 23664 + }, + { + "epoch": 0.20638921351450348, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 23665 + }, + { + "epoch": 0.20639793479967208, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 23666 + }, + { + "epoch": 0.20640665608484066, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 23667 + }, + { + "epoch": 0.20641537737000926, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 23668 + }, + { + "epoch": 0.20642409865517783, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 23669 + }, + { + "epoch": 0.2064328199403464, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 23670 + }, + { + "epoch": 0.206441541225515, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 23671 + }, + { + "epoch": 0.20645026251068357, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 23672 + }, + { + "epoch": 0.20645898379585215, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 23673 + }, + { + "epoch": 0.20646770508102075, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 23674 + }, + { + "epoch": 0.20647642636618932, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 23675 + }, + { + "epoch": 0.2064851476513579, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 23676 + }, + { + "epoch": 0.2064938689365265, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 23677 + }, + { + "epoch": 0.20650259022169506, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 23678 + }, + { + "epoch": 0.20651131150686367, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 23679 + }, + { + "epoch": 0.20652003279203224, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 23680 + }, + { + "epoch": 0.2065287540772008, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 23681 + }, + { + "epoch": 0.2065374753623694, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 23682 + }, + { + "epoch": 0.20654619664753798, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 23683 + }, + { + "epoch": 0.20655491793270656, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 23684 + }, + { + "epoch": 0.20656363921787516, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 23685 + }, + { + "epoch": 0.20657236050304373, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 23686 + }, + { + "epoch": 0.2065810817882123, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 23687 + }, + { + "epoch": 0.2065898030733809, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 23688 + }, + { + "epoch": 0.20659852435854947, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 23689 + }, + { + "epoch": 0.20660724564371805, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 23690 + }, + { + "epoch": 0.20661596692888665, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 23691 + }, + { + "epoch": 0.20662468821405522, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 23692 + }, + { + "epoch": 0.20663340949922382, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 23693 + }, + { + "epoch": 0.2066421307843924, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 23694 + }, + { + "epoch": 0.20665085206956096, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 23695 + }, + { + "epoch": 0.20665957335472956, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 23696 + }, + { + "epoch": 0.20666829463989814, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 23697 + }, + { + "epoch": 0.2066770159250667, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 23698 + }, + { + "epoch": 0.2066857372102353, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 23699 + }, + { + "epoch": 0.20669445849540388, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 23700 + }, + { + "epoch": 0.20670317978057245, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 23701 + }, + { + "epoch": 0.20671190106574105, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 23702 + }, + { + "epoch": 0.20672062235090963, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 23703 + }, + { + "epoch": 0.2067293436360782, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 23704 + }, + { + "epoch": 0.2067380649212468, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 23705 + }, + { + "epoch": 0.20674678620641537, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 23706 + }, + { + "epoch": 0.20675550749158397, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 23707 + }, + { + "epoch": 0.20676422877675255, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 23708 + }, + { + "epoch": 0.20677295006192112, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 23709 + }, + { + "epoch": 0.20678167134708972, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 23710 + }, + { + "epoch": 0.2067903926322583, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 23711 + }, + { + "epoch": 0.20679911391742686, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 23712 + }, + { + "epoch": 0.20680783520259546, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 23713 + }, + { + "epoch": 0.20681655648776404, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9829, + "step": 23714 + }, + { + "epoch": 0.2068252777729326, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 23715 + }, + { + "epoch": 0.2068339990581012, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 23716 + }, + { + "epoch": 0.20684272034326978, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 23717 + }, + { + "epoch": 0.20685144162843835, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 23718 + }, + { + "epoch": 0.20686016291360695, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 23719 + }, + { + "epoch": 0.20686888419877553, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 23720 + }, + { + "epoch": 0.20687760548394413, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 23721 + }, + { + "epoch": 0.2068863267691127, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 23722 + }, + { + "epoch": 0.20689504805428127, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 23723 + }, + { + "epoch": 0.20690376933944987, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 23724 + }, + { + "epoch": 0.20691249062461844, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 23725 + }, + { + "epoch": 0.20692121190978702, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 23726 + }, + { + "epoch": 0.20692993319495562, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 23727 + }, + { + "epoch": 0.2069386544801242, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 23728 + }, + { + "epoch": 0.20694737576529276, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 23729 + }, + { + "epoch": 0.20695609705046136, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 23730 + }, + { + "epoch": 0.20696481833562994, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 23731 + }, + { + "epoch": 0.2069735396207985, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 23732 + }, + { + "epoch": 0.2069822609059671, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 23733 + }, + { + "epoch": 0.20699098219113568, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 23734 + }, + { + "epoch": 0.20699970347630428, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 23735 + }, + { + "epoch": 0.20700842476147285, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 23736 + }, + { + "epoch": 0.20701714604664143, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 23737 + }, + { + "epoch": 0.20702586733181003, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 23738 + }, + { + "epoch": 0.2070345886169786, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 23739 + }, + { + "epoch": 0.20704330990214717, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 23740 + }, + { + "epoch": 0.20705203118731577, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 23741 + }, + { + "epoch": 0.20706075247248434, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 23742 + }, + { + "epoch": 0.20706947375765292, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 23743 + }, + { + "epoch": 0.20707819504282152, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 23744 + }, + { + "epoch": 0.2070869163279901, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 23745 + }, + { + "epoch": 0.20709563761315866, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 23746 + }, + { + "epoch": 0.20710435889832726, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 23747 + }, + { + "epoch": 0.20711308018349583, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 23748 + }, + { + "epoch": 0.20712180146866443, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 23749 + }, + { + "epoch": 0.207130522753833, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 23750 + }, + { + "epoch": 0.20713924403900158, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 23751 + }, + { + "epoch": 0.20714796532417018, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 23752 + }, + { + "epoch": 0.20715668660933875, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 23753 + }, + { + "epoch": 0.20716540789450733, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 23754 + }, + { + "epoch": 0.20717412917967593, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 23755 + }, + { + "epoch": 0.2071828504648445, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 23756 + }, + { + "epoch": 0.20719157175001307, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 23757 + }, + { + "epoch": 0.20720029303518167, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 23758 + }, + { + "epoch": 0.20720901432035024, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 23759 + }, + { + "epoch": 0.20721773560551882, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 23760 + }, + { + "epoch": 0.20722645689068742, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 23761 + }, + { + "epoch": 0.207235178175856, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 23762 + }, + { + "epoch": 0.2072438994610246, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 23763 + }, + { + "epoch": 0.20725262074619316, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 23764 + }, + { + "epoch": 0.20726134203136173, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 23765 + }, + { + "epoch": 0.20727006331653033, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 23766 + }, + { + "epoch": 0.2072787846016989, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 23767 + }, + { + "epoch": 0.20728750588686748, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 23768 + }, + { + "epoch": 0.20729622717203608, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 23769 + }, + { + "epoch": 0.20730494845720465, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 23770 + }, + { + "epoch": 0.20731366974237322, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 23771 + }, + { + "epoch": 0.20732239102754182, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 23772 + }, + { + "epoch": 0.2073311123127104, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 23773 + }, + { + "epoch": 0.20733983359787897, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 23774 + }, + { + "epoch": 0.20734855488304757, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 23775 + }, + { + "epoch": 0.20735727616821614, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 23776 + }, + { + "epoch": 0.20736599745338474, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 23777 + }, + { + "epoch": 0.20737471873855332, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 23778 + }, + { + "epoch": 0.2073834400237219, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 23779 + }, + { + "epoch": 0.2073921613088905, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 23780 + }, + { + "epoch": 0.20740088259405906, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 23781 + }, + { + "epoch": 0.20740960387922763, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 23782 + }, + { + "epoch": 0.20741832516439623, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 23783 + }, + { + "epoch": 0.2074270464495648, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 23784 + }, + { + "epoch": 0.20743576773473338, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 23785 + }, + { + "epoch": 0.20744448901990198, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 23786 + }, + { + "epoch": 0.20745321030507055, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 23787 + }, + { + "epoch": 0.20746193159023915, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 23788 + }, + { + "epoch": 0.20747065287540772, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 23789 + }, + { + "epoch": 0.2074793741605763, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 23790 + }, + { + "epoch": 0.2074880954457449, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 23791 + }, + { + "epoch": 0.20749681673091347, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 23792 + }, + { + "epoch": 0.20750553801608204, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 23793 + }, + { + "epoch": 0.20751425930125064, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 23794 + }, + { + "epoch": 0.20752298058641921, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 23795 + }, + { + "epoch": 0.2075317018715878, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 23796 + }, + { + "epoch": 0.2075404231567564, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 23797 + }, + { + "epoch": 0.20754914444192496, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 23798 + }, + { + "epoch": 0.20755786572709353, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 23799 + }, + { + "epoch": 0.20756658701226213, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 23800 + }, + { + "epoch": 0.2075753082974307, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 23801 + }, + { + "epoch": 0.2075840295825993, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 23802 + }, + { + "epoch": 0.20759275086776788, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 23803 + }, + { + "epoch": 0.20760147215293645, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 23804 + }, + { + "epoch": 0.20761019343810505, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 23805 + }, + { + "epoch": 0.20761891472327362, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 23806 + }, + { + "epoch": 0.2076276360084422, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 23807 + }, + { + "epoch": 0.2076363572936108, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 23808 + }, + { + "epoch": 0.20764507857877937, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 23809 + }, + { + "epoch": 0.20765379986394794, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 23810 + }, + { + "epoch": 0.20766252114911654, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 23811 + }, + { + "epoch": 0.2076712424342851, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 23812 + }, + { + "epoch": 0.2076799637194537, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 23813 + }, + { + "epoch": 0.2076886850046223, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 23814 + }, + { + "epoch": 0.20769740628979086, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 23815 + }, + { + "epoch": 0.20770612757495946, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 23816 + }, + { + "epoch": 0.20771484886012803, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 23817 + }, + { + "epoch": 0.2077235701452966, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 23818 + }, + { + "epoch": 0.2077322914304652, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 23819 + }, + { + "epoch": 0.20774101271563378, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 23820 + }, + { + "epoch": 0.20774973400080235, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 23821 + }, + { + "epoch": 0.20775845528597095, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 23822 + }, + { + "epoch": 0.20776717657113952, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 23823 + }, + { + "epoch": 0.2077758978563081, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 23824 + }, + { + "epoch": 0.2077846191414767, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 23825 + }, + { + "epoch": 0.20779334042664527, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 23826 + }, + { + "epoch": 0.20780206171181384, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 23827 + }, + { + "epoch": 0.20781078299698244, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 23828 + }, + { + "epoch": 0.207819504282151, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 23829 + }, + { + "epoch": 0.2078282255673196, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 23830 + }, + { + "epoch": 0.20783694685248819, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 23831 + }, + { + "epoch": 0.20784566813765676, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 23832 + }, + { + "epoch": 0.20785438942282536, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 23833 + }, + { + "epoch": 0.20786311070799393, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 23834 + }, + { + "epoch": 0.2078718319931625, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 23835 + }, + { + "epoch": 0.2078805532783311, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 23836 + }, + { + "epoch": 0.20788927456349968, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 23837 + }, + { + "epoch": 0.20789799584866825, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 23838 + }, + { + "epoch": 0.20790671713383685, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 23839 + }, + { + "epoch": 0.20791543841900542, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 23840 + }, + { + "epoch": 0.207924159704174, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 23841 + }, + { + "epoch": 0.2079328809893426, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 23842 + }, + { + "epoch": 0.20794160227451117, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 23843 + }, + { + "epoch": 0.20795032355967977, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 23844 + }, + { + "epoch": 0.20795904484484834, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 23845 + }, + { + "epoch": 0.2079677661300169, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 23846 + }, + { + "epoch": 0.2079764874151855, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 23847 + }, + { + "epoch": 0.20798520870035409, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 23848 + }, + { + "epoch": 0.20799392998552266, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 23849 + }, + { + "epoch": 0.20800265127069126, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 23850 + }, + { + "epoch": 0.20801137255585983, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 23851 + }, + { + "epoch": 0.2080200938410284, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 23852 + }, + { + "epoch": 0.208028815126197, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 23853 + }, + { + "epoch": 0.20803753641136558, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 23854 + }, + { + "epoch": 0.20804625769653415, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 23855 + }, + { + "epoch": 0.20805497898170275, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 23856 + }, + { + "epoch": 0.20806370026687132, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 23857 + }, + { + "epoch": 0.20807242155203992, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 23858 + }, + { + "epoch": 0.2080811428372085, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 23859 + }, + { + "epoch": 0.20808986412237707, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 23860 + }, + { + "epoch": 0.20809858540754567, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 23861 + }, + { + "epoch": 0.20810730669271424, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 23862 + }, + { + "epoch": 0.2081160279778828, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 23863 + }, + { + "epoch": 0.2081247492630514, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 23864 + }, + { + "epoch": 0.20813347054821998, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 23865 + }, + { + "epoch": 0.20814219183338856, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 23866 + }, + { + "epoch": 0.20815091311855716, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 23867 + }, + { + "epoch": 0.20815963440372573, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 23868 + }, + { + "epoch": 0.2081683556888943, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 23869 + }, + { + "epoch": 0.2081770769740629, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 23870 + }, + { + "epoch": 0.20818579825923147, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 23871 + }, + { + "epoch": 0.20819451954440008, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 23872 + }, + { + "epoch": 0.20820324082956865, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 23873 + }, + { + "epoch": 0.20821196211473722, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 23874 + }, + { + "epoch": 0.20822068339990582, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 23875 + }, + { + "epoch": 0.2082294046850744, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 23876 + }, + { + "epoch": 0.20823812597024297, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 23877 + }, + { + "epoch": 0.20824684725541157, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 23878 + }, + { + "epoch": 0.20825556854058014, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 23879 + }, + { + "epoch": 0.2082642898257487, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 23880 + }, + { + "epoch": 0.2082730111109173, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 23881 + }, + { + "epoch": 0.20828173239608588, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 23882 + }, + { + "epoch": 0.20829045368125446, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 23883 + }, + { + "epoch": 0.20829917496642306, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 23884 + }, + { + "epoch": 0.20830789625159163, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 23885 + }, + { + "epoch": 0.20831661753676023, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 23886 + }, + { + "epoch": 0.2083253388219288, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 23887 + }, + { + "epoch": 0.20833406010709737, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 23888 + }, + { + "epoch": 0.20834278139226597, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 23889 + }, + { + "epoch": 0.20835150267743455, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 23890 + }, + { + "epoch": 0.20836022396260312, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 23891 + }, + { + "epoch": 0.20836894524777172, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 23892 + }, + { + "epoch": 0.2083776665329403, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 23893 + }, + { + "epoch": 0.20838638781810886, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 23894 + }, + { + "epoch": 0.20839510910327746, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 23895 + }, + { + "epoch": 0.20840383038844604, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 23896 + }, + { + "epoch": 0.2084125516736146, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 23897 + }, + { + "epoch": 0.2084212729587832, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 23898 + }, + { + "epoch": 0.20842999424395178, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 23899 + }, + { + "epoch": 0.20843871552912038, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 23900 + }, + { + "epoch": 0.20844743681428896, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 23901 + }, + { + "epoch": 0.20845615809945753, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 23902 + }, + { + "epoch": 0.20846487938462613, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 23903 + }, + { + "epoch": 0.2084736006697947, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 23904 + }, + { + "epoch": 0.20848232195496327, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 23905 + }, + { + "epoch": 0.20849104324013187, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 23906 + }, + { + "epoch": 0.20849976452530045, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 23907 + }, + { + "epoch": 0.20850848581046902, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 23908 + }, + { + "epoch": 0.20851720709563762, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 23909 + }, + { + "epoch": 0.2085259283808062, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 23910 + }, + { + "epoch": 0.2085346496659748, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 23911 + }, + { + "epoch": 0.20854337095114336, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 23912 + }, + { + "epoch": 0.20855209223631194, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 23913 + }, + { + "epoch": 0.20856081352148054, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 23914 + }, + { + "epoch": 0.2085695348066491, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 23915 + }, + { + "epoch": 0.20857825609181768, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 23916 + }, + { + "epoch": 0.20858697737698628, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 23917 + }, + { + "epoch": 0.20859569866215485, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 23918 + }, + { + "epoch": 0.20860441994732343, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 23919 + }, + { + "epoch": 0.20861314123249203, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 23920 + }, + { + "epoch": 0.2086218625176606, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 23921 + }, + { + "epoch": 0.20863058380282917, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 23922 + }, + { + "epoch": 0.20863930508799777, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 23923 + }, + { + "epoch": 0.20864802637316635, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 23924 + }, + { + "epoch": 0.20865674765833495, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 23925 + }, + { + "epoch": 0.20866546894350352, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 23926 + }, + { + "epoch": 0.2086741902286721, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 23927 + }, + { + "epoch": 0.2086829115138407, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 23928 + }, + { + "epoch": 0.20869163279900926, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 23929 + }, + { + "epoch": 0.20870035408417784, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 23930 + }, + { + "epoch": 0.20870907536934644, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 23931 + }, + { + "epoch": 0.208717796654515, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 23932 + }, + { + "epoch": 0.20872651793968358, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 23933 + }, + { + "epoch": 0.20873523922485218, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 23934 + }, + { + "epoch": 0.20874396051002075, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 23935 + }, + { + "epoch": 0.20875268179518933, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 23936 + }, + { + "epoch": 0.20876140308035793, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 23937 + }, + { + "epoch": 0.2087701243655265, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 23938 + }, + { + "epoch": 0.2087788456506951, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 23939 + }, + { + "epoch": 0.20878756693586367, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 23940 + }, + { + "epoch": 0.20879628822103224, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 23941 + }, + { + "epoch": 0.20880500950620084, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 23942 + }, + { + "epoch": 0.20881373079136942, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 23943 + }, + { + "epoch": 0.208822452076538, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 23944 + }, + { + "epoch": 0.2088311733617066, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 23945 + }, + { + "epoch": 0.20883989464687516, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 23946 + }, + { + "epoch": 0.20884861593204374, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 23947 + }, + { + "epoch": 0.20885733721721234, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 23948 + }, + { + "epoch": 0.2088660585023809, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 23949 + }, + { + "epoch": 0.20887477978754948, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 23950 + }, + { + "epoch": 0.20888350107271808, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 23951 + }, + { + "epoch": 0.20889222235788665, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 23952 + }, + { + "epoch": 0.20890094364305525, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 23953 + }, + { + "epoch": 0.20890966492822383, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 23954 + }, + { + "epoch": 0.2089183862133924, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 23955 + }, + { + "epoch": 0.208927107498561, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 23956 + }, + { + "epoch": 0.20893582878372957, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 23957 + }, + { + "epoch": 0.20894455006889814, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 23958 + }, + { + "epoch": 0.20895327135406674, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 23959 + }, + { + "epoch": 0.20896199263923532, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 23960 + }, + { + "epoch": 0.2089707139244039, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 23961 + }, + { + "epoch": 0.2089794352095725, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 23962 + }, + { + "epoch": 0.20898815649474106, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 23963 + }, + { + "epoch": 0.20899687777990963, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 23964 + }, + { + "epoch": 0.20900559906507823, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 23965 + }, + { + "epoch": 0.2090143203502468, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 23966 + }, + { + "epoch": 0.2090230416354154, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 23967 + }, + { + "epoch": 0.20903176292058398, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 23968 + }, + { + "epoch": 0.20904048420575255, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 23969 + }, + { + "epoch": 0.20904920549092115, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 23970 + }, + { + "epoch": 0.20905792677608973, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 23971 + }, + { + "epoch": 0.2090666480612583, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 23972 + }, + { + "epoch": 0.2090753693464269, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 23973 + }, + { + "epoch": 0.20908409063159547, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 23974 + }, + { + "epoch": 0.20909281191676404, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 23975 + }, + { + "epoch": 0.20910153320193264, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 23976 + }, + { + "epoch": 0.20911025448710122, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 23977 + }, + { + "epoch": 0.2091189757722698, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 23978 + }, + { + "epoch": 0.2091276970574384, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 23979 + }, + { + "epoch": 0.20913641834260696, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 23980 + }, + { + "epoch": 0.20914513962777556, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 23981 + }, + { + "epoch": 0.20915386091294413, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 23982 + }, + { + "epoch": 0.2091625821981127, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 23983 + }, + { + "epoch": 0.2091713034832813, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 23984 + }, + { + "epoch": 0.20918002476844988, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 23985 + }, + { + "epoch": 0.20918874605361845, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 23986 + }, + { + "epoch": 0.20919746733878705, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 23987 + }, + { + "epoch": 0.20920618862395562, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 23988 + }, + { + "epoch": 0.2092149099091242, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 23989 + }, + { + "epoch": 0.2092236311942928, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 23990 + }, + { + "epoch": 0.20923235247946137, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 23991 + }, + { + "epoch": 0.20924107376462994, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 23992 + }, + { + "epoch": 0.20924979504979854, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 23993 + }, + { + "epoch": 0.20925851633496712, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 23994 + }, + { + "epoch": 0.20926723762013572, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 23995 + }, + { + "epoch": 0.2092759589053043, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 23996 + }, + { + "epoch": 0.20928468019047286, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 23997 + }, + { + "epoch": 0.20929340147564146, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 23998 + }, + { + "epoch": 0.20930212276081003, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 23999 + }, + { + "epoch": 0.2093108440459786, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 24000 + }, + { + "epoch": 0.2093195653311472, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 24001 + }, + { + "epoch": 0.20932828661631578, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 24002 + }, + { + "epoch": 0.20933700790148435, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 24003 + }, + { + "epoch": 0.20934572918665295, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 24004 + }, + { + "epoch": 0.20935445047182152, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 24005 + }, + { + "epoch": 0.2093631717569901, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 24006 + }, + { + "epoch": 0.2093718930421587, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 24007 + }, + { + "epoch": 0.20938061432732727, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 24008 + }, + { + "epoch": 0.20938933561249587, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 24009 + }, + { + "epoch": 0.20939805689766444, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 24010 + }, + { + "epoch": 0.20940677818283301, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 24011 + }, + { + "epoch": 0.20941549946800161, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 24012 + }, + { + "epoch": 0.2094242207531702, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 24013 + }, + { + "epoch": 0.20943294203833876, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 24014 + }, + { + "epoch": 0.20944166332350736, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 24015 + }, + { + "epoch": 0.20945038460867593, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 24016 + }, + { + "epoch": 0.2094591058938445, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 24017 + }, + { + "epoch": 0.2094678271790131, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 24018 + }, + { + "epoch": 0.20947654846418168, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 24019 + }, + { + "epoch": 0.20948526974935028, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 24020 + }, + { + "epoch": 0.20949399103451885, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 24021 + }, + { + "epoch": 0.20950271231968742, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 24022 + }, + { + "epoch": 0.20951143360485602, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 24023 + }, + { + "epoch": 0.2095201548900246, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 24024 + }, + { + "epoch": 0.20952887617519317, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 24025 + }, + { + "epoch": 0.20953759746036177, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 24026 + }, + { + "epoch": 0.20954631874553034, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 24027 + }, + { + "epoch": 0.2095550400306989, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 24028 + }, + { + "epoch": 0.20956376131586751, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 24029 + }, + { + "epoch": 0.2095724826010361, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 24030 + }, + { + "epoch": 0.20958120388620466, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 24031 + }, + { + "epoch": 0.20958992517137326, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 24032 + }, + { + "epoch": 0.20959864645654183, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 24033 + }, + { + "epoch": 0.20960736774171043, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 24034 + }, + { + "epoch": 0.209616089026879, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 24035 + }, + { + "epoch": 0.20962481031204758, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 24036 + }, + { + "epoch": 0.20963353159721618, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 24037 + }, + { + "epoch": 0.20964225288238475, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 24038 + }, + { + "epoch": 0.20965097416755332, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 24039 + }, + { + "epoch": 0.20965969545272192, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 24040 + }, + { + "epoch": 0.2096684167378905, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 24041 + }, + { + "epoch": 0.20967713802305907, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 24042 + }, + { + "epoch": 0.20968585930822767, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 24043 + }, + { + "epoch": 0.20969458059339624, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 24044 + }, + { + "epoch": 0.2097033018785648, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 24045 + }, + { + "epoch": 0.2097120231637334, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 24046 + }, + { + "epoch": 0.20972074444890199, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 24047 + }, + { + "epoch": 0.20972946573407059, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 24048 + }, + { + "epoch": 0.20973818701923916, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 24049 + }, + { + "epoch": 0.20974690830440773, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 24050 + }, + { + "epoch": 0.20975562958957633, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 24051 + }, + { + "epoch": 0.2097643508747449, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 24052 + }, + { + "epoch": 0.20977307215991348, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 24053 + }, + { + "epoch": 0.20978179344508208, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 24054 + }, + { + "epoch": 0.20979051473025065, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 24055 + }, + { + "epoch": 0.20979923601541922, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 24056 + }, + { + "epoch": 0.20980795730058782, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 24057 + }, + { + "epoch": 0.2098166785857564, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 24058 + }, + { + "epoch": 0.20982539987092497, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 24059 + }, + { + "epoch": 0.20983412115609357, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 24060 + }, + { + "epoch": 0.20984284244126214, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 24061 + }, + { + "epoch": 0.20985156372643074, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 24062 + }, + { + "epoch": 0.2098602850115993, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 24063 + }, + { + "epoch": 0.20986900629676788, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 24064 + }, + { + "epoch": 0.20987772758193649, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 24065 + }, + { + "epoch": 0.20988644886710506, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 24066 + }, + { + "epoch": 0.20989517015227363, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 24067 + }, + { + "epoch": 0.20990389143744223, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 24068 + }, + { + "epoch": 0.2099126127226108, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 24069 + }, + { + "epoch": 0.20992133400777938, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 24070 + }, + { + "epoch": 0.20993005529294798, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 24071 + }, + { + "epoch": 0.20993877657811655, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 24072 + }, + { + "epoch": 0.20994749786328512, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 24073 + }, + { + "epoch": 0.20995621914845372, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 24074 + }, + { + "epoch": 0.2099649404336223, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 24075 + }, + { + "epoch": 0.2099736617187909, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 24076 + }, + { + "epoch": 0.20998238300395947, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 24077 + }, + { + "epoch": 0.20999110428912804, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 24078 + }, + { + "epoch": 0.20999982557429664, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 24079 + }, + { + "epoch": 0.2100085468594652, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 24080 + }, + { + "epoch": 0.21001726814463378, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 24081 + }, + { + "epoch": 0.21002598942980238, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 24082 + }, + { + "epoch": 0.21003471071497096, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 24083 + }, + { + "epoch": 0.21004343200013953, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 24084 + }, + { + "epoch": 0.21005215328530813, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 24085 + }, + { + "epoch": 0.2100608745704767, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 24086 + }, + { + "epoch": 0.21006959585564527, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 24087 + }, + { + "epoch": 0.21007831714081387, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 24088 + }, + { + "epoch": 0.21008703842598245, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 24089 + }, + { + "epoch": 0.21009575971115105, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 24090 + }, + { + "epoch": 0.21010448099631962, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 24091 + }, + { + "epoch": 0.2101132022814882, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 24092 + }, + { + "epoch": 0.2101219235666568, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 24093 + }, + { + "epoch": 0.21013064485182537, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 24094 + }, + { + "epoch": 0.21013936613699394, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 24095 + }, + { + "epoch": 0.21014808742216254, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 24096 + }, + { + "epoch": 0.2101568087073311, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 24097 + }, + { + "epoch": 0.21016552999249968, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 24098 + }, + { + "epoch": 0.21017425127766828, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 24099 + }, + { + "epoch": 0.21018297256283686, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 24100 + }, + { + "epoch": 0.21019169384800543, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 24101 + }, + { + "epoch": 0.21020041513317403, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 24102 + }, + { + "epoch": 0.2102091364183426, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 24103 + }, + { + "epoch": 0.2102178577035112, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 24104 + }, + { + "epoch": 0.21022657898867977, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 24105 + }, + { + "epoch": 0.21023530027384835, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 24106 + }, + { + "epoch": 0.21024402155901695, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 24107 + }, + { + "epoch": 0.21025274284418552, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 24108 + }, + { + "epoch": 0.2102614641293541, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 24109 + }, + { + "epoch": 0.2102701854145227, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 24110 + }, + { + "epoch": 0.21027890669969126, + "grad_norm": 0.427734375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 24111 + }, + { + "epoch": 0.21028762798485984, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 24112 + }, + { + "epoch": 0.21029634927002844, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 24113 + }, + { + "epoch": 0.210305070555197, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 24114 + }, + { + "epoch": 0.21031379184036558, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 24115 + }, + { + "epoch": 0.21032251312553418, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 24116 + }, + { + "epoch": 0.21033123441070276, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 24117 + }, + { + "epoch": 0.21033995569587136, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 24118 + }, + { + "epoch": 0.21034867698103993, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 24119 + }, + { + "epoch": 0.2103573982662085, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 24120 + }, + { + "epoch": 0.2103661195513771, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 24121 + }, + { + "epoch": 0.21037484083654567, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 24122 + }, + { + "epoch": 0.21038356212171425, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 24123 + }, + { + "epoch": 0.21039228340688285, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 24124 + }, + { + "epoch": 0.21040100469205142, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 24125 + }, + { + "epoch": 0.21040972597722, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 24126 + }, + { + "epoch": 0.2104184472623886, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 24127 + }, + { + "epoch": 0.21042716854755716, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 24128 + }, + { + "epoch": 0.21043588983272576, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 24129 + }, + { + "epoch": 0.21044461111789434, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 24130 + }, + { + "epoch": 0.2104533324030629, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 24131 + }, + { + "epoch": 0.2104620536882315, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 24132 + }, + { + "epoch": 0.21047077497340008, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 24133 + }, + { + "epoch": 0.21047949625856865, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 24134 + }, + { + "epoch": 0.21048821754373725, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 24135 + }, + { + "epoch": 0.21049693882890583, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 24136 + }, + { + "epoch": 0.2105056601140744, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 24137 + }, + { + "epoch": 0.210514381399243, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 24138 + }, + { + "epoch": 0.21052310268441157, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 24139 + }, + { + "epoch": 0.21053182396958015, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 24140 + }, + { + "epoch": 0.21054054525474875, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 24141 + }, + { + "epoch": 0.21054926653991732, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 24142 + }, + { + "epoch": 0.21055798782508592, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 24143 + }, + { + "epoch": 0.2105667091102545, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 24144 + }, + { + "epoch": 0.21057543039542306, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 24145 + }, + { + "epoch": 0.21058415168059166, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 24146 + }, + { + "epoch": 0.21059287296576024, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 24147 + }, + { + "epoch": 0.2106015942509288, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 24148 + }, + { + "epoch": 0.2106103155360974, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 24149 + }, + { + "epoch": 0.21061903682126598, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 24150 + }, + { + "epoch": 0.21062775810643455, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 24151 + }, + { + "epoch": 0.21063647939160315, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 24152 + }, + { + "epoch": 0.21064520067677173, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 24153 + }, + { + "epoch": 0.2106539219619403, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 24154 + }, + { + "epoch": 0.2106626432471089, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 24155 + }, + { + "epoch": 0.21067136453227747, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 24156 + }, + { + "epoch": 0.21068008581744607, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 24157 + }, + { + "epoch": 0.21068880710261464, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 24158 + }, + { + "epoch": 0.21069752838778322, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 24159 + }, + { + "epoch": 0.21070624967295182, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 24160 + }, + { + "epoch": 0.2107149709581204, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 24161 + }, + { + "epoch": 0.21072369224328896, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 24162 + }, + { + "epoch": 0.21073241352845756, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 24163 + }, + { + "epoch": 0.21074113481362614, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 24164 + }, + { + "epoch": 0.2107498560987947, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 24165 + }, + { + "epoch": 0.2107585773839633, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 24166 + }, + { + "epoch": 0.21076729866913188, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 24167 + }, + { + "epoch": 0.21077601995430045, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 24168 + }, + { + "epoch": 0.21078474123946905, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 24169 + }, + { + "epoch": 0.21079346252463763, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 24170 + }, + { + "epoch": 0.21080218380980623, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 24171 + }, + { + "epoch": 0.2108109050949748, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 24172 + }, + { + "epoch": 0.21081962638014337, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 24173 + }, + { + "epoch": 0.21082834766531197, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 24174 + }, + { + "epoch": 0.21083706895048054, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 24175 + }, + { + "epoch": 0.21084579023564912, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 24176 + }, + { + "epoch": 0.21085451152081772, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 24177 + }, + { + "epoch": 0.2108632328059863, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 24178 + }, + { + "epoch": 0.21087195409115486, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 24179 + }, + { + "epoch": 0.21088067537632346, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 24180 + }, + { + "epoch": 0.21088939666149203, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 24181 + }, + { + "epoch": 0.2108981179466606, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 24182 + }, + { + "epoch": 0.2109068392318292, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 24183 + }, + { + "epoch": 0.21091556051699778, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 24184 + }, + { + "epoch": 0.21092428180216638, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 24185 + }, + { + "epoch": 0.21093300308733495, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 24186 + }, + { + "epoch": 0.21094172437250353, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 24187 + }, + { + "epoch": 0.21095044565767213, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 24188 + }, + { + "epoch": 0.2109591669428407, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 24189 + }, + { + "epoch": 0.21096788822800927, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 24190 + }, + { + "epoch": 0.21097660951317787, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 24191 + }, + { + "epoch": 0.21098533079834644, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 24192 + }, + { + "epoch": 0.21099405208351502, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 24193 + }, + { + "epoch": 0.21100277336868362, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 24194 + }, + { + "epoch": 0.2110114946538522, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 24195 + }, + { + "epoch": 0.21102021593902076, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 24196 + }, + { + "epoch": 0.21102893722418936, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 24197 + }, + { + "epoch": 0.21103765850935793, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 24198 + }, + { + "epoch": 0.21104637979452653, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 24199 + }, + { + "epoch": 0.2110551010796951, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 24200 + }, + { + "epoch": 0.21106382236486368, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 24201 + }, + { + "epoch": 0.21107254365003228, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 24202 + }, + { + "epoch": 0.21108126493520085, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 24203 + }, + { + "epoch": 0.21108998622036942, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 24204 + }, + { + "epoch": 0.21109870750553802, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 24205 + }, + { + "epoch": 0.2111074287907066, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 24206 + }, + { + "epoch": 0.21111615007587517, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 24207 + }, + { + "epoch": 0.21112487136104377, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 24208 + }, + { + "epoch": 0.21113359264621234, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 24209 + }, + { + "epoch": 0.21114231393138091, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 24210 + }, + { + "epoch": 0.21115103521654952, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 24211 + }, + { + "epoch": 0.2111597565017181, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 24212 + }, + { + "epoch": 0.2111684777868867, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 24213 + }, + { + "epoch": 0.21117719907205526, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 24214 + }, + { + "epoch": 0.21118592035722383, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 24215 + }, + { + "epoch": 0.21119464164239243, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 24216 + }, + { + "epoch": 0.211203362927561, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 24217 + }, + { + "epoch": 0.21121208421272958, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 24218 + }, + { + "epoch": 0.21122080549789818, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 24219 + }, + { + "epoch": 0.21122952678306675, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 24220 + }, + { + "epoch": 0.21123824806823532, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 24221 + }, + { + "epoch": 0.21124696935340392, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 24222 + }, + { + "epoch": 0.2112556906385725, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 24223 + }, + { + "epoch": 0.21126441192374107, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 24224 + }, + { + "epoch": 0.21127313320890967, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 24225 + }, + { + "epoch": 0.21128185449407824, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 24226 + }, + { + "epoch": 0.21129057577924684, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 24227 + }, + { + "epoch": 0.21129929706441541, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 24228 + }, + { + "epoch": 0.211308018349584, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 24229 + }, + { + "epoch": 0.2113167396347526, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 24230 + }, + { + "epoch": 0.21132546091992116, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 24231 + }, + { + "epoch": 0.21133418220508973, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 24232 + }, + { + "epoch": 0.21134290349025833, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 24233 + }, + { + "epoch": 0.2113516247754269, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 24234 + }, + { + "epoch": 0.21136034606059548, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 24235 + }, + { + "epoch": 0.21136906734576408, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 24236 + }, + { + "epoch": 0.21137778863093265, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 24237 + }, + { + "epoch": 0.21138650991610122, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 24238 + }, + { + "epoch": 0.21139523120126982, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 24239 + }, + { + "epoch": 0.2114039524864384, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 24240 + }, + { + "epoch": 0.211412673771607, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 24241 + }, + { + "epoch": 0.21142139505677557, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 24242 + }, + { + "epoch": 0.21143011634194414, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 24243 + }, + { + "epoch": 0.21143883762711274, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 24244 + }, + { + "epoch": 0.2114475589122813, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 24245 + }, + { + "epoch": 0.2114562801974499, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 24246 + }, + { + "epoch": 0.2114650014826185, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 24247 + }, + { + "epoch": 0.21147372276778706, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 24248 + }, + { + "epoch": 0.21148244405295563, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 24249 + }, + { + "epoch": 0.21149116533812423, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 24250 + }, + { + "epoch": 0.2114998866232928, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 24251 + }, + { + "epoch": 0.2115086079084614, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 24252 + }, + { + "epoch": 0.21151732919362998, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 24253 + }, + { + "epoch": 0.21152605047879855, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 24254 + }, + { + "epoch": 0.21153477176396715, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 24255 + }, + { + "epoch": 0.21154349304913572, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 24256 + }, + { + "epoch": 0.2115522143343043, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 24257 + }, + { + "epoch": 0.2115609356194729, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 24258 + }, + { + "epoch": 0.21156965690464147, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 24259 + }, + { + "epoch": 0.21157837818981004, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 24260 + }, + { + "epoch": 0.21158709947497864, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 24261 + }, + { + "epoch": 0.2115958207601472, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 24262 + }, + { + "epoch": 0.21160454204531579, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 24263 + }, + { + "epoch": 0.21161326333048439, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 24264 + }, + { + "epoch": 0.21162198461565296, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 24265 + }, + { + "epoch": 0.21163070590082156, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 24266 + }, + { + "epoch": 0.21163942718599013, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 24267 + }, + { + "epoch": 0.2116481484711587, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 24268 + }, + { + "epoch": 0.2116568697563273, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 24269 + }, + { + "epoch": 0.21166559104149588, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 24270 + }, + { + "epoch": 0.21167431232666445, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 24271 + }, + { + "epoch": 0.21168303361183305, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 24272 + }, + { + "epoch": 0.21169175489700162, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 24273 + }, + { + "epoch": 0.2117004761821702, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 24274 + }, + { + "epoch": 0.2117091974673388, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 24275 + }, + { + "epoch": 0.21171791875250737, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 24276 + }, + { + "epoch": 0.21172664003767594, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 24277 + }, + { + "epoch": 0.21173536132284454, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 24278 + }, + { + "epoch": 0.2117440826080131, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 24279 + }, + { + "epoch": 0.2117528038931817, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 24280 + }, + { + "epoch": 0.21176152517835028, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 24281 + }, + { + "epoch": 0.21177024646351886, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 24282 + }, + { + "epoch": 0.21177896774868746, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 24283 + }, + { + "epoch": 0.21178768903385603, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 24284 + }, + { + "epoch": 0.2117964103190246, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 24285 + }, + { + "epoch": 0.2118051316041932, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 24286 + }, + { + "epoch": 0.21181385288936178, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 24287 + }, + { + "epoch": 0.21182257417453035, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 24288 + }, + { + "epoch": 0.21183129545969895, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 24289 + }, + { + "epoch": 0.21184001674486752, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 24290 + }, + { + "epoch": 0.2118487380300361, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 24291 + }, + { + "epoch": 0.2118574593152047, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 24292 + }, + { + "epoch": 0.21186618060037327, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 24293 + }, + { + "epoch": 0.21187490188554187, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 24294 + }, + { + "epoch": 0.21188362317071044, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 24295 + }, + { + "epoch": 0.211892344455879, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 24296 + }, + { + "epoch": 0.2119010657410476, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 24297 + }, + { + "epoch": 0.21190978702621618, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 24298 + }, + { + "epoch": 0.21191850831138476, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 24299 + }, + { + "epoch": 0.21192722959655336, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 24300 + }, + { + "epoch": 0.21193595088172193, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 24301 + }, + { + "epoch": 0.2119446721668905, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 24302 + }, + { + "epoch": 0.2119533934520591, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 24303 + }, + { + "epoch": 0.21196211473722767, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 24304 + }, + { + "epoch": 0.21197083602239625, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 24305 + }, + { + "epoch": 0.21197955730756485, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 24306 + }, + { + "epoch": 0.21198827859273342, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 24307 + }, + { + "epoch": 0.21199699987790202, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 24308 + }, + { + "epoch": 0.2120057211630706, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 24309 + }, + { + "epoch": 0.21201444244823917, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 24310 + }, + { + "epoch": 0.21202316373340777, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 24311 + }, + { + "epoch": 0.21203188501857634, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 24312 + }, + { + "epoch": 0.2120406063037449, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 24313 + }, + { + "epoch": 0.2120493275889135, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 24314 + }, + { + "epoch": 0.21205804887408208, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 24315 + }, + { + "epoch": 0.21206677015925066, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 24316 + }, + { + "epoch": 0.21207549144441926, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 24317 + }, + { + "epoch": 0.21208421272958783, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 24318 + }, + { + "epoch": 0.2120929340147564, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 24319 + }, + { + "epoch": 0.212101655299925, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 24320 + }, + { + "epoch": 0.21211037658509357, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 24321 + }, + { + "epoch": 0.21211909787026217, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 24322 + }, + { + "epoch": 0.21212781915543075, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 24323 + }, + { + "epoch": 0.21213654044059932, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 24324 + }, + { + "epoch": 0.21214526172576792, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 24325 + }, + { + "epoch": 0.2121539830109365, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 24326 + }, + { + "epoch": 0.21216270429610506, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 24327 + }, + { + "epoch": 0.21217142558127366, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 24328 + }, + { + "epoch": 0.21218014686644224, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 24329 + }, + { + "epoch": 0.2121888681516108, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 24330 + }, + { + "epoch": 0.2121975894367794, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 24331 + }, + { + "epoch": 0.21220631072194798, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 24332 + }, + { + "epoch": 0.21221503200711656, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 24333 + }, + { + "epoch": 0.21222375329228516, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 24334 + }, + { + "epoch": 0.21223247457745373, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 24335 + }, + { + "epoch": 0.21224119586262233, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 24336 + }, + { + "epoch": 0.2122499171477909, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 24337 + }, + { + "epoch": 0.21225863843295947, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 24338 + }, + { + "epoch": 0.21226735971812807, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 24339 + }, + { + "epoch": 0.21227608100329665, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 24340 + }, + { + "epoch": 0.21228480228846522, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 24341 + }, + { + "epoch": 0.21229352357363382, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 24342 + }, + { + "epoch": 0.2123022448588024, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 24343 + }, + { + "epoch": 0.21231096614397096, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 24344 + }, + { + "epoch": 0.21231968742913956, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 24345 + }, + { + "epoch": 0.21232840871430814, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 24346 + }, + { + "epoch": 0.2123371299994767, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 24347 + }, + { + "epoch": 0.2123458512846453, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 24348 + }, + { + "epoch": 0.21235457256981388, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 24349 + }, + { + "epoch": 0.21236329385498248, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 24350 + }, + { + "epoch": 0.21237201514015105, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 24351 + }, + { + "epoch": 0.21238073642531963, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 24352 + }, + { + "epoch": 0.21238945771048823, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 24353 + }, + { + "epoch": 0.2123981789956568, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 24354 + }, + { + "epoch": 0.21240690028082537, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 24355 + }, + { + "epoch": 0.21241562156599397, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 24356 + }, + { + "epoch": 0.21242434285116255, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 24357 + }, + { + "epoch": 0.21243306413633112, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 24358 + }, + { + "epoch": 0.21244178542149972, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 24359 + }, + { + "epoch": 0.2124505067066683, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 24360 + }, + { + "epoch": 0.2124592279918369, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 24361 + }, + { + "epoch": 0.21246794927700546, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 24362 + }, + { + "epoch": 0.21247667056217404, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 24363 + }, + { + "epoch": 0.21248539184734264, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 24364 + }, + { + "epoch": 0.2124941131325112, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 24365 + }, + { + "epoch": 0.21250283441767978, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 24366 + }, + { + "epoch": 0.21251155570284838, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 24367 + }, + { + "epoch": 0.21252027698801695, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 24368 + }, + { + "epoch": 0.21252899827318553, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 24369 + }, + { + "epoch": 0.21253771955835413, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 24370 + }, + { + "epoch": 0.2125464408435227, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 24371 + }, + { + "epoch": 0.21255516212869127, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 24372 + }, + { + "epoch": 0.21256388341385987, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 24373 + }, + { + "epoch": 0.21257260469902844, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 24374 + }, + { + "epoch": 0.21258132598419704, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 24375 + }, + { + "epoch": 0.21259004726936562, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 24376 + }, + { + "epoch": 0.2125987685545342, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 24377 + }, + { + "epoch": 0.2126074898397028, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 24378 + }, + { + "epoch": 0.21261621112487136, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 24379 + }, + { + "epoch": 0.21262493241003994, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 24380 + }, + { + "epoch": 0.21263365369520854, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 24381 + }, + { + "epoch": 0.2126423749803771, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 24382 + }, + { + "epoch": 0.21265109626554568, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 24383 + }, + { + "epoch": 0.21265981755071428, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 24384 + }, + { + "epoch": 0.21266853883588285, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 24385 + }, + { + "epoch": 0.21267726012105143, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 24386 + }, + { + "epoch": 0.21268598140622003, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 24387 + }, + { + "epoch": 0.2126947026913886, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 24388 + }, + { + "epoch": 0.2127034239765572, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 24389 + }, + { + "epoch": 0.21271214526172577, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 24390 + }, + { + "epoch": 0.21272086654689434, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 24391 + }, + { + "epoch": 0.21272958783206294, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 24392 + }, + { + "epoch": 0.21273830911723152, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 24393 + }, + { + "epoch": 0.2127470304024001, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 24394 + }, + { + "epoch": 0.2127557516875687, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 24395 + }, + { + "epoch": 0.21276447297273726, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 24396 + }, + { + "epoch": 0.21277319425790583, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 24397 + }, + { + "epoch": 0.21278191554307443, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 24398 + }, + { + "epoch": 0.212790636828243, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 24399 + }, + { + "epoch": 0.21279935811341158, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 24400 + }, + { + "epoch": 0.21280807939858018, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 24401 + }, + { + "epoch": 0.21281680068374875, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 24402 + }, + { + "epoch": 0.21282552196891735, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 24403 + }, + { + "epoch": 0.21283424325408593, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 24404 + }, + { + "epoch": 0.2128429645392545, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 24405 + }, + { + "epoch": 0.2128516858244231, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 24406 + }, + { + "epoch": 0.21286040710959167, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 24407 + }, + { + "epoch": 0.21286912839476024, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 24408 + }, + { + "epoch": 0.21287784967992884, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 24409 + }, + { + "epoch": 0.21288657096509742, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 24410 + }, + { + "epoch": 0.212895292250266, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 24411 + }, + { + "epoch": 0.2129040135354346, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 24412 + }, + { + "epoch": 0.21291273482060316, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 24413 + }, + { + "epoch": 0.21292145610577173, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 24414 + }, + { + "epoch": 0.21293017739094033, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 24415 + }, + { + "epoch": 0.2129388986761089, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 24416 + }, + { + "epoch": 0.2129476199612775, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 24417 + }, + { + "epoch": 0.21295634124644608, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 24418 + }, + { + "epoch": 0.21296506253161465, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 24419 + }, + { + "epoch": 0.21297378381678325, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 24420 + }, + { + "epoch": 0.21298250510195182, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 24421 + }, + { + "epoch": 0.2129912263871204, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 24422 + }, + { + "epoch": 0.212999947672289, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 24423 + }, + { + "epoch": 0.21300866895745757, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 24424 + }, + { + "epoch": 0.21301739024262614, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 24425 + }, + { + "epoch": 0.21302611152779474, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 24426 + }, + { + "epoch": 0.21303483281296332, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 24427 + }, + { + "epoch": 0.2130435540981319, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 24428 + }, + { + "epoch": 0.2130522753833005, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 24429 + }, + { + "epoch": 0.21306099666846906, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 24430 + }, + { + "epoch": 0.21306971795363766, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 24431 + }, + { + "epoch": 0.21307843923880623, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 24432 + }, + { + "epoch": 0.2130871605239748, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 24433 + }, + { + "epoch": 0.2130958818091434, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 24434 + }, + { + "epoch": 0.21310460309431198, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 24435 + }, + { + "epoch": 0.21311332437948055, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 24436 + }, + { + "epoch": 0.21312204566464915, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 24437 + }, + { + "epoch": 0.21313076694981772, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 24438 + }, + { + "epoch": 0.2131394882349863, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 24439 + }, + { + "epoch": 0.2131482095201549, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 24440 + }, + { + "epoch": 0.21315693080532347, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 24441 + }, + { + "epoch": 0.21316565209049204, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 24442 + }, + { + "epoch": 0.21317437337566064, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 24443 + }, + { + "epoch": 0.21318309466082921, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 24444 + }, + { + "epoch": 0.21319181594599781, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 24445 + }, + { + "epoch": 0.2132005372311664, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 24446 + }, + { + "epoch": 0.21320925851633496, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 24447 + }, + { + "epoch": 0.21321797980150356, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 24448 + }, + { + "epoch": 0.21322670108667213, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 24449 + }, + { + "epoch": 0.2132354223718407, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 24450 + }, + { + "epoch": 0.2132441436570093, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 24451 + }, + { + "epoch": 0.21325286494217788, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 24452 + }, + { + "epoch": 0.21326158622734645, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 24453 + }, + { + "epoch": 0.21327030751251505, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 24454 + }, + { + "epoch": 0.21327902879768362, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 24455 + }, + { + "epoch": 0.2132877500828522, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 24456 + }, + { + "epoch": 0.2132964713680208, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 24457 + }, + { + "epoch": 0.21330519265318937, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 24458 + }, + { + "epoch": 0.21331391393835797, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 24459 + }, + { + "epoch": 0.21332263522352654, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 24460 + }, + { + "epoch": 0.2133313565086951, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 24461 + }, + { + "epoch": 0.2133400777938637, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 24462 + }, + { + "epoch": 0.2133487990790323, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 24463 + }, + { + "epoch": 0.21335752036420086, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 24464 + }, + { + "epoch": 0.21336624164936946, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 24465 + }, + { + "epoch": 0.21337496293453803, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 24466 + }, + { + "epoch": 0.2133836842197066, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 24467 + }, + { + "epoch": 0.2133924055048752, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 24468 + }, + { + "epoch": 0.21340112679004378, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 24469 + }, + { + "epoch": 0.21340984807521235, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 24470 + }, + { + "epoch": 0.21341856936038095, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 24471 + }, + { + "epoch": 0.21342729064554952, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 24472 + }, + { + "epoch": 0.21343601193071812, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 24473 + }, + { + "epoch": 0.2134447332158867, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 24474 + }, + { + "epoch": 0.21345345450105527, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 24475 + }, + { + "epoch": 0.21346217578622387, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 24476 + }, + { + "epoch": 0.21347089707139244, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 24477 + }, + { + "epoch": 0.213479618356561, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 24478 + }, + { + "epoch": 0.2134883396417296, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 24479 + }, + { + "epoch": 0.21349706092689819, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 24480 + }, + { + "epoch": 0.21350578221206676, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 24481 + }, + { + "epoch": 0.21351450349723536, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 24482 + }, + { + "epoch": 0.21352322478240393, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 24483 + }, + { + "epoch": 0.21353194606757253, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 24484 + }, + { + "epoch": 0.2135406673527411, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 24485 + }, + { + "epoch": 0.21354938863790968, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 24486 + }, + { + "epoch": 0.21355810992307828, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 24487 + }, + { + "epoch": 0.21356683120824685, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 24488 + }, + { + "epoch": 0.21357555249341542, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 24489 + }, + { + "epoch": 0.21358427377858402, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 24490 + }, + { + "epoch": 0.2135929950637526, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 24491 + }, + { + "epoch": 0.21360171634892117, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 24492 + }, + { + "epoch": 0.21361043763408977, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 24493 + }, + { + "epoch": 0.21361915891925834, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 24494 + }, + { + "epoch": 0.2136278802044269, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 24495 + }, + { + "epoch": 0.2136366014895955, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 24496 + }, + { + "epoch": 0.21364532277476408, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 24497 + }, + { + "epoch": 0.21365404405993269, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 24498 + }, + { + "epoch": 0.21366276534510126, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 24499 + }, + { + "epoch": 0.21367148663026983, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 24500 + }, + { + "epoch": 0.21368020791543843, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 24501 + }, + { + "epoch": 0.213688929200607, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 24502 + }, + { + "epoch": 0.21369765048577558, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 24503 + }, + { + "epoch": 0.21370637177094418, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 24504 + }, + { + "epoch": 0.21371509305611275, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 24505 + }, + { + "epoch": 0.21372381434128132, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 24506 + }, + { + "epoch": 0.21373253562644992, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 24507 + }, + { + "epoch": 0.2137412569116185, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 24508 + }, + { + "epoch": 0.21374997819678707, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 24509 + }, + { + "epoch": 0.21375869948195567, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 24510 + }, + { + "epoch": 0.21376742076712424, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 24511 + }, + { + "epoch": 0.21377614205229284, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 24512 + }, + { + "epoch": 0.2137848633374614, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 24513 + }, + { + "epoch": 0.21379358462262998, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 24514 + }, + { + "epoch": 0.21380230590779858, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 24515 + }, + { + "epoch": 0.21381102719296716, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 24516 + }, + { + "epoch": 0.21381974847813573, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 24517 + }, + { + "epoch": 0.21382846976330433, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 24518 + }, + { + "epoch": 0.2138371910484729, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 24519 + }, + { + "epoch": 0.21384591233364147, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 24520 + }, + { + "epoch": 0.21385463361881007, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 24521 + }, + { + "epoch": 0.21386335490397865, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 24522 + }, + { + "epoch": 0.21387207618914722, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 24523 + }, + { + "epoch": 0.21388079747431582, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 24524 + }, + { + "epoch": 0.2138895187594844, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 24525 + }, + { + "epoch": 0.213898240044653, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 24526 + }, + { + "epoch": 0.21390696132982157, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 24527 + }, + { + "epoch": 0.21391568261499014, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 24528 + }, + { + "epoch": 0.21392440390015874, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 24529 + }, + { + "epoch": 0.2139331251853273, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 24530 + }, + { + "epoch": 0.21394184647049588, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 24531 + }, + { + "epoch": 0.21395056775566448, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 24532 + }, + { + "epoch": 0.21395928904083306, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 24533 + }, + { + "epoch": 0.21396801032600163, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 24534 + }, + { + "epoch": 0.21397673161117023, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 24535 + }, + { + "epoch": 0.2139854528963388, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 24536 + }, + { + "epoch": 0.21399417418150737, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 24537 + }, + { + "epoch": 0.21400289546667597, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 24538 + }, + { + "epoch": 0.21401161675184455, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 24539 + }, + { + "epoch": 0.21402033803701315, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 24540 + }, + { + "epoch": 0.21402905932218172, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 24541 + }, + { + "epoch": 0.2140377806073503, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 24542 + }, + { + "epoch": 0.2140465018925189, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 24543 + }, + { + "epoch": 0.21405522317768746, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 24544 + }, + { + "epoch": 0.21406394446285604, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 24545 + }, + { + "epoch": 0.21407266574802464, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 24546 + }, + { + "epoch": 0.2140813870331932, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 24547 + }, + { + "epoch": 0.21409010831836178, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 24548 + }, + { + "epoch": 0.21409882960353038, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 24549 + }, + { + "epoch": 0.21410755088869896, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 24550 + }, + { + "epoch": 0.21411627217386753, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 24551 + }, + { + "epoch": 0.21412499345903613, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 24552 + }, + { + "epoch": 0.2141337147442047, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 24553 + }, + { + "epoch": 0.2141424360293733, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 24554 + }, + { + "epoch": 0.21415115731454187, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 24555 + }, + { + "epoch": 0.21415987859971045, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 24556 + }, + { + "epoch": 0.21416859988487905, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 24557 + }, + { + "epoch": 0.21417732117004762, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 24558 + }, + { + "epoch": 0.2141860424552162, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 24559 + }, + { + "epoch": 0.2141947637403848, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 24560 + }, + { + "epoch": 0.21420348502555336, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 24561 + }, + { + "epoch": 0.21421220631072194, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 24562 + }, + { + "epoch": 0.21422092759589054, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 24563 + }, + { + "epoch": 0.2142296488810591, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 24564 + }, + { + "epoch": 0.21423837016622768, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 24565 + }, + { + "epoch": 0.21424709145139628, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 24566 + }, + { + "epoch": 0.21425581273656485, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 24567 + }, + { + "epoch": 0.21426453402173345, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 24568 + }, + { + "epoch": 0.21427325530690203, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 24569 + }, + { + "epoch": 0.2142819765920706, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 24570 + }, + { + "epoch": 0.2142906978772392, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 24571 + }, + { + "epoch": 0.21429941916240777, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 24572 + }, + { + "epoch": 0.21430814044757635, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 24573 + }, + { + "epoch": 0.21431686173274495, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 24574 + }, + { + "epoch": 0.21432558301791352, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 24575 + }, + { + "epoch": 0.2143343043030821, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 24576 + }, + { + "epoch": 0.2143430255882507, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 24577 + }, + { + "epoch": 0.21435174687341926, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 24578 + }, + { + "epoch": 0.21436046815858784, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 24579 + }, + { + "epoch": 0.21436918944375644, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 24580 + }, + { + "epoch": 0.214377910728925, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 24581 + }, + { + "epoch": 0.2143866320140936, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 24582 + }, + { + "epoch": 0.21439535329926218, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 24583 + }, + { + "epoch": 0.21440407458443075, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 24584 + }, + { + "epoch": 0.21441279586959935, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 24585 + }, + { + "epoch": 0.21442151715476793, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 24586 + }, + { + "epoch": 0.2144302384399365, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 24587 + }, + { + "epoch": 0.2144389597251051, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 24588 + }, + { + "epoch": 0.21444768101027367, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 24589 + }, + { + "epoch": 0.21445640229544224, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 24590 + }, + { + "epoch": 0.21446512358061084, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 24591 + }, + { + "epoch": 0.21447384486577942, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 24592 + }, + { + "epoch": 0.21448256615094802, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 24593 + }, + { + "epoch": 0.2144912874361166, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 24594 + }, + { + "epoch": 0.21450000872128516, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 24595 + }, + { + "epoch": 0.21450873000645376, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 24596 + }, + { + "epoch": 0.21451745129162234, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 24597 + }, + { + "epoch": 0.2145261725767909, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 24598 + }, + { + "epoch": 0.2145348938619595, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 24599 + }, + { + "epoch": 0.21454361514712808, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 24600 + }, + { + "epoch": 0.21455233643229665, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 24601 + }, + { + "epoch": 0.21456105771746525, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 24602 + }, + { + "epoch": 0.21456977900263383, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 24603 + }, + { + "epoch": 0.2145785002878024, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 24604 + }, + { + "epoch": 0.214587221572971, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 24605 + }, + { + "epoch": 0.21459594285813957, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 24606 + }, + { + "epoch": 0.21460466414330817, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 24607 + }, + { + "epoch": 0.21461338542847674, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 24608 + }, + { + "epoch": 0.21462210671364532, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 24609 + }, + { + "epoch": 0.21463082799881392, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 24610 + }, + { + "epoch": 0.2146395492839825, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 24611 + }, + { + "epoch": 0.21464827056915106, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 24612 + }, + { + "epoch": 0.21465699185431966, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 24613 + }, + { + "epoch": 0.21466571313948823, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 24614 + }, + { + "epoch": 0.2146744344246568, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 24615 + }, + { + "epoch": 0.2146831557098254, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 24616 + }, + { + "epoch": 0.21469187699499398, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 24617 + }, + { + "epoch": 0.21470059828016255, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 24618 + }, + { + "epoch": 0.21470931956533115, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 24619 + }, + { + "epoch": 0.21471804085049973, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 24620 + }, + { + "epoch": 0.21472676213566833, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 24621 + }, + { + "epoch": 0.2147354834208369, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 24622 + }, + { + "epoch": 0.21474420470600547, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 24623 + }, + { + "epoch": 0.21475292599117407, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 24624 + }, + { + "epoch": 0.21476164727634264, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 24625 + }, + { + "epoch": 0.21477036856151122, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 24626 + }, + { + "epoch": 0.21477908984667982, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 24627 + }, + { + "epoch": 0.2147878111318484, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 24628 + }, + { + "epoch": 0.21479653241701696, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 24629 + }, + { + "epoch": 0.21480525370218556, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 24630 + }, + { + "epoch": 0.21481397498735413, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 24631 + }, + { + "epoch": 0.2148226962725227, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 24632 + }, + { + "epoch": 0.2148314175576913, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 24633 + }, + { + "epoch": 0.21484013884285988, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 24634 + }, + { + "epoch": 0.21484886012802848, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 24635 + }, + { + "epoch": 0.21485758141319705, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 24636 + }, + { + "epoch": 0.21486630269836562, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 24637 + }, + { + "epoch": 0.21487502398353422, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 24638 + }, + { + "epoch": 0.2148837452687028, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 24639 + }, + { + "epoch": 0.21489246655387137, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 24640 + }, + { + "epoch": 0.21490118783903997, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 24641 + }, + { + "epoch": 0.21490990912420854, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 24642 + }, + { + "epoch": 0.21491863040937711, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 24643 + }, + { + "epoch": 0.21492735169454572, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 24644 + }, + { + "epoch": 0.2149360729797143, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 24645 + }, + { + "epoch": 0.21494479426488286, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 24646 + }, + { + "epoch": 0.21495351555005146, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 24647 + }, + { + "epoch": 0.21496223683522003, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 24648 + }, + { + "epoch": 0.21497095812038863, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 24649 + }, + { + "epoch": 0.2149796794055572, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 24650 + }, + { + "epoch": 0.21498840069072578, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 24651 + }, + { + "epoch": 0.21499712197589438, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 24652 + }, + { + "epoch": 0.21500584326106295, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 24653 + }, + { + "epoch": 0.21501456454623152, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 24654 + }, + { + "epoch": 0.21502328583140012, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 24655 + }, + { + "epoch": 0.2150320071165687, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 24656 + }, + { + "epoch": 0.21504072840173727, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 24657 + }, + { + "epoch": 0.21504944968690587, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 24658 + }, + { + "epoch": 0.21505817097207444, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 24659 + }, + { + "epoch": 0.21506689225724301, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 24660 + }, + { + "epoch": 0.21507561354241161, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 24661 + }, + { + "epoch": 0.2150843348275802, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 24662 + }, + { + "epoch": 0.2150930561127488, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 24663 + }, + { + "epoch": 0.21510177739791736, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 24664 + }, + { + "epoch": 0.21511049868308593, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 24665 + }, + { + "epoch": 0.21511921996825453, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 24666 + }, + { + "epoch": 0.2151279412534231, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 24667 + }, + { + "epoch": 0.21513666253859168, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 24668 + }, + { + "epoch": 0.21514538382376028, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 24669 + }, + { + "epoch": 0.21515410510892885, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 24670 + }, + { + "epoch": 0.21516282639409742, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 24671 + }, + { + "epoch": 0.21517154767926602, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 24672 + }, + { + "epoch": 0.2151802689644346, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 24673 + }, + { + "epoch": 0.21518899024960317, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 24674 + }, + { + "epoch": 0.21519771153477177, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 24675 + }, + { + "epoch": 0.21520643281994034, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 24676 + }, + { + "epoch": 0.21521515410510894, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 24677 + }, + { + "epoch": 0.2152238753902775, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 24678 + }, + { + "epoch": 0.2152325966754461, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 24679 + }, + { + "epoch": 0.2152413179606147, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 24680 + }, + { + "epoch": 0.21525003924578326, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 24681 + }, + { + "epoch": 0.21525876053095183, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 24682 + }, + { + "epoch": 0.21526748181612043, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 24683 + }, + { + "epoch": 0.215276203101289, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 24684 + }, + { + "epoch": 0.21528492438645758, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 24685 + }, + { + "epoch": 0.21529364567162618, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 24686 + }, + { + "epoch": 0.21530236695679475, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 24687 + }, + { + "epoch": 0.21531108824196332, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 24688 + }, + { + "epoch": 0.21531980952713192, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 24689 + }, + { + "epoch": 0.2153285308123005, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 24690 + }, + { + "epoch": 0.2153372520974691, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 24691 + }, + { + "epoch": 0.21534597338263767, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 24692 + }, + { + "epoch": 0.21535469466780624, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 24693 + }, + { + "epoch": 0.21536341595297484, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 24694 + }, + { + "epoch": 0.2153721372381434, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 24695 + }, + { + "epoch": 0.21538085852331199, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 24696 + }, + { + "epoch": 0.21538957980848059, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 24697 + }, + { + "epoch": 0.21539830109364916, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 24698 + }, + { + "epoch": 0.21540702237881773, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 24699 + }, + { + "epoch": 0.21541574366398633, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 24700 + }, + { + "epoch": 0.2154244649491549, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 24701 + }, + { + "epoch": 0.2154331862343235, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 24702 + }, + { + "epoch": 0.21544190751949208, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 24703 + }, + { + "epoch": 0.21545062880466065, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 24704 + }, + { + "epoch": 0.21545935008982925, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 24705 + }, + { + "epoch": 0.21546807137499782, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 24706 + }, + { + "epoch": 0.2154767926601664, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 24707 + }, + { + "epoch": 0.215485513945335, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 24708 + }, + { + "epoch": 0.21549423523050357, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 24709 + }, + { + "epoch": 0.21550295651567214, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 24710 + }, + { + "epoch": 0.21551167780084074, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 24711 + }, + { + "epoch": 0.2155203990860093, + "grad_norm": 0.31640625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 24712 + }, + { + "epoch": 0.21552912037117788, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 24713 + }, + { + "epoch": 0.21553784165634648, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 24714 + }, + { + "epoch": 0.21554656294151506, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 24715 + }, + { + "epoch": 0.21555528422668366, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 24716 + }, + { + "epoch": 0.21556400551185223, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 24717 + }, + { + "epoch": 0.2155727267970208, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 24718 + }, + { + "epoch": 0.2155814480821894, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 24719 + }, + { + "epoch": 0.21559016936735798, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 24720 + }, + { + "epoch": 0.21559889065252655, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 24721 + }, + { + "epoch": 0.21560761193769515, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 24722 + }, + { + "epoch": 0.21561633322286372, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 24723 + }, + { + "epoch": 0.2156250545080323, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 24724 + }, + { + "epoch": 0.2156337757932009, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 24725 + }, + { + "epoch": 0.21564249707836947, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 24726 + }, + { + "epoch": 0.21565121836353804, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 24727 + }, + { + "epoch": 0.21565993964870664, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 24728 + }, + { + "epoch": 0.2156686609338752, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 24729 + }, + { + "epoch": 0.2156773822190438, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 24730 + }, + { + "epoch": 0.21568610350421238, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 24731 + }, + { + "epoch": 0.21569482478938096, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 24732 + }, + { + "epoch": 0.21570354607454956, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 24733 + }, + { + "epoch": 0.21571226735971813, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 24734 + }, + { + "epoch": 0.2157209886448867, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 24735 + }, + { + "epoch": 0.2157297099300553, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 24736 + }, + { + "epoch": 0.21573843121522387, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 24737 + }, + { + "epoch": 0.21574715250039245, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 24738 + }, + { + "epoch": 0.21575587378556105, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 24739 + }, + { + "epoch": 0.21576459507072962, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 24740 + }, + { + "epoch": 0.2157733163558982, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 24741 + }, + { + "epoch": 0.2157820376410668, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 24742 + }, + { + "epoch": 0.21579075892623537, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 24743 + }, + { + "epoch": 0.21579948021140397, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 24744 + }, + { + "epoch": 0.21580820149657254, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 24745 + }, + { + "epoch": 0.2158169227817411, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 24746 + }, + { + "epoch": 0.2158256440669097, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 24747 + }, + { + "epoch": 0.21583436535207828, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 24748 + }, + { + "epoch": 0.21584308663724686, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 24749 + }, + { + "epoch": 0.21585180792241546, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 24750 + }, + { + "epoch": 0.21586052920758403, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 24751 + }, + { + "epoch": 0.2158692504927526, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 24752 + }, + { + "epoch": 0.2158779717779212, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 24753 + }, + { + "epoch": 0.21588669306308977, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 24754 + }, + { + "epoch": 0.21589541434825835, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 24755 + }, + { + "epoch": 0.21590413563342695, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 24756 + }, + { + "epoch": 0.21591285691859552, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 24757 + }, + { + "epoch": 0.21592157820376412, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 24758 + }, + { + "epoch": 0.2159302994889327, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 24759 + }, + { + "epoch": 0.21593902077410126, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 24760 + }, + { + "epoch": 0.21594774205926986, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 24761 + }, + { + "epoch": 0.21595646334443844, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 24762 + }, + { + "epoch": 0.215965184629607, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 24763 + }, + { + "epoch": 0.2159739059147756, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 24764 + }, + { + "epoch": 0.21598262719994418, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 24765 + }, + { + "epoch": 0.21599134848511276, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 24766 + }, + { + "epoch": 0.21600006977028136, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 24767 + }, + { + "epoch": 0.21600879105544993, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 24768 + }, + { + "epoch": 0.2160175123406185, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 24769 + }, + { + "epoch": 0.2160262336257871, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 24770 + }, + { + "epoch": 0.21603495491095567, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 24771 + }, + { + "epoch": 0.21604367619612427, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 24772 + }, + { + "epoch": 0.21605239748129285, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 24773 + }, + { + "epoch": 0.21606111876646142, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 24774 + }, + { + "epoch": 0.21606984005163002, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 24775 + }, + { + "epoch": 0.2160785613367986, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 24776 + }, + { + "epoch": 0.21608728262196716, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 24777 + }, + { + "epoch": 0.21609600390713576, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 24778 + }, + { + "epoch": 0.21610472519230434, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 24779 + }, + { + "epoch": 0.2161134464774729, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 24780 + }, + { + "epoch": 0.2161221677626415, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 24781 + }, + { + "epoch": 0.21613088904781008, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 24782 + }, + { + "epoch": 0.21613961033297865, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 24783 + }, + { + "epoch": 0.21614833161814725, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 24784 + }, + { + "epoch": 0.21615705290331583, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 24785 + }, + { + "epoch": 0.21616577418848443, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 24786 + }, + { + "epoch": 0.216174495473653, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 24787 + }, + { + "epoch": 0.21618321675882157, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 24788 + }, + { + "epoch": 0.21619193804399017, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 24789 + }, + { + "epoch": 0.21620065932915875, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 24790 + }, + { + "epoch": 0.21620938061432732, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 24791 + }, + { + "epoch": 0.21621810189949592, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 24792 + }, + { + "epoch": 0.2162268231846645, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 24793 + }, + { + "epoch": 0.21623554446983306, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 24794 + }, + { + "epoch": 0.21624426575500166, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 24795 + }, + { + "epoch": 0.21625298704017024, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 24796 + }, + { + "epoch": 0.2162617083253388, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 24797 + }, + { + "epoch": 0.2162704296105074, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 24798 + }, + { + "epoch": 0.21627915089567598, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 24799 + }, + { + "epoch": 0.21628787218084458, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 24800 + }, + { + "epoch": 0.21629659346601315, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 24801 + }, + { + "epoch": 0.21630531475118173, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 24802 + }, + { + "epoch": 0.21631403603635033, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 24803 + }, + { + "epoch": 0.2163227573215189, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 24804 + }, + { + "epoch": 0.21633147860668747, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 24805 + }, + { + "epoch": 0.21634019989185607, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 24806 + }, + { + "epoch": 0.21634892117702464, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 24807 + }, + { + "epoch": 0.21635764246219322, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 24808 + }, + { + "epoch": 0.21636636374736182, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 24809 + }, + { + "epoch": 0.2163750850325304, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 24810 + }, + { + "epoch": 0.21638380631769896, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 24811 + }, + { + "epoch": 0.21639252760286756, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 24812 + }, + { + "epoch": 0.21640124888803614, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 24813 + }, + { + "epoch": 0.21640997017320474, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 24814 + }, + { + "epoch": 0.2164186914583733, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 24815 + }, + { + "epoch": 0.21642741274354188, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 24816 + }, + { + "epoch": 0.21643613402871048, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 24817 + }, + { + "epoch": 0.21644485531387905, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 24818 + }, + { + "epoch": 0.21645357659904763, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 24819 + }, + { + "epoch": 0.21646229788421623, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 24820 + }, + { + "epoch": 0.2164710191693848, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 24821 + }, + { + "epoch": 0.21647974045455337, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 24822 + }, + { + "epoch": 0.21648846173972197, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 24823 + }, + { + "epoch": 0.21649718302489054, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 24824 + }, + { + "epoch": 0.21650590431005914, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 24825 + }, + { + "epoch": 0.21651462559522772, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 24826 + }, + { + "epoch": 0.2165233468803963, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 24827 + }, + { + "epoch": 0.2165320681655649, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 24828 + }, + { + "epoch": 0.21654078945073346, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 24829 + }, + { + "epoch": 0.21654951073590203, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 24830 + }, + { + "epoch": 0.21655823202107063, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 24831 + }, + { + "epoch": 0.2165669533062392, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 24832 + }, + { + "epoch": 0.21657567459140778, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 24833 + }, + { + "epoch": 0.21658439587657638, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 24834 + }, + { + "epoch": 0.21659311716174495, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 24835 + }, + { + "epoch": 0.21660183844691352, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 24836 + }, + { + "epoch": 0.21661055973208213, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 24837 + }, + { + "epoch": 0.2166192810172507, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 24838 + }, + { + "epoch": 0.2166280023024193, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 24839 + }, + { + "epoch": 0.21663672358758787, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 24840 + }, + { + "epoch": 0.21664544487275644, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 24841 + }, + { + "epoch": 0.21665416615792504, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 24842 + }, + { + "epoch": 0.21666288744309362, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 24843 + }, + { + "epoch": 0.2166716087282622, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 24844 + }, + { + "epoch": 0.2166803300134308, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 24845 + }, + { + "epoch": 0.21668905129859936, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 24846 + }, + { + "epoch": 0.21669777258376793, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 24847 + }, + { + "epoch": 0.21670649386893653, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 24848 + }, + { + "epoch": 0.2167152151541051, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 24849 + }, + { + "epoch": 0.21672393643927368, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 24850 + }, + { + "epoch": 0.21673265772444228, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 24851 + }, + { + "epoch": 0.21674137900961085, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 24852 + }, + { + "epoch": 0.21675010029477945, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 24853 + }, + { + "epoch": 0.21675882157994802, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 24854 + }, + { + "epoch": 0.2167675428651166, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 24855 + }, + { + "epoch": 0.2167762641502852, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 24856 + }, + { + "epoch": 0.21678498543545377, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 24857 + }, + { + "epoch": 0.21679370672062234, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 24858 + }, + { + "epoch": 0.21680242800579094, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 24859 + }, + { + "epoch": 0.21681114929095952, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 24860 + }, + { + "epoch": 0.2168198705761281, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 24861 + }, + { + "epoch": 0.2168285918612967, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 24862 + }, + { + "epoch": 0.21683731314646526, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 24863 + }, + { + "epoch": 0.21684603443163383, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 24864 + }, + { + "epoch": 0.21685475571680243, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 24865 + }, + { + "epoch": 0.216863477001971, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 24866 + }, + { + "epoch": 0.2168721982871396, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 24867 + }, + { + "epoch": 0.21688091957230818, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 24868 + }, + { + "epoch": 0.21688964085747675, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 24869 + }, + { + "epoch": 0.21689836214264535, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 24870 + }, + { + "epoch": 0.21690708342781392, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 24871 + }, + { + "epoch": 0.2169158047129825, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 24872 + }, + { + "epoch": 0.2169245259981511, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 24873 + }, + { + "epoch": 0.21693324728331967, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 24874 + }, + { + "epoch": 0.21694196856848824, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 24875 + }, + { + "epoch": 0.21695068985365684, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 24876 + }, + { + "epoch": 0.21695941113882541, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 24877 + }, + { + "epoch": 0.216968132423994, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 24878 + }, + { + "epoch": 0.2169768537091626, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 24879 + }, + { + "epoch": 0.21698557499433116, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 24880 + }, + { + "epoch": 0.21699429627949976, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 24881 + }, + { + "epoch": 0.21700301756466833, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 24882 + }, + { + "epoch": 0.2170117388498369, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 24883 + }, + { + "epoch": 0.2170204601350055, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 24884 + }, + { + "epoch": 0.21702918142017408, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 24885 + }, + { + "epoch": 0.21703790270534265, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 24886 + }, + { + "epoch": 0.21704662399051125, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 24887 + }, + { + "epoch": 0.21705534527567982, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 24888 + }, + { + "epoch": 0.2170640665608484, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 24889 + }, + { + "epoch": 0.217072787846017, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 24890 + }, + { + "epoch": 0.21708150913118557, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 24891 + }, + { + "epoch": 0.21709023041635414, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 24892 + }, + { + "epoch": 0.21709895170152274, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 24893 + }, + { + "epoch": 0.2171076729866913, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 24894 + }, + { + "epoch": 0.2171163942718599, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 24895 + }, + { + "epoch": 0.2171251155570285, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 24896 + }, + { + "epoch": 0.21713383684219706, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 24897 + }, + { + "epoch": 0.21714255812736566, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 24898 + }, + { + "epoch": 0.21715127941253423, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 24899 + }, + { + "epoch": 0.2171600006977028, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 24900 + }, + { + "epoch": 0.2171687219828714, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 24901 + }, + { + "epoch": 0.21717744326803998, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 24902 + }, + { + "epoch": 0.21718616455320855, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 24903 + }, + { + "epoch": 0.21719488583837715, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 24904 + }, + { + "epoch": 0.21720360712354572, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 24905 + }, + { + "epoch": 0.2172123284087143, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 24906 + }, + { + "epoch": 0.2172210496938829, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 24907 + }, + { + "epoch": 0.21722977097905147, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 24908 + }, + { + "epoch": 0.21723849226422007, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 24909 + }, + { + "epoch": 0.21724721354938864, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 24910 + }, + { + "epoch": 0.2172559348345572, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 24911 + }, + { + "epoch": 0.2172646561197258, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 24912 + }, + { + "epoch": 0.21727337740489439, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 24913 + }, + { + "epoch": 0.21728209869006296, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 24914 + }, + { + "epoch": 0.21729081997523156, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 24915 + }, + { + "epoch": 0.21729954126040013, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 24916 + }, + { + "epoch": 0.2173082625455687, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 24917 + }, + { + "epoch": 0.2173169838307373, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 24918 + }, + { + "epoch": 0.21732570511590588, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 24919 + }, + { + "epoch": 0.21733442640107445, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 24920 + }, + { + "epoch": 0.21734314768624305, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 24921 + }, + { + "epoch": 0.21735186897141162, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 24922 + }, + { + "epoch": 0.21736059025658022, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 24923 + }, + { + "epoch": 0.2173693115417488, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 24924 + }, + { + "epoch": 0.21737803282691737, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 24925 + }, + { + "epoch": 0.21738675411208597, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 24926 + }, + { + "epoch": 0.21739547539725454, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 24927 + }, + { + "epoch": 0.2174041966824231, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 24928 + }, + { + "epoch": 0.2174129179675917, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 24929 + }, + { + "epoch": 0.21742163925276028, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 24930 + }, + { + "epoch": 0.21743036053792886, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 24931 + }, + { + "epoch": 0.21743908182309746, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 24932 + }, + { + "epoch": 0.21744780310826603, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 24933 + }, + { + "epoch": 0.21745652439343463, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 24934 + }, + { + "epoch": 0.2174652456786032, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 24935 + }, + { + "epoch": 0.21747396696377178, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 24936 + }, + { + "epoch": 0.21748268824894038, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 24937 + }, + { + "epoch": 0.21749140953410895, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 24938 + }, + { + "epoch": 0.21750013081927752, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 24939 + }, + { + "epoch": 0.21750885210444612, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 24940 + }, + { + "epoch": 0.2175175733896147, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 24941 + }, + { + "epoch": 0.21752629467478327, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 24942 + }, + { + "epoch": 0.21753501595995187, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 24943 + }, + { + "epoch": 0.21754373724512044, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 24944 + }, + { + "epoch": 0.217552458530289, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 24945 + }, + { + "epoch": 0.2175611798154576, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 24946 + }, + { + "epoch": 0.21756990110062618, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 24947 + }, + { + "epoch": 0.21757862238579478, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 24948 + }, + { + "epoch": 0.21758734367096336, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 24949 + }, + { + "epoch": 0.21759606495613193, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 24950 + }, + { + "epoch": 0.21760478624130053, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 24951 + }, + { + "epoch": 0.2176135075264691, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 24952 + }, + { + "epoch": 0.21762222881163767, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 24953 + }, + { + "epoch": 0.21763095009680627, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 24954 + }, + { + "epoch": 0.21763967138197485, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 24955 + }, + { + "epoch": 0.21764839266714342, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 24956 + }, + { + "epoch": 0.21765711395231202, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 24957 + }, + { + "epoch": 0.2176658352374806, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 24958 + }, + { + "epoch": 0.21767455652264917, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 24959 + }, + { + "epoch": 0.21768327780781777, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 24960 + }, + { + "epoch": 0.21769199909298634, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 24961 + }, + { + "epoch": 0.21770072037815494, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 24962 + }, + { + "epoch": 0.2177094416633235, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 24963 + }, + { + "epoch": 0.21771816294849208, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 24964 + }, + { + "epoch": 0.21772688423366068, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 24965 + }, + { + "epoch": 0.21773560551882926, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 24966 + }, + { + "epoch": 0.21774432680399783, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 24967 + }, + { + "epoch": 0.21775304808916643, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 24968 + }, + { + "epoch": 0.217761769374335, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 24969 + }, + { + "epoch": 0.21777049065950357, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 24970 + }, + { + "epoch": 0.21777921194467217, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 24971 + }, + { + "epoch": 0.21778793322984075, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 24972 + }, + { + "epoch": 0.21779665451500932, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 24973 + }, + { + "epoch": 0.21780537580017792, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 24974 + }, + { + "epoch": 0.2178140970853465, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 24975 + }, + { + "epoch": 0.2178228183705151, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 24976 + }, + { + "epoch": 0.21783153965568366, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 24977 + }, + { + "epoch": 0.21784026094085224, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 24978 + }, + { + "epoch": 0.21784898222602084, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 24979 + }, + { + "epoch": 0.2178577035111894, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 24980 + }, + { + "epoch": 0.21786642479635798, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 24981 + }, + { + "epoch": 0.21787514608152658, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 24982 + }, + { + "epoch": 0.21788386736669516, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 24983 + }, + { + "epoch": 0.21789258865186373, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 24984 + }, + { + "epoch": 0.21790130993703233, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 24985 + }, + { + "epoch": 0.2179100312222009, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 24986 + }, + { + "epoch": 0.21791875250736947, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 24987 + }, + { + "epoch": 0.21792747379253807, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 24988 + }, + { + "epoch": 0.21793619507770665, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 24989 + }, + { + "epoch": 0.21794491636287525, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 24990 + }, + { + "epoch": 0.21795363764804382, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 24991 + }, + { + "epoch": 0.2179623589332124, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 24992 + }, + { + "epoch": 0.217971080218381, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 24993 + }, + { + "epoch": 0.21797980150354956, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 24994 + }, + { + "epoch": 0.21798852278871814, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 24995 + }, + { + "epoch": 0.21799724407388674, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 24996 + }, + { + "epoch": 0.2180059653590553, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 24997 + }, + { + "epoch": 0.21801468664422388, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 24998 + }, + { + "epoch": 0.21802340792939248, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 24999 + }, + { + "epoch": 0.21803212921456105, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 25000 + }, + { + "epoch": 0.21804085049972963, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 25001 + }, + { + "epoch": 0.21804957178489823, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 25002 + }, + { + "epoch": 0.2180582930700668, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 25003 + }, + { + "epoch": 0.2180670143552354, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 25004 + }, + { + "epoch": 0.21807573564040397, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 25005 + }, + { + "epoch": 0.21808445692557255, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 25006 + }, + { + "epoch": 0.21809317821074115, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 25007 + }, + { + "epoch": 0.21810189949590972, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 25008 + }, + { + "epoch": 0.2181106207810783, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 25009 + }, + { + "epoch": 0.2181193420662469, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 25010 + }, + { + "epoch": 0.21812806335141546, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 25011 + }, + { + "epoch": 0.21813678463658404, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 25012 + }, + { + "epoch": 0.21814550592175264, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 25013 + }, + { + "epoch": 0.2181542272069212, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 25014 + }, + { + "epoch": 0.21816294849208978, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 25015 + }, + { + "epoch": 0.21817166977725838, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 25016 + }, + { + "epoch": 0.21818039106242695, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 25017 + }, + { + "epoch": 0.21818911234759555, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 25018 + }, + { + "epoch": 0.21819783363276413, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 25019 + }, + { + "epoch": 0.2182065549179327, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 25020 + }, + { + "epoch": 0.2182152762031013, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 25021 + }, + { + "epoch": 0.21822399748826987, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 25022 + }, + { + "epoch": 0.21823271877343844, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 25023 + }, + { + "epoch": 0.21824144005860704, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 25024 + }, + { + "epoch": 0.21825016134377562, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 25025 + }, + { + "epoch": 0.2182588826289442, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 25026 + }, + { + "epoch": 0.2182676039141128, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 25027 + }, + { + "epoch": 0.21827632519928136, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 25028 + }, + { + "epoch": 0.21828504648444993, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 25029 + }, + { + "epoch": 0.21829376776961854, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 25030 + }, + { + "epoch": 0.2183024890547871, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 25031 + }, + { + "epoch": 0.2183112103399557, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 25032 + }, + { + "epoch": 0.21831993162512428, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 25033 + }, + { + "epoch": 0.21832865291029285, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 25034 + }, + { + "epoch": 0.21833737419546145, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 25035 + }, + { + "epoch": 0.21834609548063003, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 25036 + }, + { + "epoch": 0.2183548167657986, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 25037 + }, + { + "epoch": 0.2183635380509672, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 25038 + }, + { + "epoch": 0.21837225933613577, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 25039 + }, + { + "epoch": 0.21838098062130434, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 25040 + }, + { + "epoch": 0.21838970190647294, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 25041 + }, + { + "epoch": 0.21839842319164152, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 25042 + }, + { + "epoch": 0.2184071444768101, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 25043 + }, + { + "epoch": 0.2184158657619787, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 25044 + }, + { + "epoch": 0.21842458704714726, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 25045 + }, + { + "epoch": 0.21843330833231586, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 25046 + }, + { + "epoch": 0.21844202961748443, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 25047 + }, + { + "epoch": 0.218450750902653, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 25048 + }, + { + "epoch": 0.2184594721878216, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 25049 + }, + { + "epoch": 0.21846819347299018, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 25050 + }, + { + "epoch": 0.21847691475815875, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 25051 + }, + { + "epoch": 0.21848563604332735, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 25052 + }, + { + "epoch": 0.21849435732849593, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 25053 + }, + { + "epoch": 0.2185030786136645, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 25054 + }, + { + "epoch": 0.2185117998988331, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 25055 + }, + { + "epoch": 0.21852052118400167, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 25056 + }, + { + "epoch": 0.21852924246917027, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 25057 + }, + { + "epoch": 0.21853796375433884, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 25058 + }, + { + "epoch": 0.21854668503950742, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 25059 + }, + { + "epoch": 0.21855540632467602, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 25060 + }, + { + "epoch": 0.2185641276098446, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 25061 + }, + { + "epoch": 0.21857284889501316, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 25062 + }, + { + "epoch": 0.21858157018018176, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 25063 + }, + { + "epoch": 0.21859029146535033, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 25064 + }, + { + "epoch": 0.2185990127505189, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 25065 + }, + { + "epoch": 0.2186077340356875, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 25066 + }, + { + "epoch": 0.21861645532085608, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 25067 + }, + { + "epoch": 0.21862517660602465, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 25068 + }, + { + "epoch": 0.21863389789119325, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 25069 + }, + { + "epoch": 0.21864261917636182, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 25070 + }, + { + "epoch": 0.21865134046153042, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 25071 + }, + { + "epoch": 0.218660061746699, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 25072 + }, + { + "epoch": 0.21866878303186757, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 25073 + }, + { + "epoch": 0.21867750431703617, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 25074 + }, + { + "epoch": 0.21868622560220474, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 25075 + }, + { + "epoch": 0.21869494688737331, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 25076 + }, + { + "epoch": 0.21870366817254192, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 25077 + }, + { + "epoch": 0.2187123894577105, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 25078 + }, + { + "epoch": 0.21872111074287906, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 25079 + }, + { + "epoch": 0.21872983202804766, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 25080 + }, + { + "epoch": 0.21873855331321623, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 25081 + }, + { + "epoch": 0.2187472745983848, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 25082 + }, + { + "epoch": 0.2187559958835534, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 25083 + }, + { + "epoch": 0.21876471716872198, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 25084 + }, + { + "epoch": 0.21877343845389058, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 25085 + }, + { + "epoch": 0.21878215973905915, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 25086 + }, + { + "epoch": 0.21879088102422772, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 25087 + }, + { + "epoch": 0.21879960230939632, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 25088 + }, + { + "epoch": 0.2188083235945649, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 25089 + }, + { + "epoch": 0.21881704487973347, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 25090 + }, + { + "epoch": 0.21882576616490207, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 25091 + }, + { + "epoch": 0.21883448745007064, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 25092 + }, + { + "epoch": 0.21884320873523921, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 25093 + }, + { + "epoch": 0.21885193002040781, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 25094 + }, + { + "epoch": 0.2188606513055764, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 25095 + }, + { + "epoch": 0.21886937259074496, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 25096 + }, + { + "epoch": 0.21887809387591356, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 25097 + }, + { + "epoch": 0.21888681516108213, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 25098 + }, + { + "epoch": 0.21889553644625073, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 25099 + }, + { + "epoch": 0.2189042577314193, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 25100 + }, + { + "epoch": 0.21891297901658788, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 25101 + }, + { + "epoch": 0.21892170030175648, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 25102 + }, + { + "epoch": 0.21893042158692505, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 25103 + }, + { + "epoch": 0.21893914287209362, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 25104 + }, + { + "epoch": 0.21894786415726222, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 25105 + }, + { + "epoch": 0.2189565854424308, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 25106 + }, + { + "epoch": 0.21896530672759937, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 25107 + }, + { + "epoch": 0.21897402801276797, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 25108 + }, + { + "epoch": 0.21898274929793654, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 25109 + }, + { + "epoch": 0.2189914705831051, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 25110 + }, + { + "epoch": 0.2190001918682737, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 25111 + }, + { + "epoch": 0.2190089131534423, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 25112 + }, + { + "epoch": 0.2190176344386109, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 25113 + }, + { + "epoch": 0.21902635572377946, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 25114 + }, + { + "epoch": 0.21903507700894803, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 25115 + }, + { + "epoch": 0.21904379829411663, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 25116 + }, + { + "epoch": 0.2190525195792852, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 25117 + }, + { + "epoch": 0.21906124086445378, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 25118 + }, + { + "epoch": 0.21906996214962238, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 25119 + }, + { + "epoch": 0.21907868343479095, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 25120 + }, + { + "epoch": 0.21908740471995952, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 25121 + }, + { + "epoch": 0.21909612600512812, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 25122 + }, + { + "epoch": 0.2191048472902967, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 25123 + }, + { + "epoch": 0.21911356857546527, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 25124 + }, + { + "epoch": 0.21912228986063387, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 25125 + }, + { + "epoch": 0.21913101114580244, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 25126 + }, + { + "epoch": 0.21913973243097104, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 25127 + }, + { + "epoch": 0.2191484537161396, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 25128 + }, + { + "epoch": 0.21915717500130819, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 25129 + }, + { + "epoch": 0.21916589628647679, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 25130 + }, + { + "epoch": 0.21917461757164536, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 25131 + }, + { + "epoch": 0.21918333885681393, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 25132 + }, + { + "epoch": 0.21919206014198253, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 25133 + }, + { + "epoch": 0.2192007814271511, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 25134 + }, + { + "epoch": 0.21920950271231968, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 25135 + }, + { + "epoch": 0.21921822399748828, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 25136 + }, + { + "epoch": 0.21922694528265685, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 25137 + }, + { + "epoch": 0.21923566656782542, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 25138 + }, + { + "epoch": 0.21924438785299402, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 25139 + }, + { + "epoch": 0.2192531091381626, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 25140 + }, + { + "epoch": 0.2192618304233312, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 25141 + }, + { + "epoch": 0.21927055170849977, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 25142 + }, + { + "epoch": 0.21927927299366834, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 25143 + }, + { + "epoch": 0.21928799427883694, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 25144 + }, + { + "epoch": 0.2192967155640055, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 25145 + }, + { + "epoch": 0.21930543684917408, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 25146 + }, + { + "epoch": 0.21931415813434268, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 25147 + }, + { + "epoch": 0.21932287941951126, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 25148 + }, + { + "epoch": 0.21933160070467983, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 25149 + }, + { + "epoch": 0.21934032198984843, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 25150 + }, + { + "epoch": 0.219349043275017, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 25151 + }, + { + "epoch": 0.21935776456018558, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 25152 + }, + { + "epoch": 0.21936648584535418, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 25153 + }, + { + "epoch": 0.21937520713052275, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 25154 + }, + { + "epoch": 0.21938392841569135, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 25155 + }, + { + "epoch": 0.21939264970085992, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 25156 + }, + { + "epoch": 0.2194013709860285, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 25157 + }, + { + "epoch": 0.2194100922711971, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 25158 + }, + { + "epoch": 0.21941881355636567, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 25159 + }, + { + "epoch": 0.21942753484153424, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 25160 + }, + { + "epoch": 0.21943625612670284, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 25161 + }, + { + "epoch": 0.2194449774118714, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 25162 + }, + { + "epoch": 0.21945369869703998, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 25163 + }, + { + "epoch": 0.21946241998220858, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 25164 + }, + { + "epoch": 0.21947114126737716, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 25165 + }, + { + "epoch": 0.21947986255254576, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 25166 + }, + { + "epoch": 0.21948858383771433, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 25167 + }, + { + "epoch": 0.2194973051228829, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 25168 + }, + { + "epoch": 0.2195060264080515, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 25169 + }, + { + "epoch": 0.21951474769322007, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 25170 + }, + { + "epoch": 0.21952346897838865, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 25171 + }, + { + "epoch": 0.21953219026355725, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 25172 + }, + { + "epoch": 0.21954091154872582, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 25173 + }, + { + "epoch": 0.2195496328338944, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 25174 + }, + { + "epoch": 0.219558354119063, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 25175 + }, + { + "epoch": 0.21956707540423157, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 25176 + }, + { + "epoch": 0.21957579668940014, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 25177 + }, + { + "epoch": 0.21958451797456874, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 25178 + }, + { + "epoch": 0.2195932392597373, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 25179 + }, + { + "epoch": 0.2196019605449059, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 25180 + }, + { + "epoch": 0.21961068183007448, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 25181 + }, + { + "epoch": 0.21961940311524306, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 25182 + }, + { + "epoch": 0.21962812440041166, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 25183 + }, + { + "epoch": 0.21963684568558023, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 25184 + }, + { + "epoch": 0.2196455669707488, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 25185 + }, + { + "epoch": 0.2196542882559174, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 25186 + }, + { + "epoch": 0.21966300954108597, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 25187 + }, + { + "epoch": 0.21967173082625455, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 25188 + }, + { + "epoch": 0.21968045211142315, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 25189 + }, + { + "epoch": 0.21968917339659172, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 25190 + }, + { + "epoch": 0.2196978946817603, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 25191 + }, + { + "epoch": 0.2197066159669289, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 25192 + }, + { + "epoch": 0.21971533725209746, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 25193 + }, + { + "epoch": 0.21972405853726606, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 25194 + }, + { + "epoch": 0.21973277982243464, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 25195 + }, + { + "epoch": 0.2197415011076032, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 25196 + }, + { + "epoch": 0.2197502223927718, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 25197 + }, + { + "epoch": 0.21975894367794038, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 0.9848, + "step": 25198 + }, + { + "epoch": 0.21976766496310896, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 25199 + }, + { + "epoch": 0.21977638624827756, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 25200 + }, + { + "epoch": 0.21978510753344613, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 25201 + }, + { + "epoch": 0.2197938288186147, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 25202 + }, + { + "epoch": 0.2198025501037833, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 25203 + }, + { + "epoch": 0.21981127138895187, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 25204 + }, + { + "epoch": 0.21981999267412045, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 25205 + }, + { + "epoch": 0.21982871395928905, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 25206 + }, + { + "epoch": 0.21983743524445762, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 25207 + }, + { + "epoch": 0.21984615652962622, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 25208 + }, + { + "epoch": 0.2198548778147948, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 25209 + }, + { + "epoch": 0.21986359909996336, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 25210 + }, + { + "epoch": 0.21987232038513196, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 25211 + }, + { + "epoch": 0.21988104167030054, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 25212 + }, + { + "epoch": 0.2198897629554691, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 25213 + }, + { + "epoch": 0.2198984842406377, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 25214 + }, + { + "epoch": 0.21990720552580628, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 25215 + }, + { + "epoch": 0.21991592681097485, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 25216 + }, + { + "epoch": 0.21992464809614345, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 25217 + }, + { + "epoch": 0.21993336938131203, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 25218 + }, + { + "epoch": 0.2199420906664806, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 25219 + }, + { + "epoch": 0.2199508119516492, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 25220 + }, + { + "epoch": 0.21995953323681777, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 25221 + }, + { + "epoch": 0.21996825452198637, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 25222 + }, + { + "epoch": 0.21997697580715495, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 25223 + }, + { + "epoch": 0.21998569709232352, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 25224 + }, + { + "epoch": 0.21999441837749212, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 25225 + }, + { + "epoch": 0.2200031396626607, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 25226 + }, + { + "epoch": 0.22001186094782926, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 25227 + }, + { + "epoch": 0.22002058223299786, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 25228 + }, + { + "epoch": 0.22002930351816644, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 25229 + }, + { + "epoch": 0.220038024803335, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 25230 + }, + { + "epoch": 0.2200467460885036, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 25231 + }, + { + "epoch": 0.22005546737367218, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 25232 + }, + { + "epoch": 0.22006418865884075, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 25233 + }, + { + "epoch": 0.22007290994400935, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 25234 + }, + { + "epoch": 0.22008163122917793, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 25235 + }, + { + "epoch": 0.22009035251434653, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 25236 + }, + { + "epoch": 0.2200990737995151, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 25237 + }, + { + "epoch": 0.22010779508468367, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 25238 + }, + { + "epoch": 0.22011651636985227, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 25239 + }, + { + "epoch": 0.22012523765502084, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 25240 + }, + { + "epoch": 0.22013395894018942, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 25241 + }, + { + "epoch": 0.22014268022535802, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 25242 + }, + { + "epoch": 0.2201514015105266, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 25243 + }, + { + "epoch": 0.22016012279569516, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 25244 + }, + { + "epoch": 0.22016884408086376, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 25245 + }, + { + "epoch": 0.22017756536603234, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 25246 + }, + { + "epoch": 0.2201862866512009, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 25247 + }, + { + "epoch": 0.2201950079363695, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 25248 + }, + { + "epoch": 0.22020372922153808, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 25249 + }, + { + "epoch": 0.22021245050670668, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 25250 + }, + { + "epoch": 0.22022117179187525, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 25251 + }, + { + "epoch": 0.22022989307704383, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 25252 + }, + { + "epoch": 0.22023861436221243, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 25253 + }, + { + "epoch": 0.220247335647381, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 25254 + }, + { + "epoch": 0.22025605693254957, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 25255 + }, + { + "epoch": 0.22026477821771817, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 25256 + }, + { + "epoch": 0.22027349950288674, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 25257 + }, + { + "epoch": 0.22028222078805532, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 25258 + }, + { + "epoch": 0.22029094207322392, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 25259 + }, + { + "epoch": 0.2202996633583925, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 25260 + }, + { + "epoch": 0.22030838464356106, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 25261 + }, + { + "epoch": 0.22031710592872966, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 25262 + }, + { + "epoch": 0.22032582721389823, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 25263 + }, + { + "epoch": 0.22033454849906683, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 25264 + }, + { + "epoch": 0.2203432697842354, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 25265 + }, + { + "epoch": 0.22035199106940398, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 25266 + }, + { + "epoch": 0.22036071235457258, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 25267 + }, + { + "epoch": 0.22036943363974115, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 25268 + }, + { + "epoch": 0.22037815492490972, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 25269 + }, + { + "epoch": 0.22038687621007833, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 25270 + }, + { + "epoch": 0.2203955974952469, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 25271 + }, + { + "epoch": 0.22040431878041547, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 25272 + }, + { + "epoch": 0.22041304006558407, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 25273 + }, + { + "epoch": 0.22042176135075264, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 25274 + }, + { + "epoch": 0.22043048263592122, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 25275 + }, + { + "epoch": 0.22043920392108982, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 25276 + }, + { + "epoch": 0.2204479252062584, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 25277 + }, + { + "epoch": 0.220456646491427, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 25278 + }, + { + "epoch": 0.22046536777659556, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 25279 + }, + { + "epoch": 0.22047408906176413, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 25280 + }, + { + "epoch": 0.22048281034693273, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 25281 + }, + { + "epoch": 0.2204915316321013, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 25282 + }, + { + "epoch": 0.22050025291726988, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 25283 + }, + { + "epoch": 0.22050897420243848, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 25284 + }, + { + "epoch": 0.22051769548760705, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 25285 + }, + { + "epoch": 0.22052641677277562, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 25286 + }, + { + "epoch": 0.22053513805794422, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 25287 + }, + { + "epoch": 0.2205438593431128, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 25288 + }, + { + "epoch": 0.2205525806282814, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 25289 + }, + { + "epoch": 0.22056130191344997, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 25290 + }, + { + "epoch": 0.22057002319861854, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 25291 + }, + { + "epoch": 0.22057874448378714, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 25292 + }, + { + "epoch": 0.22058746576895572, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 25293 + }, + { + "epoch": 0.2205961870541243, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 25294 + }, + { + "epoch": 0.2206049083392929, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 25295 + }, + { + "epoch": 0.22061362962446146, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 25296 + }, + { + "epoch": 0.22062235090963003, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 25297 + }, + { + "epoch": 0.22063107219479863, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 25298 + }, + { + "epoch": 0.2206397934799672, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 25299 + }, + { + "epoch": 0.22064851476513578, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 25300 + }, + { + "epoch": 0.22065723605030438, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 25301 + }, + { + "epoch": 0.22066595733547295, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 25302 + }, + { + "epoch": 0.22067467862064155, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 25303 + }, + { + "epoch": 0.22068339990581012, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 25304 + }, + { + "epoch": 0.2206921211909787, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 25305 + }, + { + "epoch": 0.2207008424761473, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 25306 + }, + { + "epoch": 0.22070956376131587, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 25307 + }, + { + "epoch": 0.22071828504648444, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 25308 + }, + { + "epoch": 0.22072700633165304, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 25309 + }, + { + "epoch": 0.22073572761682161, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 25310 + }, + { + "epoch": 0.2207444489019902, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 25311 + }, + { + "epoch": 0.2207531701871588, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 25312 + }, + { + "epoch": 0.22076189147232736, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 25313 + }, + { + "epoch": 0.22077061275749593, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 25314 + }, + { + "epoch": 0.22077933404266453, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 25315 + }, + { + "epoch": 0.2207880553278331, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 25316 + }, + { + "epoch": 0.2207967766130017, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 25317 + }, + { + "epoch": 0.22080549789817028, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 25318 + }, + { + "epoch": 0.22081421918333885, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 25319 + }, + { + "epoch": 0.22082294046850745, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 25320 + }, + { + "epoch": 0.22083166175367602, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 25321 + }, + { + "epoch": 0.2208403830388446, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 25322 + }, + { + "epoch": 0.2208491043240132, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 25323 + }, + { + "epoch": 0.22085782560918177, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 25324 + }, + { + "epoch": 0.22086654689435034, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 25325 + }, + { + "epoch": 0.22087526817951894, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 25326 + }, + { + "epoch": 0.2208839894646875, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 25327 + }, + { + "epoch": 0.22089271074985609, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 25328 + }, + { + "epoch": 0.2209014320350247, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 25329 + }, + { + "epoch": 0.22091015332019326, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 25330 + }, + { + "epoch": 0.22091887460536186, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 25331 + }, + { + "epoch": 0.22092759589053043, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 25332 + }, + { + "epoch": 0.220936317175699, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 25333 + }, + { + "epoch": 0.2209450384608676, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 25334 + }, + { + "epoch": 0.22095375974603618, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 25335 + }, + { + "epoch": 0.22096248103120475, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 25336 + }, + { + "epoch": 0.22097120231637335, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 25337 + }, + { + "epoch": 0.22097992360154192, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 25338 + }, + { + "epoch": 0.2209886448867105, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 25339 + }, + { + "epoch": 0.2209973661718791, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 25340 + }, + { + "epoch": 0.22100608745704767, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.986, + "step": 25341 + }, + { + "epoch": 0.22101480874221624, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 25342 + }, + { + "epoch": 0.22102353002738484, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 25343 + }, + { + "epoch": 0.2210322513125534, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 25344 + }, + { + "epoch": 0.221040972597722, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 25345 + }, + { + "epoch": 0.22104969388289059, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 25346 + }, + { + "epoch": 0.22105841516805916, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 25347 + }, + { + "epoch": 0.22106713645322776, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 25348 + }, + { + "epoch": 0.22107585773839633, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 25349 + }, + { + "epoch": 0.2210845790235649, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 25350 + }, + { + "epoch": 0.2210933003087335, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 25351 + }, + { + "epoch": 0.22110202159390208, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 25352 + }, + { + "epoch": 0.22111074287907065, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 25353 + }, + { + "epoch": 0.22111946416423925, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 25354 + }, + { + "epoch": 0.22112818544940782, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 25355 + }, + { + "epoch": 0.2211369067345764, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 25356 + }, + { + "epoch": 0.221145628019745, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 25357 + }, + { + "epoch": 0.22115434930491357, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 25358 + }, + { + "epoch": 0.22116307059008217, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 25359 + }, + { + "epoch": 0.22117179187525074, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 25360 + }, + { + "epoch": 0.2211805131604193, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 25361 + }, + { + "epoch": 0.2211892344455879, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 25362 + }, + { + "epoch": 0.22119795573075648, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 25363 + }, + { + "epoch": 0.22120667701592506, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 25364 + }, + { + "epoch": 0.22121539830109366, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 25365 + }, + { + "epoch": 0.22122411958626223, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 25366 + }, + { + "epoch": 0.2212328408714308, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 25367 + }, + { + "epoch": 0.2212415621565994, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 25368 + }, + { + "epoch": 0.22125028344176798, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 25369 + }, + { + "epoch": 0.22125900472693655, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 25370 + }, + { + "epoch": 0.22126772601210515, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 25371 + }, + { + "epoch": 0.22127644729727372, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 25372 + }, + { + "epoch": 0.22128516858244232, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 25373 + }, + { + "epoch": 0.2212938898676109, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 25374 + }, + { + "epoch": 0.22130261115277947, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 25375 + }, + { + "epoch": 0.22131133243794807, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 25376 + }, + { + "epoch": 0.22132005372311664, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 25377 + }, + { + "epoch": 0.2213287750082852, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 25378 + }, + { + "epoch": 0.2213374962934538, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 25379 + }, + { + "epoch": 0.22134621757862238, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 25380 + }, + { + "epoch": 0.22135493886379096, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 25381 + }, + { + "epoch": 0.22136366014895956, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 25382 + }, + { + "epoch": 0.22137238143412813, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 25383 + }, + { + "epoch": 0.2213811027192967, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 25384 + }, + { + "epoch": 0.2213898240044653, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 25385 + }, + { + "epoch": 0.22139854528963387, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 25386 + }, + { + "epoch": 0.22140726657480247, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 25387 + }, + { + "epoch": 0.22141598785997105, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 25388 + }, + { + "epoch": 0.22142470914513962, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 25389 + }, + { + "epoch": 0.22143343043030822, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 25390 + }, + { + "epoch": 0.2214421517154768, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 25391 + }, + { + "epoch": 0.22145087300064537, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 25392 + }, + { + "epoch": 0.22145959428581397, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 25393 + }, + { + "epoch": 0.22146831557098254, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 25394 + }, + { + "epoch": 0.2214770368561511, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 25395 + }, + { + "epoch": 0.2214857581413197, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 25396 + }, + { + "epoch": 0.22149447942648828, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 25397 + }, + { + "epoch": 0.22150320071165688, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 25398 + }, + { + "epoch": 0.22151192199682546, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 25399 + }, + { + "epoch": 0.22152064328199403, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 25400 + }, + { + "epoch": 0.22152936456716263, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 25401 + }, + { + "epoch": 0.2215380858523312, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 25402 + }, + { + "epoch": 0.22154680713749977, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 25403 + }, + { + "epoch": 0.22155552842266837, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 25404 + }, + { + "epoch": 0.22156424970783695, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 25405 + }, + { + "epoch": 0.22157297099300552, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 25406 + }, + { + "epoch": 0.22158169227817412, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 25407 + }, + { + "epoch": 0.2215904135633427, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 25408 + }, + { + "epoch": 0.22159913484851126, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 25409 + }, + { + "epoch": 0.22160785613367986, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 25410 + }, + { + "epoch": 0.22161657741884844, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 25411 + }, + { + "epoch": 0.22162529870401704, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 25412 + }, + { + "epoch": 0.2216340199891856, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 25413 + }, + { + "epoch": 0.22164274127435418, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 25414 + }, + { + "epoch": 0.22165146255952278, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 25415 + }, + { + "epoch": 0.22166018384469136, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 25416 + }, + { + "epoch": 0.22166890512985993, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 25417 + }, + { + "epoch": 0.22167762641502853, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 25418 + }, + { + "epoch": 0.2216863477001971, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 25419 + }, + { + "epoch": 0.22169506898536567, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 25420 + }, + { + "epoch": 0.22170379027053427, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 25421 + }, + { + "epoch": 0.22171251155570285, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 25422 + }, + { + "epoch": 0.22172123284087142, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 25423 + }, + { + "epoch": 0.22172995412604002, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 25424 + }, + { + "epoch": 0.2217386754112086, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 25425 + }, + { + "epoch": 0.2217473966963772, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 25426 + }, + { + "epoch": 0.22175611798154576, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 25427 + }, + { + "epoch": 0.22176483926671434, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 25428 + }, + { + "epoch": 0.22177356055188294, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 25429 + }, + { + "epoch": 0.2217822818370515, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 25430 + }, + { + "epoch": 0.22179100312222008, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 25431 + }, + { + "epoch": 0.22179972440738868, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 25432 + }, + { + "epoch": 0.22180844569255725, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 25433 + }, + { + "epoch": 0.22181716697772583, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 25434 + }, + { + "epoch": 0.22182588826289443, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 25435 + }, + { + "epoch": 0.221834609548063, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 25436 + }, + { + "epoch": 0.22184333083323157, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 25437 + }, + { + "epoch": 0.22185205211840017, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 25438 + }, + { + "epoch": 0.22186077340356875, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 25439 + }, + { + "epoch": 0.22186949468873735, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 25440 + }, + { + "epoch": 0.22187821597390592, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 25441 + }, + { + "epoch": 0.2218869372590745, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 25442 + }, + { + "epoch": 0.2218956585442431, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 25443 + }, + { + "epoch": 0.22190437982941166, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 25444 + }, + { + "epoch": 0.22191310111458024, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 25445 + }, + { + "epoch": 0.22192182239974884, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 25446 + }, + { + "epoch": 0.2219305436849174, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 25447 + }, + { + "epoch": 0.22193926497008598, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 25448 + }, + { + "epoch": 0.22194798625525458, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 25449 + }, + { + "epoch": 0.22195670754042315, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 25450 + }, + { + "epoch": 0.22196542882559173, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 25451 + }, + { + "epoch": 0.22197415011076033, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 25452 + }, + { + "epoch": 0.2219828713959289, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 25453 + }, + { + "epoch": 0.2219915926810975, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 25454 + }, + { + "epoch": 0.22200031396626607, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 25455 + }, + { + "epoch": 0.22200903525143464, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 25456 + }, + { + "epoch": 0.22201775653660324, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 25457 + }, + { + "epoch": 0.22202647782177182, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.975, + "step": 25458 + }, + { + "epoch": 0.2220351991069404, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 25459 + }, + { + "epoch": 0.222043920392109, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 25460 + }, + { + "epoch": 0.22205264167727756, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 25461 + }, + { + "epoch": 0.22206136296244613, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 25462 + }, + { + "epoch": 0.22207008424761474, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 25463 + }, + { + "epoch": 0.2220788055327833, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 25464 + }, + { + "epoch": 0.22208752681795188, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 25465 + }, + { + "epoch": 0.22209624810312048, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 25466 + }, + { + "epoch": 0.22210496938828905, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 25467 + }, + { + "epoch": 0.22211369067345765, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 25468 + }, + { + "epoch": 0.22212241195862623, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 25469 + }, + { + "epoch": 0.2221311332437948, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 25470 + }, + { + "epoch": 0.2221398545289634, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 25471 + }, + { + "epoch": 0.22214857581413197, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 25472 + }, + { + "epoch": 0.22215729709930054, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 25473 + }, + { + "epoch": 0.22216601838446914, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 25474 + }, + { + "epoch": 0.22217473966963772, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 25475 + }, + { + "epoch": 0.2221834609548063, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 25476 + }, + { + "epoch": 0.2221921822399749, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 25477 + }, + { + "epoch": 0.22220090352514346, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 25478 + }, + { + "epoch": 0.22220962481031203, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 25479 + }, + { + "epoch": 0.22221834609548063, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 25480 + }, + { + "epoch": 0.2222270673806492, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 25481 + }, + { + "epoch": 0.2222357886658178, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 25482 + }, + { + "epoch": 0.22224450995098638, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 25483 + }, + { + "epoch": 0.22225323123615495, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 25484 + }, + { + "epoch": 0.22226195252132355, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 25485 + }, + { + "epoch": 0.22227067380649213, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 25486 + }, + { + "epoch": 0.2222793950916607, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 25487 + }, + { + "epoch": 0.2222881163768293, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 25488 + }, + { + "epoch": 0.22229683766199787, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 25489 + }, + { + "epoch": 0.22230555894716644, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 25490 + }, + { + "epoch": 0.22231428023233504, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 25491 + }, + { + "epoch": 0.22232300151750362, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 25492 + }, + { + "epoch": 0.2223317228026722, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 25493 + }, + { + "epoch": 0.2223404440878408, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 25494 + }, + { + "epoch": 0.22234916537300936, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 25495 + }, + { + "epoch": 0.22235788665817796, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 25496 + }, + { + "epoch": 0.22236660794334653, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 25497 + }, + { + "epoch": 0.2223753292285151, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 25498 + }, + { + "epoch": 0.2223840505136837, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 25499 + }, + { + "epoch": 0.22239277179885228, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 25500 + }, + { + "epoch": 0.22240149308402085, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 25501 + }, + { + "epoch": 0.22241021436918945, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 25502 + }, + { + "epoch": 0.22241893565435802, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 25503 + }, + { + "epoch": 0.2224276569395266, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 25504 + }, + { + "epoch": 0.2224363782246952, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 25505 + }, + { + "epoch": 0.22244509950986377, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 25506 + }, + { + "epoch": 0.22245382079503237, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 25507 + }, + { + "epoch": 0.22246254208020094, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 25508 + }, + { + "epoch": 0.22247126336536951, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 25509 + }, + { + "epoch": 0.22247998465053812, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 25510 + }, + { + "epoch": 0.2224887059357067, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 25511 + }, + { + "epoch": 0.22249742722087526, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 25512 + }, + { + "epoch": 0.22250614850604386, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 25513 + }, + { + "epoch": 0.22251486979121243, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 25514 + }, + { + "epoch": 0.222523591076381, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 25515 + }, + { + "epoch": 0.2225323123615496, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 25516 + }, + { + "epoch": 0.22254103364671818, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 25517 + }, + { + "epoch": 0.22254975493188675, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 25518 + }, + { + "epoch": 0.22255847621705535, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 25519 + }, + { + "epoch": 0.22256719750222392, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 25520 + }, + { + "epoch": 0.22257591878739252, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 25521 + }, + { + "epoch": 0.2225846400725611, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 25522 + }, + { + "epoch": 0.22259336135772967, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 25523 + }, + { + "epoch": 0.22260208264289827, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 25524 + }, + { + "epoch": 0.22261080392806684, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 25525 + }, + { + "epoch": 0.22261952521323541, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 25526 + }, + { + "epoch": 0.22262824649840401, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 25527 + }, + { + "epoch": 0.2226369677835726, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 25528 + }, + { + "epoch": 0.22264568906874116, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 25529 + }, + { + "epoch": 0.22265441035390976, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 25530 + }, + { + "epoch": 0.22266313163907833, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 25531 + }, + { + "epoch": 0.2226718529242469, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 25532 + }, + { + "epoch": 0.2226805742094155, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 25533 + }, + { + "epoch": 0.22268929549458408, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 25534 + }, + { + "epoch": 0.22269801677975268, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 25535 + }, + { + "epoch": 0.22270673806492125, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 25536 + }, + { + "epoch": 0.22271545935008982, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 25537 + }, + { + "epoch": 0.22272418063525842, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 25538 + }, + { + "epoch": 0.222732901920427, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 25539 + }, + { + "epoch": 0.22274162320559557, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 25540 + }, + { + "epoch": 0.22275034449076417, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 25541 + }, + { + "epoch": 0.22275906577593274, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 25542 + }, + { + "epoch": 0.2227677870611013, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 25543 + }, + { + "epoch": 0.2227765083462699, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 25544 + }, + { + "epoch": 0.22278522963143849, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 25545 + }, + { + "epoch": 0.22279395091660706, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 25546 + }, + { + "epoch": 0.22280267220177566, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 25547 + }, + { + "epoch": 0.22281139348694423, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 25548 + }, + { + "epoch": 0.22282011477211283, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 25549 + }, + { + "epoch": 0.2228288360572814, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 25550 + }, + { + "epoch": 0.22283755734244998, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 25551 + }, + { + "epoch": 0.22284627862761858, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 25552 + }, + { + "epoch": 0.22285499991278715, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 25553 + }, + { + "epoch": 0.22286372119795572, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 25554 + }, + { + "epoch": 0.22287244248312432, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 25555 + }, + { + "epoch": 0.2228811637682929, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 25556 + }, + { + "epoch": 0.22288988505346147, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 25557 + }, + { + "epoch": 0.22289860633863007, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 25558 + }, + { + "epoch": 0.22290732762379864, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 25559 + }, + { + "epoch": 0.2229160489089672, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 25560 + }, + { + "epoch": 0.2229247701941358, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 25561 + }, + { + "epoch": 0.22293349147930439, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 25562 + }, + { + "epoch": 0.22294221276447299, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 25563 + }, + { + "epoch": 0.22295093404964156, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 25564 + }, + { + "epoch": 0.22295965533481013, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 25565 + }, + { + "epoch": 0.22296837661997873, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 25566 + }, + { + "epoch": 0.2229770979051473, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 25567 + }, + { + "epoch": 0.22298581919031588, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 25568 + }, + { + "epoch": 0.22299454047548448, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 25569 + }, + { + "epoch": 0.22300326176065305, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 25570 + }, + { + "epoch": 0.22301198304582162, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 25571 + }, + { + "epoch": 0.22302070433099022, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 25572 + }, + { + "epoch": 0.2230294256161588, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 25573 + }, + { + "epoch": 0.22303814690132737, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 25574 + }, + { + "epoch": 0.22304686818649597, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 25575 + }, + { + "epoch": 0.22305558947166454, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 25576 + }, + { + "epoch": 0.22306431075683314, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 25577 + }, + { + "epoch": 0.2230730320420017, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 25578 + }, + { + "epoch": 0.22308175332717028, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 25579 + }, + { + "epoch": 0.22309047461233888, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 25580 + }, + { + "epoch": 0.22309919589750746, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 25581 + }, + { + "epoch": 0.22310791718267603, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 25582 + }, + { + "epoch": 0.22311663846784463, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 25583 + }, + { + "epoch": 0.2231253597530132, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 25584 + }, + { + "epoch": 0.22313408103818178, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 25585 + }, + { + "epoch": 0.22314280232335038, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 25586 + }, + { + "epoch": 0.22315152360851895, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 25587 + }, + { + "epoch": 0.22316024489368752, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 25588 + }, + { + "epoch": 0.22316896617885612, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 25589 + }, + { + "epoch": 0.2231776874640247, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 25590 + }, + { + "epoch": 0.2231864087491933, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 25591 + }, + { + "epoch": 0.22319513003436187, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 25592 + }, + { + "epoch": 0.22320385131953044, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 25593 + }, + { + "epoch": 0.22321257260469904, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 25594 + }, + { + "epoch": 0.2232212938898676, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 25595 + }, + { + "epoch": 0.22323001517503618, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 25596 + }, + { + "epoch": 0.22323873646020478, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 25597 + }, + { + "epoch": 0.22324745774537336, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 25598 + }, + { + "epoch": 0.22325617903054193, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 25599 + }, + { + "epoch": 0.22326490031571053, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 25600 + }, + { + "epoch": 0.2232736216008791, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 25601 + }, + { + "epoch": 0.22328234288604767, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 25602 + }, + { + "epoch": 0.22329106417121627, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 25603 + }, + { + "epoch": 0.22329978545638485, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 25604 + }, + { + "epoch": 0.22330850674155345, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 25605 + }, + { + "epoch": 0.22331722802672202, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 25606 + }, + { + "epoch": 0.2233259493118906, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 25607 + }, + { + "epoch": 0.2233346705970592, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 25608 + }, + { + "epoch": 0.22334339188222777, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 25609 + }, + { + "epoch": 0.22335211316739634, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 25610 + }, + { + "epoch": 0.22336083445256494, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 25611 + }, + { + "epoch": 0.2233695557377335, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 25612 + }, + { + "epoch": 0.22337827702290208, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 25613 + }, + { + "epoch": 0.22338699830807068, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 25614 + }, + { + "epoch": 0.22339571959323926, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 25615 + }, + { + "epoch": 0.22340444087840783, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 25616 + }, + { + "epoch": 0.22341316216357643, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 25617 + }, + { + "epoch": 0.223421883448745, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 25618 + }, + { + "epoch": 0.2234306047339136, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 25619 + }, + { + "epoch": 0.22343932601908217, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 25620 + }, + { + "epoch": 0.22344804730425075, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 25621 + }, + { + "epoch": 0.22345676858941935, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 25622 + }, + { + "epoch": 0.22346548987458792, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 25623 + }, + { + "epoch": 0.2234742111597565, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 25624 + }, + { + "epoch": 0.2234829324449251, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9823, + "step": 25625 + }, + { + "epoch": 0.22349165373009366, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 25626 + }, + { + "epoch": 0.22350037501526224, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 25627 + }, + { + "epoch": 0.22350909630043084, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 25628 + }, + { + "epoch": 0.2235178175855994, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 25629 + }, + { + "epoch": 0.223526538870768, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 25630 + }, + { + "epoch": 0.22353526015593658, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 25631 + }, + { + "epoch": 0.22354398144110516, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 25632 + }, + { + "epoch": 0.22355270272627376, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 25633 + }, + { + "epoch": 0.22356142401144233, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 25634 + }, + { + "epoch": 0.2235701452966109, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 25635 + }, + { + "epoch": 0.2235788665817795, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 25636 + }, + { + "epoch": 0.22358758786694807, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 25637 + }, + { + "epoch": 0.22359630915211665, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 25638 + }, + { + "epoch": 0.22360503043728525, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 25639 + }, + { + "epoch": 0.22361375172245382, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 25640 + }, + { + "epoch": 0.2236224730076224, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 25641 + }, + { + "epoch": 0.223631194292791, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 25642 + }, + { + "epoch": 0.22363991557795956, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 25643 + }, + { + "epoch": 0.22364863686312816, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 25644 + }, + { + "epoch": 0.22365735814829674, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 25645 + }, + { + "epoch": 0.2236660794334653, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 25646 + }, + { + "epoch": 0.2236748007186339, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 25647 + }, + { + "epoch": 0.22368352200380248, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 25648 + }, + { + "epoch": 0.22369224328897105, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 25649 + }, + { + "epoch": 0.22370096457413965, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 25650 + }, + { + "epoch": 0.22370968585930823, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 25651 + }, + { + "epoch": 0.2237184071444768, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 25652 + }, + { + "epoch": 0.2237271284296454, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 25653 + }, + { + "epoch": 0.22373584971481397, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 25654 + }, + { + "epoch": 0.22374457099998254, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 25655 + }, + { + "epoch": 0.22375329228515115, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 25656 + }, + { + "epoch": 0.22376201357031972, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 25657 + }, + { + "epoch": 0.22377073485548832, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 25658 + }, + { + "epoch": 0.2237794561406569, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 25659 + }, + { + "epoch": 0.22378817742582546, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 25660 + }, + { + "epoch": 0.22379689871099406, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 25661 + }, + { + "epoch": 0.22380561999616264, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 25662 + }, + { + "epoch": 0.2238143412813312, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 25663 + }, + { + "epoch": 0.2238230625664998, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 25664 + }, + { + "epoch": 0.22383178385166838, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 25665 + }, + { + "epoch": 0.22384050513683695, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 25666 + }, + { + "epoch": 0.22384922642200555, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 25667 + }, + { + "epoch": 0.22385794770717413, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 25668 + }, + { + "epoch": 0.2238666689923427, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 25669 + }, + { + "epoch": 0.2238753902775113, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 25670 + }, + { + "epoch": 0.22388411156267987, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 25671 + }, + { + "epoch": 0.22389283284784847, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 25672 + }, + { + "epoch": 0.22390155413301704, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 25673 + }, + { + "epoch": 0.22391027541818562, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 25674 + }, + { + "epoch": 0.22391899670335422, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 25675 + }, + { + "epoch": 0.2239277179885228, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 25676 + }, + { + "epoch": 0.22393643927369136, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 25677 + }, + { + "epoch": 0.22394516055885996, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 25678 + }, + { + "epoch": 0.22395388184402854, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 25679 + }, + { + "epoch": 0.2239626031291971, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 25680 + }, + { + "epoch": 0.2239713244143657, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 25681 + }, + { + "epoch": 0.22398004569953428, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 25682 + }, + { + "epoch": 0.22398876698470285, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 25683 + }, + { + "epoch": 0.22399748826987145, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 25684 + }, + { + "epoch": 0.22400620955504003, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 25685 + }, + { + "epoch": 0.22401493084020863, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 25686 + }, + { + "epoch": 0.2240236521253772, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 25687 + }, + { + "epoch": 0.22403237341054577, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 25688 + }, + { + "epoch": 0.22404109469571437, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 25689 + }, + { + "epoch": 0.22404981598088294, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 25690 + }, + { + "epoch": 0.22405853726605152, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 25691 + }, + { + "epoch": 0.22406725855122012, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 25692 + }, + { + "epoch": 0.2240759798363887, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 25693 + }, + { + "epoch": 0.22408470112155726, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 25694 + }, + { + "epoch": 0.22409342240672586, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 25695 + }, + { + "epoch": 0.22410214369189443, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 25696 + }, + { + "epoch": 0.224110864977063, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 25697 + }, + { + "epoch": 0.2241195862622316, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 25698 + }, + { + "epoch": 0.22412830754740018, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 25699 + }, + { + "epoch": 0.22413702883256878, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 25700 + }, + { + "epoch": 0.22414575011773735, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 25701 + }, + { + "epoch": 0.22415447140290592, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 25702 + }, + { + "epoch": 0.22416319268807453, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 25703 + }, + { + "epoch": 0.2241719139732431, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 25704 + }, + { + "epoch": 0.22418063525841167, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 25705 + }, + { + "epoch": 0.22418935654358027, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 25706 + }, + { + "epoch": 0.22419807782874884, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 25707 + }, + { + "epoch": 0.22420679911391742, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 25708 + }, + { + "epoch": 0.22421552039908602, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 25709 + }, + { + "epoch": 0.2242242416842546, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 25710 + }, + { + "epoch": 0.22423296296942316, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 25711 + }, + { + "epoch": 0.22424168425459176, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 25712 + }, + { + "epoch": 0.22425040553976033, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 25713 + }, + { + "epoch": 0.22425912682492893, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 25714 + }, + { + "epoch": 0.2242678481100975, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 25715 + }, + { + "epoch": 0.22427656939526608, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 25716 + }, + { + "epoch": 0.22428529068043468, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 25717 + }, + { + "epoch": 0.22429401196560325, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 25718 + }, + { + "epoch": 0.22430273325077182, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 25719 + }, + { + "epoch": 0.22431145453594042, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 25720 + }, + { + "epoch": 0.224320175821109, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 25721 + }, + { + "epoch": 0.22432889710627757, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 25722 + }, + { + "epoch": 0.22433761839144617, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 25723 + }, + { + "epoch": 0.22434633967661474, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 25724 + }, + { + "epoch": 0.22435506096178331, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 25725 + }, + { + "epoch": 0.22436378224695191, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 25726 + }, + { + "epoch": 0.2243725035321205, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 25727 + }, + { + "epoch": 0.2243812248172891, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 25728 + }, + { + "epoch": 0.22438994610245766, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 25729 + }, + { + "epoch": 0.22439866738762623, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 25730 + }, + { + "epoch": 0.22440738867279483, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 25731 + }, + { + "epoch": 0.2244161099579634, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 25732 + }, + { + "epoch": 0.22442483124313198, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 25733 + }, + { + "epoch": 0.22443355252830058, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 25734 + }, + { + "epoch": 0.22444227381346915, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 25735 + }, + { + "epoch": 0.22445099509863772, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 25736 + }, + { + "epoch": 0.22445971638380632, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 25737 + }, + { + "epoch": 0.2244684376689749, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 25738 + }, + { + "epoch": 0.2244771589541435, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 25739 + }, + { + "epoch": 0.22448588023931207, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 25740 + }, + { + "epoch": 0.22449460152448064, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 25741 + }, + { + "epoch": 0.22450332280964924, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 25742 + }, + { + "epoch": 0.22451204409481781, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 25743 + }, + { + "epoch": 0.2245207653799864, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 25744 + }, + { + "epoch": 0.224529486665155, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 25745 + }, + { + "epoch": 0.22453820795032356, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 25746 + }, + { + "epoch": 0.22454692923549213, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 25747 + }, + { + "epoch": 0.22455565052066073, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 25748 + }, + { + "epoch": 0.2245643718058293, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 25749 + }, + { + "epoch": 0.22457309309099788, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 25750 + }, + { + "epoch": 0.22458181437616648, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 25751 + }, + { + "epoch": 0.22459053566133505, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 25752 + }, + { + "epoch": 0.22459925694650365, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 25753 + }, + { + "epoch": 0.22460797823167222, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 25754 + }, + { + "epoch": 0.2246166995168408, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 25755 + }, + { + "epoch": 0.2246254208020094, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 25756 + }, + { + "epoch": 0.22463414208717797, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 25757 + }, + { + "epoch": 0.22464286337234654, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 25758 + }, + { + "epoch": 0.22465158465751514, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 25759 + }, + { + "epoch": 0.2246603059426837, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 25760 + }, + { + "epoch": 0.22466902722785229, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 25761 + }, + { + "epoch": 0.2246777485130209, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 25762 + }, + { + "epoch": 0.22468646979818946, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 25763 + }, + { + "epoch": 0.22469519108335803, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 25764 + }, + { + "epoch": 0.22470391236852663, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 25765 + }, + { + "epoch": 0.2247126336536952, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 25766 + }, + { + "epoch": 0.2247213549388638, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 25767 + }, + { + "epoch": 0.22473007622403238, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 25768 + }, + { + "epoch": 0.22473879750920095, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 25769 + }, + { + "epoch": 0.22474751879436955, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 25770 + }, + { + "epoch": 0.22475624007953812, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 25771 + }, + { + "epoch": 0.2247649613647067, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 25772 + }, + { + "epoch": 0.2247736826498753, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 25773 + }, + { + "epoch": 0.22478240393504387, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 25774 + }, + { + "epoch": 0.22479112522021244, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 25775 + }, + { + "epoch": 0.22479984650538104, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 25776 + }, + { + "epoch": 0.2248085677905496, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 25777 + }, + { + "epoch": 0.22481728907571819, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 25778 + }, + { + "epoch": 0.22482601036088679, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 25779 + }, + { + "epoch": 0.22483473164605536, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 25780 + }, + { + "epoch": 0.22484345293122396, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 25781 + }, + { + "epoch": 0.22485217421639253, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 25782 + }, + { + "epoch": 0.2248608955015611, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 25783 + }, + { + "epoch": 0.2248696167867297, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 25784 + }, + { + "epoch": 0.22487833807189828, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 25785 + }, + { + "epoch": 0.22488705935706685, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 25786 + }, + { + "epoch": 0.22489578064223545, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 25787 + }, + { + "epoch": 0.22490450192740402, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 25788 + }, + { + "epoch": 0.2249132232125726, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 25789 + }, + { + "epoch": 0.2249219444977412, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 25790 + }, + { + "epoch": 0.22493066578290977, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 25791 + }, + { + "epoch": 0.22493938706807834, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 25792 + }, + { + "epoch": 0.22494810835324694, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 25793 + }, + { + "epoch": 0.2249568296384155, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 25794 + }, + { + "epoch": 0.2249655509235841, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 25795 + }, + { + "epoch": 0.22497427220875268, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 25796 + }, + { + "epoch": 0.22498299349392126, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 25797 + }, + { + "epoch": 0.22499171477908986, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 25798 + }, + { + "epoch": 0.22500043606425843, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 25799 + }, + { + "epoch": 0.225009157349427, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 25800 + }, + { + "epoch": 0.2250178786345956, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 25801 + }, + { + "epoch": 0.22502659991976418, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 25802 + }, + { + "epoch": 0.22503532120493275, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 25803 + }, + { + "epoch": 0.22504404249010135, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 25804 + }, + { + "epoch": 0.22505276377526992, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 25805 + }, + { + "epoch": 0.2250614850604385, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 25806 + }, + { + "epoch": 0.2250702063456071, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 25807 + }, + { + "epoch": 0.22507892763077567, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 25808 + }, + { + "epoch": 0.22508764891594427, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 25809 + }, + { + "epoch": 0.22509637020111284, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 25810 + }, + { + "epoch": 0.2251050914862814, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 25811 + }, + { + "epoch": 0.22511381277145, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 25812 + }, + { + "epoch": 0.22512253405661858, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 25813 + }, + { + "epoch": 0.22513125534178716, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 25814 + }, + { + "epoch": 0.22513997662695576, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 25815 + }, + { + "epoch": 0.22514869791212433, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 25816 + }, + { + "epoch": 0.2251574191972929, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 25817 + }, + { + "epoch": 0.2251661404824615, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 25818 + }, + { + "epoch": 0.22517486176763007, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 25819 + }, + { + "epoch": 0.22518358305279865, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 25820 + }, + { + "epoch": 0.22519230433796725, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 25821 + }, + { + "epoch": 0.22520102562313582, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 25822 + }, + { + "epoch": 0.22520974690830442, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 25823 + }, + { + "epoch": 0.225218468193473, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 25824 + }, + { + "epoch": 0.22522718947864157, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 25825 + }, + { + "epoch": 0.22523591076381017, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 25826 + }, + { + "epoch": 0.22524463204897874, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 25827 + }, + { + "epoch": 0.2252533533341473, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 25828 + }, + { + "epoch": 0.2252620746193159, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 25829 + }, + { + "epoch": 0.22527079590448448, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 25830 + }, + { + "epoch": 0.22527951718965306, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 25831 + }, + { + "epoch": 0.22528823847482166, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 25832 + }, + { + "epoch": 0.22529695975999023, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 25833 + }, + { + "epoch": 0.2253056810451588, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 25834 + }, + { + "epoch": 0.2253144023303274, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 25835 + }, + { + "epoch": 0.22532312361549597, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 25836 + }, + { + "epoch": 0.22533184490066457, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 25837 + }, + { + "epoch": 0.22534056618583315, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 25838 + }, + { + "epoch": 0.22534928747100172, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 25839 + }, + { + "epoch": 0.22535800875617032, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 25840 + }, + { + "epoch": 0.2253667300413389, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 25841 + }, + { + "epoch": 0.22537545132650746, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 25842 + }, + { + "epoch": 0.22538417261167606, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 25843 + }, + { + "epoch": 0.22539289389684464, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 25844 + }, + { + "epoch": 0.2254016151820132, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 25845 + }, + { + "epoch": 0.2254103364671818, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 25846 + }, + { + "epoch": 0.22541905775235038, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 25847 + }, + { + "epoch": 0.22542777903751895, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 25848 + }, + { + "epoch": 0.22543650032268756, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 25849 + }, + { + "epoch": 0.22544522160785613, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 25850 + }, + { + "epoch": 0.22545394289302473, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 25851 + }, + { + "epoch": 0.2254626641781933, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 25852 + }, + { + "epoch": 0.22547138546336187, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 25853 + }, + { + "epoch": 0.22548010674853047, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 25854 + }, + { + "epoch": 0.22548882803369905, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 25855 + }, + { + "epoch": 0.22549754931886762, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 25856 + }, + { + "epoch": 0.22550627060403622, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 25857 + }, + { + "epoch": 0.2255149918892048, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 25858 + }, + { + "epoch": 0.22552371317437336, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 25859 + }, + { + "epoch": 0.22553243445954196, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 25860 + }, + { + "epoch": 0.22554115574471054, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 25861 + }, + { + "epoch": 0.22554987702987914, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 25862 + }, + { + "epoch": 0.2255585983150477, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 25863 + }, + { + "epoch": 0.22556731960021628, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 25864 + }, + { + "epoch": 0.22557604088538488, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 25865 + }, + { + "epoch": 0.22558476217055345, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 25866 + }, + { + "epoch": 0.22559348345572203, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 25867 + }, + { + "epoch": 0.22560220474089063, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 25868 + }, + { + "epoch": 0.2256109260260592, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 25869 + }, + { + "epoch": 0.22561964731122777, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 25870 + }, + { + "epoch": 0.22562836859639637, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 25871 + }, + { + "epoch": 0.22563708988156495, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 25872 + }, + { + "epoch": 0.22564581116673352, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 25873 + }, + { + "epoch": 0.22565453245190212, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 25874 + }, + { + "epoch": 0.2256632537370707, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 25875 + }, + { + "epoch": 0.2256719750222393, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 25876 + }, + { + "epoch": 0.22568069630740786, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 25877 + }, + { + "epoch": 0.22568941759257644, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 25878 + }, + { + "epoch": 0.22569813887774504, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 25879 + }, + { + "epoch": 0.2257068601629136, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 25880 + }, + { + "epoch": 0.22571558144808218, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 25881 + }, + { + "epoch": 0.22572430273325078, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 25882 + }, + { + "epoch": 0.22573302401841935, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 25883 + }, + { + "epoch": 0.22574174530358793, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 25884 + }, + { + "epoch": 0.22575046658875653, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 25885 + }, + { + "epoch": 0.2257591878739251, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 25886 + }, + { + "epoch": 0.22576790915909367, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 25887 + }, + { + "epoch": 0.22577663044426227, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 25888 + }, + { + "epoch": 0.22578535172943084, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 25889 + }, + { + "epoch": 0.22579407301459944, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 25890 + }, + { + "epoch": 0.22580279429976802, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 25891 + }, + { + "epoch": 0.2258115155849366, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 25892 + }, + { + "epoch": 0.2258202368701052, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 25893 + }, + { + "epoch": 0.22582895815527376, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 25894 + }, + { + "epoch": 0.22583767944044233, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 25895 + }, + { + "epoch": 0.22584640072561094, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 25896 + }, + { + "epoch": 0.2258551220107795, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 25897 + }, + { + "epoch": 0.22586384329594808, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 25898 + }, + { + "epoch": 0.22587256458111668, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 25899 + }, + { + "epoch": 0.22588128586628525, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 25900 + }, + { + "epoch": 0.22589000715145383, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 25901 + }, + { + "epoch": 0.22589872843662243, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 25902 + }, + { + "epoch": 0.225907449721791, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 25903 + }, + { + "epoch": 0.2259161710069596, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 25904 + }, + { + "epoch": 0.22592489229212817, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 25905 + }, + { + "epoch": 0.22593361357729674, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 25906 + }, + { + "epoch": 0.22594233486246534, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 25907 + }, + { + "epoch": 0.22595105614763392, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 25908 + }, + { + "epoch": 0.2259597774328025, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 25909 + }, + { + "epoch": 0.2259684987179711, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 25910 + }, + { + "epoch": 0.22597722000313966, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 25911 + }, + { + "epoch": 0.22598594128830823, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 25912 + }, + { + "epoch": 0.22599466257347683, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 25913 + }, + { + "epoch": 0.2260033838586454, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 25914 + }, + { + "epoch": 0.22601210514381398, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 25915 + }, + { + "epoch": 0.22602082642898258, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 25916 + }, + { + "epoch": 0.22602954771415115, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 25917 + }, + { + "epoch": 0.22603826899931975, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 25918 + }, + { + "epoch": 0.22604699028448832, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 25919 + }, + { + "epoch": 0.2260557115696569, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 25920 + }, + { + "epoch": 0.2260644328548255, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 25921 + }, + { + "epoch": 0.22607315413999407, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 25922 + }, + { + "epoch": 0.22608187542516264, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 25923 + }, + { + "epoch": 0.22609059671033124, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 25924 + }, + { + "epoch": 0.22609931799549982, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 25925 + }, + { + "epoch": 0.2261080392806684, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 25926 + }, + { + "epoch": 0.226116760565837, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 25927 + }, + { + "epoch": 0.22612548185100556, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 25928 + }, + { + "epoch": 0.22613420313617413, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 25929 + }, + { + "epoch": 0.22614292442134273, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 25930 + }, + { + "epoch": 0.2261516457065113, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 25931 + }, + { + "epoch": 0.2261603669916799, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 25932 + }, + { + "epoch": 0.22616908827684848, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 25933 + }, + { + "epoch": 0.22617780956201705, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 25934 + }, + { + "epoch": 0.22618653084718565, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 25935 + }, + { + "epoch": 0.22619525213235422, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 25936 + }, + { + "epoch": 0.2262039734175228, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 25937 + }, + { + "epoch": 0.2262126947026914, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 25938 + }, + { + "epoch": 0.22622141598785997, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 25939 + }, + { + "epoch": 0.22623013727302854, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 25940 + }, + { + "epoch": 0.22623885855819714, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 25941 + }, + { + "epoch": 0.22624757984336571, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 25942 + }, + { + "epoch": 0.2262563011285343, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 25943 + }, + { + "epoch": 0.2262650224137029, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 25944 + }, + { + "epoch": 0.22627374369887146, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 25945 + }, + { + "epoch": 0.22628246498404006, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 25946 + }, + { + "epoch": 0.22629118626920863, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 25947 + }, + { + "epoch": 0.2262999075543772, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 25948 + }, + { + "epoch": 0.2263086288395458, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 25949 + }, + { + "epoch": 0.22631735012471438, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 25950 + }, + { + "epoch": 0.22632607140988295, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 25951 + }, + { + "epoch": 0.22633479269505155, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 25952 + }, + { + "epoch": 0.22634351398022012, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 25953 + }, + { + "epoch": 0.2263522352653887, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 25954 + }, + { + "epoch": 0.2263609565505573, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 25955 + }, + { + "epoch": 0.22636967783572587, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 25956 + }, + { + "epoch": 0.22637839912089444, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 25957 + }, + { + "epoch": 0.22638712040606304, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 25958 + }, + { + "epoch": 0.22639584169123161, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 25959 + }, + { + "epoch": 0.22640456297640021, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 25960 + }, + { + "epoch": 0.2264132842615688, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 25961 + }, + { + "epoch": 0.22642200554673736, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 25962 + }, + { + "epoch": 0.22643072683190596, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 25963 + }, + { + "epoch": 0.22643944811707453, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 25964 + }, + { + "epoch": 0.2264481694022431, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 25965 + }, + { + "epoch": 0.2264568906874117, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 25966 + }, + { + "epoch": 0.22646561197258028, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 25967 + }, + { + "epoch": 0.22647433325774885, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 25968 + }, + { + "epoch": 0.22648305454291745, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 25969 + }, + { + "epoch": 0.22649177582808602, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 25970 + }, + { + "epoch": 0.22650049711325462, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 25971 + }, + { + "epoch": 0.2265092183984232, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 25972 + }, + { + "epoch": 0.22651793968359177, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 25973 + }, + { + "epoch": 0.22652666096876037, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 25974 + }, + { + "epoch": 0.22653538225392894, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 25975 + }, + { + "epoch": 0.2265441035390975, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 25976 + }, + { + "epoch": 0.2265528248242661, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 25977 + }, + { + "epoch": 0.22656154610943469, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 25978 + }, + { + "epoch": 0.22657026739460326, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 25979 + }, + { + "epoch": 0.22657898867977186, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 25980 + }, + { + "epoch": 0.22658770996494043, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 25981 + }, + { + "epoch": 0.226596431250109, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 25982 + }, + { + "epoch": 0.2266051525352776, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 25983 + }, + { + "epoch": 0.22661387382044618, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 25984 + }, + { + "epoch": 0.22662259510561478, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 25985 + }, + { + "epoch": 0.22663131639078335, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 25986 + }, + { + "epoch": 0.22664003767595192, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 25987 + }, + { + "epoch": 0.22664875896112052, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 25988 + }, + { + "epoch": 0.2266574802462891, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 25989 + }, + { + "epoch": 0.22666620153145767, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 25990 + }, + { + "epoch": 0.22667492281662627, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 25991 + }, + { + "epoch": 0.22668364410179484, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 25992 + }, + { + "epoch": 0.2266923653869634, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 25993 + }, + { + "epoch": 0.226701086672132, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 25994 + }, + { + "epoch": 0.22670980795730059, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 25995 + }, + { + "epoch": 0.22671852924246916, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 25996 + }, + { + "epoch": 0.22672725052763776, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 25997 + }, + { + "epoch": 0.22673597181280633, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 25998 + }, + { + "epoch": 0.22674469309797493, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 25999 + }, + { + "epoch": 0.2267534143831435, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 26000 + }, + { + "epoch": 0.22676213566831208, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 26001 + }, + { + "epoch": 0.22677085695348068, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 26002 + }, + { + "epoch": 0.22677957823864925, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 26003 + }, + { + "epoch": 0.22678829952381782, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 26004 + }, + { + "epoch": 0.22679702080898642, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 26005 + }, + { + "epoch": 0.226805742094155, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 26006 + }, + { + "epoch": 0.22681446337932357, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 26007 + }, + { + "epoch": 0.22682318466449217, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 26008 + }, + { + "epoch": 0.22683190594966074, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 26009 + }, + { + "epoch": 0.2268406272348293, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 26010 + }, + { + "epoch": 0.2268493485199979, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 26011 + }, + { + "epoch": 0.22685806980516648, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 26012 + }, + { + "epoch": 0.22686679109033508, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 26013 + }, + { + "epoch": 0.22687551237550366, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 26014 + }, + { + "epoch": 0.22688423366067223, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 26015 + }, + { + "epoch": 0.22689295494584083, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 26016 + }, + { + "epoch": 0.2269016762310094, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 26017 + }, + { + "epoch": 0.22691039751617798, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 26018 + }, + { + "epoch": 0.22691911880134658, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 26019 + }, + { + "epoch": 0.22692784008651515, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 26020 + }, + { + "epoch": 0.22693656137168372, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 26021 + }, + { + "epoch": 0.22694528265685232, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 26022 + }, + { + "epoch": 0.2269540039420209, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 26023 + }, + { + "epoch": 0.22696272522718947, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 26024 + }, + { + "epoch": 0.22697144651235807, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 26025 + }, + { + "epoch": 0.22698016779752664, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 26026 + }, + { + "epoch": 0.22698888908269524, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 26027 + }, + { + "epoch": 0.2269976103678638, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 26028 + }, + { + "epoch": 0.22700633165303238, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 26029 + }, + { + "epoch": 0.22701505293820098, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 26030 + }, + { + "epoch": 0.22702377422336956, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 26031 + }, + { + "epoch": 0.22703249550853813, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 26032 + }, + { + "epoch": 0.22704121679370673, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 26033 + }, + { + "epoch": 0.2270499380788753, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 26034 + }, + { + "epoch": 0.22705865936404387, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 26035 + }, + { + "epoch": 0.22706738064921247, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 26036 + }, + { + "epoch": 0.22707610193438105, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 26037 + }, + { + "epoch": 0.22708482321954962, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 26038 + }, + { + "epoch": 0.22709354450471822, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 26039 + }, + { + "epoch": 0.2271022657898868, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 26040 + }, + { + "epoch": 0.2271109870750554, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 26041 + }, + { + "epoch": 0.22711970836022397, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 26042 + }, + { + "epoch": 0.22712842964539254, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 26043 + }, + { + "epoch": 0.22713715093056114, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 26044 + }, + { + "epoch": 0.2271458722157297, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 26045 + }, + { + "epoch": 0.22715459350089828, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 26046 + }, + { + "epoch": 0.22716331478606688, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 26047 + }, + { + "epoch": 0.22717203607123546, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 26048 + }, + { + "epoch": 0.22718075735640403, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 26049 + }, + { + "epoch": 0.22718947864157263, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 26050 + }, + { + "epoch": 0.2271981999267412, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 26051 + }, + { + "epoch": 0.22720692121190977, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.986, + "step": 26052 + }, + { + "epoch": 0.22721564249707837, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 26053 + }, + { + "epoch": 0.22722436378224695, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 26054 + }, + { + "epoch": 0.22723308506741555, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 26055 + }, + { + "epoch": 0.22724180635258412, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 26056 + }, + { + "epoch": 0.2272505276377527, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 26057 + }, + { + "epoch": 0.2272592489229213, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 26058 + }, + { + "epoch": 0.22726797020808986, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 26059 + }, + { + "epoch": 0.22727669149325844, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 26060 + }, + { + "epoch": 0.22728541277842704, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 26061 + }, + { + "epoch": 0.2272941340635956, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 26062 + }, + { + "epoch": 0.22730285534876418, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 26063 + }, + { + "epoch": 0.22731157663393278, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 26064 + }, + { + "epoch": 0.22732029791910136, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 26065 + }, + { + "epoch": 0.22732901920426993, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 26066 + }, + { + "epoch": 0.22733774048943853, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 26067 + }, + { + "epoch": 0.2273464617746071, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 26068 + }, + { + "epoch": 0.2273551830597757, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 26069 + }, + { + "epoch": 0.22736390434494427, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 26070 + }, + { + "epoch": 0.22737262563011285, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 26071 + }, + { + "epoch": 0.22738134691528145, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 26072 + }, + { + "epoch": 0.22739006820045002, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 26073 + }, + { + "epoch": 0.2273987894856186, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 26074 + }, + { + "epoch": 0.2274075107707872, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 26075 + }, + { + "epoch": 0.22741623205595576, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 26076 + }, + { + "epoch": 0.22742495334112434, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 26077 + }, + { + "epoch": 0.22743367462629294, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 26078 + }, + { + "epoch": 0.2274423959114615, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 26079 + }, + { + "epoch": 0.22745111719663008, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 26080 + }, + { + "epoch": 0.22745983848179868, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 26081 + }, + { + "epoch": 0.22746855976696725, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 26082 + }, + { + "epoch": 0.22747728105213585, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 26083 + }, + { + "epoch": 0.22748600233730443, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 26084 + }, + { + "epoch": 0.227494723622473, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 26085 + }, + { + "epoch": 0.2275034449076416, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 26086 + }, + { + "epoch": 0.22751216619281017, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 26087 + }, + { + "epoch": 0.22752088747797874, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 26088 + }, + { + "epoch": 0.22752960876314735, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 26089 + }, + { + "epoch": 0.22753833004831592, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 26090 + }, + { + "epoch": 0.2275470513334845, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 26091 + }, + { + "epoch": 0.2275557726186531, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 26092 + }, + { + "epoch": 0.22756449390382166, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 26093 + }, + { + "epoch": 0.22757321518899026, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 26094 + }, + { + "epoch": 0.22758193647415884, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 26095 + }, + { + "epoch": 0.2275906577593274, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 26096 + }, + { + "epoch": 0.227599379044496, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 26097 + }, + { + "epoch": 0.22760810032966458, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 26098 + }, + { + "epoch": 0.22761682161483315, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 26099 + }, + { + "epoch": 0.22762554290000175, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 26100 + }, + { + "epoch": 0.22763426418517033, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 26101 + }, + { + "epoch": 0.2276429854703389, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 26102 + }, + { + "epoch": 0.2276517067555075, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 26103 + }, + { + "epoch": 0.22766042804067607, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 26104 + }, + { + "epoch": 0.22766914932584464, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 26105 + }, + { + "epoch": 0.22767787061101324, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 26106 + }, + { + "epoch": 0.22768659189618182, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 26107 + }, + { + "epoch": 0.22769531318135042, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 26108 + }, + { + "epoch": 0.227704034466519, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 26109 + }, + { + "epoch": 0.22771275575168756, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 26110 + }, + { + "epoch": 0.22772147703685616, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 26111 + }, + { + "epoch": 0.22773019832202474, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 26112 + }, + { + "epoch": 0.2277389196071933, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.981, + "step": 26113 + }, + { + "epoch": 0.2277476408923619, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 26114 + }, + { + "epoch": 0.22775636217753048, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 26115 + }, + { + "epoch": 0.22776508346269905, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 26116 + }, + { + "epoch": 0.22777380474786765, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 26117 + }, + { + "epoch": 0.22778252603303623, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 26118 + }, + { + "epoch": 0.2277912473182048, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 26119 + }, + { + "epoch": 0.2277999686033734, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 26120 + }, + { + "epoch": 0.22780868988854197, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 26121 + }, + { + "epoch": 0.22781741117371057, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 26122 + }, + { + "epoch": 0.22782613245887914, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 26123 + }, + { + "epoch": 0.22783485374404772, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 26124 + }, + { + "epoch": 0.22784357502921632, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 26125 + }, + { + "epoch": 0.2278522963143849, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 26126 + }, + { + "epoch": 0.22786101759955346, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 26127 + }, + { + "epoch": 0.22786973888472206, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 26128 + }, + { + "epoch": 0.22787846016989063, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 26129 + }, + { + "epoch": 0.2278871814550592, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 26130 + }, + { + "epoch": 0.2278959027402278, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 26131 + }, + { + "epoch": 0.22790462402539638, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 26132 + }, + { + "epoch": 0.22791334531056495, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 26133 + }, + { + "epoch": 0.22792206659573355, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 26134 + }, + { + "epoch": 0.22793078788090212, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 26135 + }, + { + "epoch": 0.22793950916607073, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 26136 + }, + { + "epoch": 0.2279482304512393, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 26137 + }, + { + "epoch": 0.22795695173640787, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 26138 + }, + { + "epoch": 0.22796567302157647, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 26139 + }, + { + "epoch": 0.22797439430674504, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 26140 + }, + { + "epoch": 0.22798311559191362, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 26141 + }, + { + "epoch": 0.22799183687708222, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 26142 + }, + { + "epoch": 0.2280005581622508, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 26143 + }, + { + "epoch": 0.22800927944741936, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 26144 + }, + { + "epoch": 0.22801800073258796, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 26145 + }, + { + "epoch": 0.22802672201775653, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 26146 + }, + { + "epoch": 0.2280354433029251, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 26147 + }, + { + "epoch": 0.2280441645880937, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 26148 + }, + { + "epoch": 0.22805288587326228, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 26149 + }, + { + "epoch": 0.22806160715843088, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 26150 + }, + { + "epoch": 0.22807032844359945, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 26151 + }, + { + "epoch": 0.22807904972876802, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 26152 + }, + { + "epoch": 0.22808777101393662, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9854, + "step": 26153 + }, + { + "epoch": 0.2280964922991052, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 26154 + }, + { + "epoch": 0.22810521358427377, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 26155 + }, + { + "epoch": 0.22811393486944237, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 26156 + }, + { + "epoch": 0.22812265615461094, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 26157 + }, + { + "epoch": 0.22813137743977951, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 26158 + }, + { + "epoch": 0.22814009872494811, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 26159 + }, + { + "epoch": 0.2281488200101167, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 26160 + }, + { + "epoch": 0.22815754129528526, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 26161 + }, + { + "epoch": 0.22816626258045386, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 26162 + }, + { + "epoch": 0.22817498386562243, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 26163 + }, + { + "epoch": 0.22818370515079103, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 26164 + }, + { + "epoch": 0.2281924264359596, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 26165 + }, + { + "epoch": 0.22820114772112818, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 26166 + }, + { + "epoch": 0.22820986900629678, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 26167 + }, + { + "epoch": 0.22821859029146535, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 26168 + }, + { + "epoch": 0.22822731157663392, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 26169 + }, + { + "epoch": 0.22823603286180252, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 26170 + }, + { + "epoch": 0.2282447541469711, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 26171 + }, + { + "epoch": 0.22825347543213967, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 26172 + }, + { + "epoch": 0.22826219671730827, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 26173 + }, + { + "epoch": 0.22827091800247684, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 26174 + }, + { + "epoch": 0.22827963928764541, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 26175 + }, + { + "epoch": 0.22828836057281401, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 26176 + }, + { + "epoch": 0.2282970818579826, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 26177 + }, + { + "epoch": 0.2283058031431512, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 26178 + }, + { + "epoch": 0.22831452442831976, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 26179 + }, + { + "epoch": 0.22832324571348833, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 26180 + }, + { + "epoch": 0.22833196699865693, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 26181 + }, + { + "epoch": 0.2283406882838255, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 26182 + }, + { + "epoch": 0.22834940956899408, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 26183 + }, + { + "epoch": 0.22835813085416268, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 26184 + }, + { + "epoch": 0.22836685213933125, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 26185 + }, + { + "epoch": 0.22837557342449982, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 26186 + }, + { + "epoch": 0.22838429470966842, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 26187 + }, + { + "epoch": 0.228393015994837, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 26188 + }, + { + "epoch": 0.22840173728000557, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 26189 + }, + { + "epoch": 0.22841045856517417, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 26190 + }, + { + "epoch": 0.22841917985034274, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 26191 + }, + { + "epoch": 0.22842790113551134, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 26192 + }, + { + "epoch": 0.2284366224206799, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 26193 + }, + { + "epoch": 0.22844534370584849, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 26194 + }, + { + "epoch": 0.2284540649910171, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 26195 + }, + { + "epoch": 0.22846278627618566, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 26196 + }, + { + "epoch": 0.22847150756135423, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 26197 + }, + { + "epoch": 0.22848022884652283, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 26198 + }, + { + "epoch": 0.2284889501316914, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 26199 + }, + { + "epoch": 0.22849767141685998, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 26200 + }, + { + "epoch": 0.22850639270202858, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 26201 + }, + { + "epoch": 0.22851511398719715, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 26202 + }, + { + "epoch": 0.22852383527236575, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 26203 + }, + { + "epoch": 0.22853255655753432, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 26204 + }, + { + "epoch": 0.2285412778427029, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 26205 + }, + { + "epoch": 0.2285499991278715, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 26206 + }, + { + "epoch": 0.22855872041304007, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 26207 + }, + { + "epoch": 0.22856744169820864, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 26208 + }, + { + "epoch": 0.22857616298337724, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 26209 + }, + { + "epoch": 0.2285848842685458, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 26210 + }, + { + "epoch": 0.22859360555371439, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 26211 + }, + { + "epoch": 0.22860232683888299, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 26212 + }, + { + "epoch": 0.22861104812405156, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 26213 + }, + { + "epoch": 0.22861976940922013, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 26214 + }, + { + "epoch": 0.22862849069438873, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 26215 + }, + { + "epoch": 0.2286372119795573, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 26216 + }, + { + "epoch": 0.2286459332647259, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 26217 + }, + { + "epoch": 0.22865465454989448, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 26218 + }, + { + "epoch": 0.22866337583506305, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 26219 + }, + { + "epoch": 0.22867209712023165, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 26220 + }, + { + "epoch": 0.22868081840540022, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 26221 + }, + { + "epoch": 0.2286895396905688, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 26222 + }, + { + "epoch": 0.2286982609757374, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 26223 + }, + { + "epoch": 0.22870698226090597, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 26224 + }, + { + "epoch": 0.22871570354607454, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 26225 + }, + { + "epoch": 0.22872442483124314, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 26226 + }, + { + "epoch": 0.2287331461164117, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 26227 + }, + { + "epoch": 0.22874186740158028, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 26228 + }, + { + "epoch": 0.22875058868674888, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 26229 + }, + { + "epoch": 0.22875930997191746, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 26230 + }, + { + "epoch": 0.22876803125708606, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 26231 + }, + { + "epoch": 0.22877675254225463, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 26232 + }, + { + "epoch": 0.2287854738274232, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 26233 + }, + { + "epoch": 0.2287941951125918, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 26234 + }, + { + "epoch": 0.22880291639776038, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 26235 + }, + { + "epoch": 0.22881163768292895, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 26236 + }, + { + "epoch": 0.22882035896809755, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 26237 + }, + { + "epoch": 0.22882908025326612, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 26238 + }, + { + "epoch": 0.2288378015384347, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 26239 + }, + { + "epoch": 0.2288465228236033, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 26240 + }, + { + "epoch": 0.22885524410877187, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 26241 + }, + { + "epoch": 0.22886396539394044, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 26242 + }, + { + "epoch": 0.22887268667910904, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 26243 + }, + { + "epoch": 0.2288814079642776, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 26244 + }, + { + "epoch": 0.2288901292494462, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 26245 + }, + { + "epoch": 0.22889885053461478, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 26246 + }, + { + "epoch": 0.22890757181978336, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 26247 + }, + { + "epoch": 0.22891629310495196, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 26248 + }, + { + "epoch": 0.22892501439012053, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 26249 + }, + { + "epoch": 0.2289337356752891, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 26250 + }, + { + "epoch": 0.2289424569604577, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 26251 + }, + { + "epoch": 0.22895117824562627, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 26252 + }, + { + "epoch": 0.22895989953079485, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 26253 + }, + { + "epoch": 0.22896862081596345, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 26254 + }, + { + "epoch": 0.22897734210113202, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 26255 + }, + { + "epoch": 0.2289860633863006, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 26256 + }, + { + "epoch": 0.2289947846714692, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 26257 + }, + { + "epoch": 0.22900350595663777, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 26258 + }, + { + "epoch": 0.22901222724180637, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 26259 + }, + { + "epoch": 0.22902094852697494, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 26260 + }, + { + "epoch": 0.2290296698121435, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 26261 + }, + { + "epoch": 0.2290383910973121, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 26262 + }, + { + "epoch": 0.22904711238248068, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 26263 + }, + { + "epoch": 0.22905583366764926, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 26264 + }, + { + "epoch": 0.22906455495281786, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 26265 + }, + { + "epoch": 0.22907327623798643, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 26266 + }, + { + "epoch": 0.229081997523155, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 26267 + }, + { + "epoch": 0.2290907188083236, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 26268 + }, + { + "epoch": 0.22909944009349217, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 26269 + }, + { + "epoch": 0.22910816137866075, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 26270 + }, + { + "epoch": 0.22911688266382935, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 26271 + }, + { + "epoch": 0.22912560394899792, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 26272 + }, + { + "epoch": 0.22913432523416652, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 26273 + }, + { + "epoch": 0.2291430465193351, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 26274 + }, + { + "epoch": 0.22915176780450366, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 26275 + }, + { + "epoch": 0.22916048908967226, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 26276 + }, + { + "epoch": 0.22916921037484084, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 26277 + }, + { + "epoch": 0.2291779316600094, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 26278 + }, + { + "epoch": 0.229186652945178, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 26279 + }, + { + "epoch": 0.22919537423034658, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 26280 + }, + { + "epoch": 0.22920409551551515, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 26281 + }, + { + "epoch": 0.22921281680068376, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 26282 + }, + { + "epoch": 0.22922153808585233, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 26283 + }, + { + "epoch": 0.2292302593710209, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 26284 + }, + { + "epoch": 0.2292389806561895, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 26285 + }, + { + "epoch": 0.22924770194135807, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 26286 + }, + { + "epoch": 0.22925642322652667, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 26287 + }, + { + "epoch": 0.22926514451169525, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 26288 + }, + { + "epoch": 0.22927386579686382, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 26289 + }, + { + "epoch": 0.22928258708203242, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 26290 + }, + { + "epoch": 0.229291308367201, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 26291 + }, + { + "epoch": 0.22930002965236956, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 26292 + }, + { + "epoch": 0.22930875093753816, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 26293 + }, + { + "epoch": 0.22931747222270674, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 26294 + }, + { + "epoch": 0.2293261935078753, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 26295 + }, + { + "epoch": 0.2293349147930439, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 26296 + }, + { + "epoch": 0.22934363607821248, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 26297 + }, + { + "epoch": 0.22935235736338105, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 26298 + }, + { + "epoch": 0.22936107864854965, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 26299 + }, + { + "epoch": 0.22936979993371823, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 26300 + }, + { + "epoch": 0.22937852121888683, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 26301 + }, + { + "epoch": 0.2293872425040554, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 26302 + }, + { + "epoch": 0.22939596378922397, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 26303 + }, + { + "epoch": 0.22940468507439257, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 26304 + }, + { + "epoch": 0.22941340635956115, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 26305 + }, + { + "epoch": 0.22942212764472972, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 26306 + }, + { + "epoch": 0.22943084892989832, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 26307 + }, + { + "epoch": 0.2294395702150669, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 26308 + }, + { + "epoch": 0.22944829150023546, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 26309 + }, + { + "epoch": 0.22945701278540406, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 26310 + }, + { + "epoch": 0.22946573407057264, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 26311 + }, + { + "epoch": 0.22947445535574124, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 26312 + }, + { + "epoch": 0.2294831766409098, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 26313 + }, + { + "epoch": 0.22949189792607838, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 26314 + }, + { + "epoch": 0.22950061921124698, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 26315 + }, + { + "epoch": 0.22950934049641555, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 26316 + }, + { + "epoch": 0.22951806178158413, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 26317 + }, + { + "epoch": 0.22952678306675273, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 26318 + }, + { + "epoch": 0.2295355043519213, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 26319 + }, + { + "epoch": 0.22954422563708987, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 26320 + }, + { + "epoch": 0.22955294692225847, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 26321 + }, + { + "epoch": 0.22956166820742704, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 26322 + }, + { + "epoch": 0.22957038949259562, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 26323 + }, + { + "epoch": 0.22957911077776422, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 26324 + }, + { + "epoch": 0.2295878320629328, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 26325 + }, + { + "epoch": 0.2295965533481014, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 26326 + }, + { + "epoch": 0.22960527463326996, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 26327 + }, + { + "epoch": 0.22961399591843853, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 26328 + }, + { + "epoch": 0.22962271720360714, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 26329 + }, + { + "epoch": 0.2296314384887757, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 26330 + }, + { + "epoch": 0.22964015977394428, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 26331 + }, + { + "epoch": 0.22964888105911288, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 26332 + }, + { + "epoch": 0.22965760234428145, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 26333 + }, + { + "epoch": 0.22966632362945003, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 26334 + }, + { + "epoch": 0.22967504491461863, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 26335 + }, + { + "epoch": 0.2296837661997872, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 26336 + }, + { + "epoch": 0.22969248748495577, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 26337 + }, + { + "epoch": 0.22970120877012437, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 26338 + }, + { + "epoch": 0.22970993005529294, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 26339 + }, + { + "epoch": 0.22971865134046154, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 26340 + }, + { + "epoch": 0.22972737262563012, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 26341 + }, + { + "epoch": 0.2297360939107987, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 26342 + }, + { + "epoch": 0.2297448151959673, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 26343 + }, + { + "epoch": 0.22975353648113586, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 26344 + }, + { + "epoch": 0.22976225776630443, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 26345 + }, + { + "epoch": 0.22977097905147303, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 26346 + }, + { + "epoch": 0.2297797003366416, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 26347 + }, + { + "epoch": 0.22978842162181018, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 26348 + }, + { + "epoch": 0.22979714290697878, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 26349 + }, + { + "epoch": 0.22980586419214735, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 26350 + }, + { + "epoch": 0.22981458547731592, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 26351 + }, + { + "epoch": 0.22982330676248452, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 26352 + }, + { + "epoch": 0.2298320280476531, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 26353 + }, + { + "epoch": 0.2298407493328217, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 26354 + }, + { + "epoch": 0.22984947061799027, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 26355 + }, + { + "epoch": 0.22985819190315884, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 26356 + }, + { + "epoch": 0.22986691318832744, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 26357 + }, + { + "epoch": 0.22987563447349602, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 26358 + }, + { + "epoch": 0.2298843557586646, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 26359 + }, + { + "epoch": 0.2298930770438332, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 26360 + }, + { + "epoch": 0.22990179832900176, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 26361 + }, + { + "epoch": 0.22991051961417033, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 26362 + }, + { + "epoch": 0.22991924089933893, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 26363 + }, + { + "epoch": 0.2299279621845075, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 26364 + }, + { + "epoch": 0.22993668346967608, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 26365 + }, + { + "epoch": 0.22994540475484468, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 26366 + }, + { + "epoch": 0.22995412604001325, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 26367 + }, + { + "epoch": 0.22996284732518185, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 26368 + }, + { + "epoch": 0.22997156861035042, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 26369 + }, + { + "epoch": 0.229980289895519, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 26370 + }, + { + "epoch": 0.2299890111806876, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 26371 + }, + { + "epoch": 0.22999773246585617, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 26372 + }, + { + "epoch": 0.23000645375102474, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 26373 + }, + { + "epoch": 0.23001517503619334, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 26374 + }, + { + "epoch": 0.23002389632136191, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 26375 + }, + { + "epoch": 0.2300326176065305, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 26376 + }, + { + "epoch": 0.2300413388916991, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 26377 + }, + { + "epoch": 0.23005006017686766, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 26378 + }, + { + "epoch": 0.23005878146203623, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 26379 + }, + { + "epoch": 0.23006750274720483, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 26380 + }, + { + "epoch": 0.2300762240323734, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 26381 + }, + { + "epoch": 0.230084945317542, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 26382 + }, + { + "epoch": 0.23009366660271058, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 26383 + }, + { + "epoch": 0.23010238788787915, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 26384 + }, + { + "epoch": 0.23011110917304775, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 26385 + }, + { + "epoch": 0.23011983045821632, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 26386 + }, + { + "epoch": 0.2301285517433849, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 26387 + }, + { + "epoch": 0.2301372730285535, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 26388 + }, + { + "epoch": 0.23014599431372207, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 26389 + }, + { + "epoch": 0.23015471559889064, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 26390 + }, + { + "epoch": 0.23016343688405924, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 26391 + }, + { + "epoch": 0.23017215816922781, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 26392 + }, + { + "epoch": 0.2301808794543964, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 26393 + }, + { + "epoch": 0.230189600739565, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 26394 + }, + { + "epoch": 0.23019832202473356, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 26395 + }, + { + "epoch": 0.23020704330990216, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 26396 + }, + { + "epoch": 0.23021576459507073, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 26397 + }, + { + "epoch": 0.2302244858802393, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 26398 + }, + { + "epoch": 0.2302332071654079, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 26399 + }, + { + "epoch": 0.23024192845057648, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 26400 + }, + { + "epoch": 0.23025064973574505, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 26401 + }, + { + "epoch": 0.23025937102091365, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 26402 + }, + { + "epoch": 0.23026809230608222, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 26403 + }, + { + "epoch": 0.2302768135912508, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 26404 + }, + { + "epoch": 0.2302855348764194, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 26405 + }, + { + "epoch": 0.23029425616158797, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 26406 + }, + { + "epoch": 0.23030297744675654, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 26407 + }, + { + "epoch": 0.23031169873192514, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 26408 + }, + { + "epoch": 0.2303204200170937, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 26409 + }, + { + "epoch": 0.2303291413022623, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 26410 + }, + { + "epoch": 0.23033786258743089, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 26411 + }, + { + "epoch": 0.23034658387259946, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 26412 + }, + { + "epoch": 0.23035530515776806, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 26413 + }, + { + "epoch": 0.23036402644293663, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 26414 + }, + { + "epoch": 0.2303727477281052, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 26415 + }, + { + "epoch": 0.2303814690132738, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 26416 + }, + { + "epoch": 0.23039019029844238, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 26417 + }, + { + "epoch": 0.23039891158361095, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 26418 + }, + { + "epoch": 0.23040763286877955, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 26419 + }, + { + "epoch": 0.23041635415394812, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 26420 + }, + { + "epoch": 0.2304250754391167, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 26421 + }, + { + "epoch": 0.2304337967242853, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 26422 + }, + { + "epoch": 0.23044251800945387, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 26423 + }, + { + "epoch": 0.23045123929462247, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 26424 + }, + { + "epoch": 0.23045996057979104, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 26425 + }, + { + "epoch": 0.2304686818649596, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 26426 + }, + { + "epoch": 0.2304774031501282, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 26427 + }, + { + "epoch": 0.23048612443529679, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 26428 + }, + { + "epoch": 0.23049484572046536, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 26429 + }, + { + "epoch": 0.23050356700563396, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 26430 + }, + { + "epoch": 0.23051228829080253, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 26431 + }, + { + "epoch": 0.2305210095759711, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 26432 + }, + { + "epoch": 0.2305297308611397, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 26433 + }, + { + "epoch": 0.23053845214630828, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 26434 + }, + { + "epoch": 0.23054717343147688, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 26435 + }, + { + "epoch": 0.23055589471664545, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 26436 + }, + { + "epoch": 0.23056461600181402, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 26437 + }, + { + "epoch": 0.23057333728698262, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 26438 + }, + { + "epoch": 0.2305820585721512, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 26439 + }, + { + "epoch": 0.23059077985731977, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 26440 + }, + { + "epoch": 0.23059950114248837, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 26441 + }, + { + "epoch": 0.23060822242765694, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 26442 + }, + { + "epoch": 0.2306169437128255, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 26443 + }, + { + "epoch": 0.2306256649979941, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 26444 + }, + { + "epoch": 0.23063438628316268, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 26445 + }, + { + "epoch": 0.23064310756833126, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 26446 + }, + { + "epoch": 0.23065182885349986, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 26447 + }, + { + "epoch": 0.23066055013866843, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 26448 + }, + { + "epoch": 0.23066927142383703, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 26449 + }, + { + "epoch": 0.2306779927090056, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 26450 + }, + { + "epoch": 0.23068671399417418, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 26451 + }, + { + "epoch": 0.23069543527934278, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 26452 + }, + { + "epoch": 0.23070415656451135, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 26453 + }, + { + "epoch": 0.23071287784967992, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 26454 + }, + { + "epoch": 0.23072159913484852, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 26455 + }, + { + "epoch": 0.2307303204200171, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 26456 + }, + { + "epoch": 0.23073904170518567, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 26457 + }, + { + "epoch": 0.23074776299035427, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 26458 + }, + { + "epoch": 0.23075648427552284, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 26459 + }, + { + "epoch": 0.2307652055606914, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 26460 + }, + { + "epoch": 0.23077392684586, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 26461 + }, + { + "epoch": 0.23078264813102858, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 26462 + }, + { + "epoch": 0.23079136941619718, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 26463 + }, + { + "epoch": 0.23080009070136576, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 26464 + }, + { + "epoch": 0.23080881198653433, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 26465 + }, + { + "epoch": 0.23081753327170293, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 26466 + }, + { + "epoch": 0.2308262545568715, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 26467 + }, + { + "epoch": 0.23083497584204007, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 26468 + }, + { + "epoch": 0.23084369712720867, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 26469 + }, + { + "epoch": 0.23085241841237725, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 26470 + }, + { + "epoch": 0.23086113969754582, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 26471 + }, + { + "epoch": 0.23086986098271442, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 26472 + }, + { + "epoch": 0.230878582267883, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 26473 + }, + { + "epoch": 0.23088730355305156, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 26474 + }, + { + "epoch": 0.23089602483822017, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 26475 + }, + { + "epoch": 0.23090474612338874, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 26476 + }, + { + "epoch": 0.23091346740855734, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 26477 + }, + { + "epoch": 0.2309221886937259, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 26478 + }, + { + "epoch": 0.23093090997889448, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 26479 + }, + { + "epoch": 0.23093963126406308, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 26480 + }, + { + "epoch": 0.23094835254923166, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 26481 + }, + { + "epoch": 0.23095707383440023, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 26482 + }, + { + "epoch": 0.23096579511956883, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 26483 + }, + { + "epoch": 0.2309745164047374, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 26484 + }, + { + "epoch": 0.23098323768990597, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 26485 + }, + { + "epoch": 0.23099195897507457, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 26486 + }, + { + "epoch": 0.23100068026024315, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 26487 + }, + { + "epoch": 0.23100940154541172, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 26488 + }, + { + "epoch": 0.23101812283058032, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 26489 + }, + { + "epoch": 0.2310268441157489, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 26490 + }, + { + "epoch": 0.2310355654009175, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 26491 + }, + { + "epoch": 0.23104428668608606, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 26492 + }, + { + "epoch": 0.23105300797125464, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 26493 + }, + { + "epoch": 0.23106172925642324, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 26494 + }, + { + "epoch": 0.2310704505415918, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 26495 + }, + { + "epoch": 0.23107917182676038, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 26496 + }, + { + "epoch": 0.23108789311192898, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 26497 + }, + { + "epoch": 0.23109661439709756, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 26498 + }, + { + "epoch": 0.23110533568226613, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 26499 + }, + { + "epoch": 0.23111405696743473, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 26500 + }, + { + "epoch": 0.2311227782526033, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 26501 + }, + { + "epoch": 0.23113149953777187, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 26502 + }, + { + "epoch": 0.23114022082294047, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 26503 + }, + { + "epoch": 0.23114894210810905, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 26504 + }, + { + "epoch": 0.23115766339327765, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 26505 + }, + { + "epoch": 0.23116638467844622, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 26506 + }, + { + "epoch": 0.2311751059636148, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 26507 + }, + { + "epoch": 0.2311838272487834, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 26508 + }, + { + "epoch": 0.23119254853395196, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 26509 + }, + { + "epoch": 0.23120126981912054, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 26510 + }, + { + "epoch": 0.23120999110428914, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 26511 + }, + { + "epoch": 0.2312187123894577, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 26512 + }, + { + "epoch": 0.23122743367462628, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 26513 + }, + { + "epoch": 0.23123615495979488, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 26514 + }, + { + "epoch": 0.23124487624496345, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 26515 + }, + { + "epoch": 0.23125359753013203, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 26516 + }, + { + "epoch": 0.23126231881530063, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 26517 + }, + { + "epoch": 0.2312710401004692, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 26518 + }, + { + "epoch": 0.2312797613856378, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 26519 + }, + { + "epoch": 0.23128848267080637, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 26520 + }, + { + "epoch": 0.23129720395597494, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 26521 + }, + { + "epoch": 0.23130592524114355, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 26522 + }, + { + "epoch": 0.23131464652631212, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 26523 + }, + { + "epoch": 0.2313233678114807, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 26524 + }, + { + "epoch": 0.2313320890966493, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 26525 + }, + { + "epoch": 0.23134081038181786, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 26526 + }, + { + "epoch": 0.23134953166698644, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 26527 + }, + { + "epoch": 0.23135825295215504, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 26528 + }, + { + "epoch": 0.2313669742373236, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 26529 + }, + { + "epoch": 0.23137569552249218, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 26530 + }, + { + "epoch": 0.23138441680766078, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 26531 + }, + { + "epoch": 0.23139313809282935, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 26532 + }, + { + "epoch": 0.23140185937799795, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 26533 + }, + { + "epoch": 0.23141058066316653, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 26534 + }, + { + "epoch": 0.2314193019483351, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 26535 + }, + { + "epoch": 0.2314280232335037, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 26536 + }, + { + "epoch": 0.23143674451867227, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 26537 + }, + { + "epoch": 0.23144546580384084, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 26538 + }, + { + "epoch": 0.23145418708900944, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 26539 + }, + { + "epoch": 0.23146290837417802, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 26540 + }, + { + "epoch": 0.2314716296593466, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 26541 + }, + { + "epoch": 0.2314803509445152, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9826, + "step": 26542 + }, + { + "epoch": 0.23148907222968376, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 26543 + }, + { + "epoch": 0.23149779351485236, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 26544 + }, + { + "epoch": 0.23150651480002093, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 26545 + }, + { + "epoch": 0.2315152360851895, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 26546 + }, + { + "epoch": 0.2315239573703581, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 26547 + }, + { + "epoch": 0.23153267865552668, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 26548 + }, + { + "epoch": 0.23154139994069525, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 26549 + }, + { + "epoch": 0.23155012122586385, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 26550 + }, + { + "epoch": 0.23155884251103243, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 26551 + }, + { + "epoch": 0.231567563796201, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 26552 + }, + { + "epoch": 0.2315762850813696, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 26553 + }, + { + "epoch": 0.23158500636653817, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 26554 + }, + { + "epoch": 0.23159372765170674, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 26555 + }, + { + "epoch": 0.23160244893687534, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 26556 + }, + { + "epoch": 0.23161117022204392, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 26557 + }, + { + "epoch": 0.23161989150721252, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 26558 + }, + { + "epoch": 0.2316286127923811, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 26559 + }, + { + "epoch": 0.23163733407754966, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 26560 + }, + { + "epoch": 0.23164605536271826, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 26561 + }, + { + "epoch": 0.23165477664788683, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 26562 + }, + { + "epoch": 0.2316634979330554, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 26563 + }, + { + "epoch": 0.231672219218224, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 26564 + }, + { + "epoch": 0.23168094050339258, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 26565 + }, + { + "epoch": 0.23168966178856115, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 26566 + }, + { + "epoch": 0.23169838307372975, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 26567 + }, + { + "epoch": 0.23170710435889832, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 26568 + }, + { + "epoch": 0.2317158256440669, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 26569 + }, + { + "epoch": 0.2317245469292355, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 26570 + }, + { + "epoch": 0.23173326821440407, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 26571 + }, + { + "epoch": 0.23174198949957267, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 26572 + }, + { + "epoch": 0.23175071078474124, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 26573 + }, + { + "epoch": 0.23175943206990982, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 26574 + }, + { + "epoch": 0.23176815335507842, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 26575 + }, + { + "epoch": 0.231776874640247, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 26576 + }, + { + "epoch": 0.23178559592541556, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 26577 + }, + { + "epoch": 0.23179431721058416, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 26578 + }, + { + "epoch": 0.23180303849575273, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 26579 + }, + { + "epoch": 0.2318117597809213, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 26580 + }, + { + "epoch": 0.2318204810660899, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 26581 + }, + { + "epoch": 0.23182920235125848, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 26582 + }, + { + "epoch": 0.23183792363642705, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 26583 + }, + { + "epoch": 0.23184664492159565, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 26584 + }, + { + "epoch": 0.23185536620676422, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 26585 + }, + { + "epoch": 0.23186408749193282, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 26586 + }, + { + "epoch": 0.2318728087771014, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 26587 + }, + { + "epoch": 0.23188153006226997, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 26588 + }, + { + "epoch": 0.23189025134743857, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 26589 + }, + { + "epoch": 0.23189897263260714, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 26590 + }, + { + "epoch": 0.23190769391777571, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 26591 + }, + { + "epoch": 0.23191641520294431, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 26592 + }, + { + "epoch": 0.2319251364881129, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 26593 + }, + { + "epoch": 0.23193385777328146, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 26594 + }, + { + "epoch": 0.23194257905845006, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 26595 + }, + { + "epoch": 0.23195130034361863, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 26596 + }, + { + "epoch": 0.2319600216287872, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 26597 + }, + { + "epoch": 0.2319687429139558, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 26598 + }, + { + "epoch": 0.23197746419912438, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 26599 + }, + { + "epoch": 0.23198618548429298, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 26600 + }, + { + "epoch": 0.23199490676946155, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 26601 + }, + { + "epoch": 0.23200362805463012, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 26602 + }, + { + "epoch": 0.23201234933979872, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 26603 + }, + { + "epoch": 0.2320210706249673, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 26604 + }, + { + "epoch": 0.23202979191013587, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 26605 + }, + { + "epoch": 0.23203851319530447, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 26606 + }, + { + "epoch": 0.23204723448047304, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 26607 + }, + { + "epoch": 0.2320559557656416, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 26608 + }, + { + "epoch": 0.23206467705081021, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 26609 + }, + { + "epoch": 0.2320733983359788, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 26610 + }, + { + "epoch": 0.23208211962114736, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 26611 + }, + { + "epoch": 0.23209084090631596, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 26612 + }, + { + "epoch": 0.23209956219148453, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 26613 + }, + { + "epoch": 0.23210828347665313, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 26614 + }, + { + "epoch": 0.2321170047618217, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 26615 + }, + { + "epoch": 0.23212572604699028, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 26616 + }, + { + "epoch": 0.23213444733215888, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 26617 + }, + { + "epoch": 0.23214316861732745, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 26618 + }, + { + "epoch": 0.23215188990249602, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 26619 + }, + { + "epoch": 0.23216061118766462, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 26620 + }, + { + "epoch": 0.2321693324728332, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 26621 + }, + { + "epoch": 0.23217805375800177, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 26622 + }, + { + "epoch": 0.23218677504317037, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 26623 + }, + { + "epoch": 0.23219549632833894, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 26624 + }, + { + "epoch": 0.2322042176135075, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 26625 + }, + { + "epoch": 0.2322129388986761, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 26626 + }, + { + "epoch": 0.23222166018384469, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 26627 + }, + { + "epoch": 0.2322303814690133, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 26628 + }, + { + "epoch": 0.23223910275418186, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 26629 + }, + { + "epoch": 0.23224782403935043, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 26630 + }, + { + "epoch": 0.23225654532451903, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 26631 + }, + { + "epoch": 0.2322652666096876, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 26632 + }, + { + "epoch": 0.23227398789485618, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 26633 + }, + { + "epoch": 0.23228270918002478, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 26634 + }, + { + "epoch": 0.23229143046519335, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 26635 + }, + { + "epoch": 0.23230015175036192, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 26636 + }, + { + "epoch": 0.23230887303553052, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 26637 + }, + { + "epoch": 0.2323175943206991, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 26638 + }, + { + "epoch": 0.23232631560586767, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 26639 + }, + { + "epoch": 0.23233503689103627, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 26640 + }, + { + "epoch": 0.23234375817620484, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 26641 + }, + { + "epoch": 0.23235247946137344, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 26642 + }, + { + "epoch": 0.232361200746542, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 26643 + }, + { + "epoch": 0.23236992203171059, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 26644 + }, + { + "epoch": 0.23237864331687919, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 26645 + }, + { + "epoch": 0.23238736460204776, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 26646 + }, + { + "epoch": 0.23239608588721633, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 26647 + }, + { + "epoch": 0.23240480717238493, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 26648 + }, + { + "epoch": 0.2324135284575535, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 26649 + }, + { + "epoch": 0.23242224974272208, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 26650 + }, + { + "epoch": 0.23243097102789068, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 26651 + }, + { + "epoch": 0.23243969231305925, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 26652 + }, + { + "epoch": 0.23244841359822782, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 26653 + }, + { + "epoch": 0.23245713488339642, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 26654 + }, + { + "epoch": 0.232465856168565, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 26655 + }, + { + "epoch": 0.2324745774537336, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 26656 + }, + { + "epoch": 0.23248329873890217, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 26657 + }, + { + "epoch": 0.23249202002407074, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 26658 + }, + { + "epoch": 0.23250074130923934, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 26659 + }, + { + "epoch": 0.2325094625944079, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 26660 + }, + { + "epoch": 0.23251818387957648, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 26661 + }, + { + "epoch": 0.23252690516474508, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 26662 + }, + { + "epoch": 0.23253562644991366, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 26663 + }, + { + "epoch": 0.23254434773508223, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 26664 + }, + { + "epoch": 0.23255306902025083, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 26665 + }, + { + "epoch": 0.2325617903054194, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 26666 + }, + { + "epoch": 0.232570511590588, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 26667 + }, + { + "epoch": 0.23257923287575658, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 26668 + }, + { + "epoch": 0.23258795416092515, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 26669 + }, + { + "epoch": 0.23259667544609375, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 26670 + }, + { + "epoch": 0.23260539673126232, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 26671 + }, + { + "epoch": 0.2326141180164309, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 26672 + }, + { + "epoch": 0.2326228393015995, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 26673 + }, + { + "epoch": 0.23263156058676807, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 26674 + }, + { + "epoch": 0.23264028187193664, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 26675 + }, + { + "epoch": 0.23264900315710524, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 26676 + }, + { + "epoch": 0.2326577244422738, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 26677 + }, + { + "epoch": 0.23266644572744238, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 26678 + }, + { + "epoch": 0.23267516701261098, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 26679 + }, + { + "epoch": 0.23268388829777956, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 26680 + }, + { + "epoch": 0.23269260958294816, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 26681 + }, + { + "epoch": 0.23270133086811673, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 26682 + }, + { + "epoch": 0.2327100521532853, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 26683 + }, + { + "epoch": 0.2327187734384539, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 26684 + }, + { + "epoch": 0.23272749472362247, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 26685 + }, + { + "epoch": 0.23273621600879105, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 26686 + }, + { + "epoch": 0.23274493729395965, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 26687 + }, + { + "epoch": 0.23275365857912822, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 26688 + }, + { + "epoch": 0.2327623798642968, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 26689 + }, + { + "epoch": 0.2327711011494654, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 26690 + }, + { + "epoch": 0.23277982243463397, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 26691 + }, + { + "epoch": 0.23278854371980254, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 26692 + }, + { + "epoch": 0.23279726500497114, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 26693 + }, + { + "epoch": 0.2328059862901397, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 26694 + }, + { + "epoch": 0.2328147075753083, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 26695 + }, + { + "epoch": 0.23282342886047688, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 26696 + }, + { + "epoch": 0.23283215014564546, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 26697 + }, + { + "epoch": 0.23284087143081406, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 26698 + }, + { + "epoch": 0.23284959271598263, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 26699 + }, + { + "epoch": 0.2328583140011512, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 26700 + }, + { + "epoch": 0.2328670352863198, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 26701 + }, + { + "epoch": 0.23287575657148837, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 26702 + }, + { + "epoch": 0.23288447785665695, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 26703 + }, + { + "epoch": 0.23289319914182555, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 26704 + }, + { + "epoch": 0.23290192042699412, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 26705 + }, + { + "epoch": 0.2329106417121627, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 26706 + }, + { + "epoch": 0.2329193629973313, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 26707 + }, + { + "epoch": 0.23292808428249986, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 26708 + }, + { + "epoch": 0.23293680556766846, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 26709 + }, + { + "epoch": 0.23294552685283704, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 26710 + }, + { + "epoch": 0.2329542481380056, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 26711 + }, + { + "epoch": 0.2329629694231742, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 26712 + }, + { + "epoch": 0.23297169070834278, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 26713 + }, + { + "epoch": 0.23298041199351135, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 26714 + }, + { + "epoch": 0.23298913327867996, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 26715 + }, + { + "epoch": 0.23299785456384853, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 26716 + }, + { + "epoch": 0.2330065758490171, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 26717 + }, + { + "epoch": 0.2330152971341857, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 26718 + }, + { + "epoch": 0.23302401841935427, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 26719 + }, + { + "epoch": 0.23303273970452285, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 26720 + }, + { + "epoch": 0.23304146098969145, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 26721 + }, + { + "epoch": 0.23305018227486002, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 26722 + }, + { + "epoch": 0.23305890356002862, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 26723 + }, + { + "epoch": 0.2330676248451972, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 26724 + }, + { + "epoch": 0.23307634613036576, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 26725 + }, + { + "epoch": 0.23308506741553436, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 26726 + }, + { + "epoch": 0.23309378870070294, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 26727 + }, + { + "epoch": 0.2331025099858715, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 26728 + }, + { + "epoch": 0.2331112312710401, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 26729 + }, + { + "epoch": 0.23311995255620868, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 26730 + }, + { + "epoch": 0.23312867384137725, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 26731 + }, + { + "epoch": 0.23313739512654585, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 26732 + }, + { + "epoch": 0.23314611641171443, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 26733 + }, + { + "epoch": 0.233154837696883, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 26734 + }, + { + "epoch": 0.2331635589820516, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 26735 + }, + { + "epoch": 0.23317228026722017, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 26736 + }, + { + "epoch": 0.23318100155238877, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 26737 + }, + { + "epoch": 0.23318972283755734, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 26738 + }, + { + "epoch": 0.23319844412272592, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 26739 + }, + { + "epoch": 0.23320716540789452, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 26740 + }, + { + "epoch": 0.2332158866930631, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 26741 + }, + { + "epoch": 0.23322460797823166, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 26742 + }, + { + "epoch": 0.23323332926340026, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 26743 + }, + { + "epoch": 0.23324205054856884, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 26744 + }, + { + "epoch": 0.2332507718337374, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 26745 + }, + { + "epoch": 0.233259493118906, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 26746 + }, + { + "epoch": 0.23326821440407458, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 26747 + }, + { + "epoch": 0.23327693568924315, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 26748 + }, + { + "epoch": 0.23328565697441175, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 26749 + }, + { + "epoch": 0.23329437825958033, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 26750 + }, + { + "epoch": 0.23330309954474893, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 26751 + }, + { + "epoch": 0.2333118208299175, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 26752 + }, + { + "epoch": 0.23332054211508607, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 26753 + }, + { + "epoch": 0.23332926340025467, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 26754 + }, + { + "epoch": 0.23333798468542324, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 26755 + }, + { + "epoch": 0.23334670597059182, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 26756 + }, + { + "epoch": 0.23335542725576042, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 26757 + }, + { + "epoch": 0.233364148540929, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 26758 + }, + { + "epoch": 0.23337286982609756, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 26759 + }, + { + "epoch": 0.23338159111126616, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 26760 + }, + { + "epoch": 0.23339031239643473, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 26761 + }, + { + "epoch": 0.2333990336816033, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 26762 + }, + { + "epoch": 0.2334077549667719, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 26763 + }, + { + "epoch": 0.23341647625194048, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 26764 + }, + { + "epoch": 0.23342519753710908, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 26765 + }, + { + "epoch": 0.23343391882227765, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 26766 + }, + { + "epoch": 0.23344264010744623, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 26767 + }, + { + "epoch": 0.23345136139261483, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 26768 + }, + { + "epoch": 0.2334600826777834, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 26769 + }, + { + "epoch": 0.23346880396295197, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 26770 + }, + { + "epoch": 0.23347752524812057, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 26771 + }, + { + "epoch": 0.23348624653328914, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 26772 + }, + { + "epoch": 0.23349496781845772, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 26773 + }, + { + "epoch": 0.23350368910362632, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 26774 + }, + { + "epoch": 0.2335124103887949, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 26775 + }, + { + "epoch": 0.2335211316739635, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 26776 + }, + { + "epoch": 0.23352985295913206, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 26777 + }, + { + "epoch": 0.23353857424430063, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 26778 + }, + { + "epoch": 0.23354729552946923, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 26779 + }, + { + "epoch": 0.2335560168146378, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 26780 + }, + { + "epoch": 0.23356473809980638, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 26781 + }, + { + "epoch": 0.23357345938497498, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 26782 + }, + { + "epoch": 0.23358218067014355, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 26783 + }, + { + "epoch": 0.23359090195531212, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 26784 + }, + { + "epoch": 0.23359962324048072, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 26785 + }, + { + "epoch": 0.2336083445256493, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 26786 + }, + { + "epoch": 0.23361706581081787, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 26787 + }, + { + "epoch": 0.23362578709598647, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 26788 + }, + { + "epoch": 0.23363450838115504, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 26789 + }, + { + "epoch": 0.23364322966632364, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 26790 + }, + { + "epoch": 0.23365195095149222, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 26791 + }, + { + "epoch": 0.2336606722366608, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 26792 + }, + { + "epoch": 0.2336693935218294, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 26793 + }, + { + "epoch": 0.23367811480699796, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 26794 + }, + { + "epoch": 0.23368683609216653, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 26795 + }, + { + "epoch": 0.23369555737733513, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 26796 + }, + { + "epoch": 0.2337042786625037, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 26797 + }, + { + "epoch": 0.23371299994767228, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 26798 + }, + { + "epoch": 0.23372172123284088, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 26799 + }, + { + "epoch": 0.23373044251800945, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 26800 + }, + { + "epoch": 0.23373916380317802, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 26801 + }, + { + "epoch": 0.23374788508834662, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 26802 + }, + { + "epoch": 0.2337566063735152, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 26803 + }, + { + "epoch": 0.2337653276586838, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 26804 + }, + { + "epoch": 0.23377404894385237, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 26805 + }, + { + "epoch": 0.23378277022902094, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 26806 + }, + { + "epoch": 0.23379149151418954, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 26807 + }, + { + "epoch": 0.23380021279935811, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 26808 + }, + { + "epoch": 0.2338089340845267, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 26809 + }, + { + "epoch": 0.2338176553696953, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 26810 + }, + { + "epoch": 0.23382637665486386, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 26811 + }, + { + "epoch": 0.23383509794003243, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 26812 + }, + { + "epoch": 0.23384381922520103, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 26813 + }, + { + "epoch": 0.2338525405103696, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 26814 + }, + { + "epoch": 0.23386126179553818, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 26815 + }, + { + "epoch": 0.23386998308070678, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 26816 + }, + { + "epoch": 0.23387870436587535, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 26817 + }, + { + "epoch": 0.23388742565104395, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 26818 + }, + { + "epoch": 0.23389614693621252, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 26819 + }, + { + "epoch": 0.2339048682213811, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 26820 + }, + { + "epoch": 0.2339135895065497, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 26821 + }, + { + "epoch": 0.23392231079171827, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 26822 + }, + { + "epoch": 0.23393103207688684, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 26823 + }, + { + "epoch": 0.23393975336205544, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 26824 + }, + { + "epoch": 0.23394847464722401, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 26825 + }, + { + "epoch": 0.2339571959323926, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 26826 + }, + { + "epoch": 0.2339659172175612, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 26827 + }, + { + "epoch": 0.23397463850272976, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 26828 + }, + { + "epoch": 0.23398335978789833, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 26829 + }, + { + "epoch": 0.23399208107306693, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 26830 + }, + { + "epoch": 0.2340008023582355, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 26831 + }, + { + "epoch": 0.2340095236434041, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 26832 + }, + { + "epoch": 0.23401824492857268, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 26833 + }, + { + "epoch": 0.23402696621374125, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 26834 + }, + { + "epoch": 0.23403568749890985, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 26835 + }, + { + "epoch": 0.23404440878407842, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 26836 + }, + { + "epoch": 0.234053130069247, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 26837 + }, + { + "epoch": 0.2340618513544156, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 26838 + }, + { + "epoch": 0.23407057263958417, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 26839 + }, + { + "epoch": 0.23407929392475274, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 26840 + }, + { + "epoch": 0.23408801520992134, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 26841 + }, + { + "epoch": 0.2340967364950899, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 26842 + }, + { + "epoch": 0.23410545778025849, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 26843 + }, + { + "epoch": 0.23411417906542709, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 26844 + }, + { + "epoch": 0.23412290035059566, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 26845 + }, + { + "epoch": 0.23413162163576426, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 26846 + }, + { + "epoch": 0.23414034292093283, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 26847 + }, + { + "epoch": 0.2341490642061014, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 26848 + }, + { + "epoch": 0.23415778549127, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 26849 + }, + { + "epoch": 0.23416650677643858, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 26850 + }, + { + "epoch": 0.23417522806160715, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 26851 + }, + { + "epoch": 0.23418394934677575, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 26852 + }, + { + "epoch": 0.23419267063194432, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 26853 + }, + { + "epoch": 0.2342013919171129, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 26854 + }, + { + "epoch": 0.2342101132022815, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 26855 + }, + { + "epoch": 0.23421883448745007, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 26856 + }, + { + "epoch": 0.23422755577261864, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 26857 + }, + { + "epoch": 0.23423627705778724, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 26858 + }, + { + "epoch": 0.2342449983429558, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 26859 + }, + { + "epoch": 0.2342537196281244, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 26860 + }, + { + "epoch": 0.23426244091329299, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 26861 + }, + { + "epoch": 0.23427116219846156, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 26862 + }, + { + "epoch": 0.23427988348363016, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 26863 + }, + { + "epoch": 0.23428860476879873, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 26864 + }, + { + "epoch": 0.2342973260539673, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 26865 + }, + { + "epoch": 0.2343060473391359, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 26866 + }, + { + "epoch": 0.23431476862430448, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 26867 + }, + { + "epoch": 0.23432348990947305, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 26868 + }, + { + "epoch": 0.23433221119464165, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 26869 + }, + { + "epoch": 0.23434093247981022, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 26870 + }, + { + "epoch": 0.2343496537649788, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 26871 + }, + { + "epoch": 0.2343583750501474, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 26872 + }, + { + "epoch": 0.23436709633531597, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 26873 + }, + { + "epoch": 0.23437581762048457, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 26874 + }, + { + "epoch": 0.23438453890565314, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 26875 + }, + { + "epoch": 0.2343932601908217, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 26876 + }, + { + "epoch": 0.2344019814759903, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 26877 + }, + { + "epoch": 0.23441070276115888, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 26878 + }, + { + "epoch": 0.23441942404632746, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 26879 + }, + { + "epoch": 0.23442814533149606, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 26880 + }, + { + "epoch": 0.23443686661666463, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 26881 + }, + { + "epoch": 0.2344455879018332, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 26882 + }, + { + "epoch": 0.2344543091870018, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 26883 + }, + { + "epoch": 0.23446303047217038, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 26884 + }, + { + "epoch": 0.23447175175733898, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 26885 + }, + { + "epoch": 0.23448047304250755, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 26886 + }, + { + "epoch": 0.23448919432767612, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 26887 + }, + { + "epoch": 0.23449791561284472, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 26888 + }, + { + "epoch": 0.2345066368980133, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 26889 + }, + { + "epoch": 0.23451535818318187, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 26890 + }, + { + "epoch": 0.23452407946835047, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 26891 + }, + { + "epoch": 0.23453280075351904, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 26892 + }, + { + "epoch": 0.2345415220386876, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 26893 + }, + { + "epoch": 0.2345502433238562, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 26894 + }, + { + "epoch": 0.23455896460902478, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 26895 + }, + { + "epoch": 0.23456768589419336, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 26896 + }, + { + "epoch": 0.23457640717936196, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 26897 + }, + { + "epoch": 0.23458512846453053, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 26898 + }, + { + "epoch": 0.23459384974969913, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 26899 + }, + { + "epoch": 0.2346025710348677, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 26900 + }, + { + "epoch": 0.23461129232003627, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 26901 + }, + { + "epoch": 0.23462001360520487, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 26902 + }, + { + "epoch": 0.23462873489037345, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 26903 + }, + { + "epoch": 0.23463745617554202, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 26904 + }, + { + "epoch": 0.23464617746071062, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 26905 + }, + { + "epoch": 0.2346548987458792, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 26906 + }, + { + "epoch": 0.23466362003104776, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 26907 + }, + { + "epoch": 0.23467234131621637, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 26908 + }, + { + "epoch": 0.23468106260138494, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 26909 + }, + { + "epoch": 0.2346897838865535, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 26910 + }, + { + "epoch": 0.2346985051717221, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 26911 + }, + { + "epoch": 0.23470722645689068, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 26912 + }, + { + "epoch": 0.23471594774205928, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 26913 + }, + { + "epoch": 0.23472466902722786, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 26914 + }, + { + "epoch": 0.23473339031239643, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 26915 + }, + { + "epoch": 0.23474211159756503, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 26916 + }, + { + "epoch": 0.2347508328827336, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 26917 + }, + { + "epoch": 0.23475955416790217, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 26918 + }, + { + "epoch": 0.23476827545307077, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 26919 + }, + { + "epoch": 0.23477699673823935, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 26920 + }, + { + "epoch": 0.23478571802340792, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 26921 + }, + { + "epoch": 0.23479443930857652, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 26922 + }, + { + "epoch": 0.2348031605937451, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 26923 + }, + { + "epoch": 0.23481188187891366, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 26924 + }, + { + "epoch": 0.23482060316408226, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 26925 + }, + { + "epoch": 0.23482932444925084, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 26926 + }, + { + "epoch": 0.23483804573441944, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 26927 + }, + { + "epoch": 0.234846767019588, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 26928 + }, + { + "epoch": 0.23485548830475658, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 26929 + }, + { + "epoch": 0.23486420958992518, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 26930 + }, + { + "epoch": 0.23487293087509375, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 26931 + }, + { + "epoch": 0.23488165216026233, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 26932 + }, + { + "epoch": 0.23489037344543093, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 26933 + }, + { + "epoch": 0.2348990947305995, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 26934 + }, + { + "epoch": 0.23490781601576807, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 26935 + }, + { + "epoch": 0.23491653730093667, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 26936 + }, + { + "epoch": 0.23492525858610525, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 26937 + }, + { + "epoch": 0.23493397987127382, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 26938 + }, + { + "epoch": 0.23494270115644242, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 26939 + }, + { + "epoch": 0.234951422441611, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 26940 + }, + { + "epoch": 0.2349601437267796, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 26941 + }, + { + "epoch": 0.23496886501194816, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 26942 + }, + { + "epoch": 0.23497758629711674, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 26943 + }, + { + "epoch": 0.23498630758228534, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 26944 + }, + { + "epoch": 0.2349950288674539, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 26945 + }, + { + "epoch": 0.23500375015262248, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 26946 + }, + { + "epoch": 0.23501247143779108, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 26947 + }, + { + "epoch": 0.23502119272295965, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 26948 + }, + { + "epoch": 0.23502991400812823, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 26949 + }, + { + "epoch": 0.23503863529329683, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 26950 + }, + { + "epoch": 0.2350473565784654, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 26951 + }, + { + "epoch": 0.23505607786363397, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 26952 + }, + { + "epoch": 0.23506479914880257, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 26953 + }, + { + "epoch": 0.23507352043397114, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 26954 + }, + { + "epoch": 0.23508224171913975, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 26955 + }, + { + "epoch": 0.23509096300430832, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 26956 + }, + { + "epoch": 0.2350996842894769, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 26957 + }, + { + "epoch": 0.2351084055746455, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 26958 + }, + { + "epoch": 0.23511712685981406, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 26959 + }, + { + "epoch": 0.23512584814498264, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 26960 + }, + { + "epoch": 0.23513456943015124, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 26961 + }, + { + "epoch": 0.2351432907153198, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 26962 + }, + { + "epoch": 0.23515201200048838, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 26963 + }, + { + "epoch": 0.23516073328565698, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 26964 + }, + { + "epoch": 0.23516945457082555, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 26965 + }, + { + "epoch": 0.23517817585599413, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 26966 + }, + { + "epoch": 0.23518689714116273, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 26967 + }, + { + "epoch": 0.2351956184263313, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 26968 + }, + { + "epoch": 0.2352043397114999, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 26969 + }, + { + "epoch": 0.23521306099666847, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 26970 + }, + { + "epoch": 0.23522178228183704, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 26971 + }, + { + "epoch": 0.23523050356700564, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 26972 + }, + { + "epoch": 0.23523922485217422, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 26973 + }, + { + "epoch": 0.2352479461373428, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 26974 + }, + { + "epoch": 0.2352566674225114, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 26975 + }, + { + "epoch": 0.23526538870767996, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 26976 + }, + { + "epoch": 0.23527410999284853, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 26977 + }, + { + "epoch": 0.23528283127801713, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 26978 + }, + { + "epoch": 0.2352915525631857, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 26979 + }, + { + "epoch": 0.23530027384835428, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 26980 + }, + { + "epoch": 0.23530899513352288, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 26981 + }, + { + "epoch": 0.23531771641869145, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 26982 + }, + { + "epoch": 0.23532643770386005, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 26983 + }, + { + "epoch": 0.23533515898902863, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 26984 + }, + { + "epoch": 0.2353438802741972, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 26985 + }, + { + "epoch": 0.2353526015593658, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 26986 + }, + { + "epoch": 0.23536132284453437, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 26987 + }, + { + "epoch": 0.23537004412970294, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 26988 + }, + { + "epoch": 0.23537876541487154, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 26989 + }, + { + "epoch": 0.23538748670004012, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 26990 + }, + { + "epoch": 0.2353962079852087, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 26991 + }, + { + "epoch": 0.2354049292703773, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 26992 + }, + { + "epoch": 0.23541365055554586, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 26993 + }, + { + "epoch": 0.23542237184071443, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 26994 + }, + { + "epoch": 0.23543109312588303, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 26995 + }, + { + "epoch": 0.2354398144110516, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 26996 + }, + { + "epoch": 0.2354485356962202, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 26997 + }, + { + "epoch": 0.23545725698138878, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 26998 + }, + { + "epoch": 0.23546597826655735, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 26999 + }, + { + "epoch": 0.23547469955172595, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 27000 + }, + { + "epoch": 0.23548342083689452, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 27001 + }, + { + "epoch": 0.2354921421220631, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 27002 + }, + { + "epoch": 0.2355008634072317, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 27003 + }, + { + "epoch": 0.23550958469240027, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 27004 + }, + { + "epoch": 0.23551830597756884, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 27005 + }, + { + "epoch": 0.23552702726273744, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 27006 + }, + { + "epoch": 0.23553574854790602, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 27007 + }, + { + "epoch": 0.23554446983307462, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 27008 + }, + { + "epoch": 0.2355531911182432, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 0.9706, + "step": 27009 + }, + { + "epoch": 0.23556191240341176, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 27010 + }, + { + "epoch": 0.23557063368858036, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 27011 + }, + { + "epoch": 0.23557935497374893, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 27012 + }, + { + "epoch": 0.2355880762589175, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 27013 + }, + { + "epoch": 0.2355967975440861, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 27014 + }, + { + "epoch": 0.23560551882925468, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 27015 + }, + { + "epoch": 0.23561424011442325, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 27016 + }, + { + "epoch": 0.23562296139959185, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 27017 + }, + { + "epoch": 0.23563168268476042, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 27018 + }, + { + "epoch": 0.235640403969929, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 27019 + }, + { + "epoch": 0.2356491252550976, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 27020 + }, + { + "epoch": 0.23565784654026617, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 27021 + }, + { + "epoch": 0.23566656782543477, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 27022 + }, + { + "epoch": 0.23567528911060334, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 27023 + }, + { + "epoch": 0.23568401039577191, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 27024 + }, + { + "epoch": 0.23569273168094051, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 27025 + }, + { + "epoch": 0.2357014529661091, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 27026 + }, + { + "epoch": 0.23571017425127766, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 27027 + }, + { + "epoch": 0.23571889553644626, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 27028 + }, + { + "epoch": 0.23572761682161483, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 27029 + }, + { + "epoch": 0.2357363381067834, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 27030 + }, + { + "epoch": 0.235745059391952, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 27031 + }, + { + "epoch": 0.23575378067712058, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 27032 + }, + { + "epoch": 0.23576250196228915, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 27033 + }, + { + "epoch": 0.23577122324745775, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 27034 + }, + { + "epoch": 0.23577994453262632, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 27035 + }, + { + "epoch": 0.23578866581779492, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 27036 + }, + { + "epoch": 0.2357973871029635, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 27037 + }, + { + "epoch": 0.23580610838813207, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 27038 + }, + { + "epoch": 0.23581482967330067, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 27039 + }, + { + "epoch": 0.23582355095846924, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 27040 + }, + { + "epoch": 0.2358322722436378, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 27041 + }, + { + "epoch": 0.23584099352880641, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 27042 + }, + { + "epoch": 0.235849714813975, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 27043 + }, + { + "epoch": 0.23585843609914356, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 27044 + }, + { + "epoch": 0.23586715738431216, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 27045 + }, + { + "epoch": 0.23587587866948073, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 27046 + }, + { + "epoch": 0.2358845999546493, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 27047 + }, + { + "epoch": 0.2358933212398179, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 27048 + }, + { + "epoch": 0.23590204252498648, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 27049 + }, + { + "epoch": 0.23591076381015508, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 27050 + }, + { + "epoch": 0.23591948509532365, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 27051 + }, + { + "epoch": 0.23592820638049222, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 27052 + }, + { + "epoch": 0.23593692766566082, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 27053 + }, + { + "epoch": 0.2359456489508294, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 27054 + }, + { + "epoch": 0.23595437023599797, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 27055 + }, + { + "epoch": 0.23596309152116657, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 27056 + }, + { + "epoch": 0.23597181280633514, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 27057 + }, + { + "epoch": 0.2359805340915037, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 27058 + }, + { + "epoch": 0.2359892553766723, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 27059 + }, + { + "epoch": 0.23599797666184089, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 27060 + }, + { + "epoch": 0.23600669794700946, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 27061 + }, + { + "epoch": 0.23601541923217806, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 27062 + }, + { + "epoch": 0.23602414051734663, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 27063 + }, + { + "epoch": 0.23603286180251523, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 27064 + }, + { + "epoch": 0.2360415830876838, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 27065 + }, + { + "epoch": 0.23605030437285238, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 27066 + }, + { + "epoch": 0.23605902565802098, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 27067 + }, + { + "epoch": 0.23606774694318955, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 27068 + }, + { + "epoch": 0.23607646822835812, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 27069 + }, + { + "epoch": 0.23608518951352672, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 27070 + }, + { + "epoch": 0.2360939107986953, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 27071 + }, + { + "epoch": 0.23610263208386387, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 27072 + }, + { + "epoch": 0.23611135336903247, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 27073 + }, + { + "epoch": 0.23612007465420104, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 27074 + }, + { + "epoch": 0.2361287959393696, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 27075 + }, + { + "epoch": 0.2361375172245382, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 27076 + }, + { + "epoch": 0.23614623850970679, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 27077 + }, + { + "epoch": 0.23615495979487539, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 27078 + }, + { + "epoch": 0.23616368108004396, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 27079 + }, + { + "epoch": 0.23617240236521253, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 27080 + }, + { + "epoch": 0.23618112365038113, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 27081 + }, + { + "epoch": 0.2361898449355497, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 27082 + }, + { + "epoch": 0.23619856622071828, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 27083 + }, + { + "epoch": 0.23620728750588688, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 27084 + }, + { + "epoch": 0.23621600879105545, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 27085 + }, + { + "epoch": 0.23622473007622402, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 27086 + }, + { + "epoch": 0.23623345136139262, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 27087 + }, + { + "epoch": 0.2362421726465612, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 27088 + }, + { + "epoch": 0.23625089393172977, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 27089 + }, + { + "epoch": 0.23625961521689837, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 27090 + }, + { + "epoch": 0.23626833650206694, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 27091 + }, + { + "epoch": 0.23627705778723554, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 27092 + }, + { + "epoch": 0.2362857790724041, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 27093 + }, + { + "epoch": 0.23629450035757268, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 27094 + }, + { + "epoch": 0.23630322164274128, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 27095 + }, + { + "epoch": 0.23631194292790986, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 27096 + }, + { + "epoch": 0.23632066421307843, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 27097 + }, + { + "epoch": 0.23632938549824703, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 27098 + }, + { + "epoch": 0.2363381067834156, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 27099 + }, + { + "epoch": 0.23634682806858417, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 27100 + }, + { + "epoch": 0.23635554935375278, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 27101 + }, + { + "epoch": 0.23636427063892135, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 27102 + }, + { + "epoch": 0.23637299192408992, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 27103 + }, + { + "epoch": 0.23638171320925852, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 27104 + }, + { + "epoch": 0.2363904344944271, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 27105 + }, + { + "epoch": 0.2363991557795957, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 27106 + }, + { + "epoch": 0.23640787706476427, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 27107 + }, + { + "epoch": 0.23641659834993284, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 27108 + }, + { + "epoch": 0.23642531963510144, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 27109 + }, + { + "epoch": 0.23643404092027, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 27110 + }, + { + "epoch": 0.23644276220543858, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 27111 + }, + { + "epoch": 0.23645148349060718, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 27112 + }, + { + "epoch": 0.23646020477577576, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 27113 + }, + { + "epoch": 0.23646892606094433, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 27114 + }, + { + "epoch": 0.23647764734611293, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 27115 + }, + { + "epoch": 0.2364863686312815, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 27116 + }, + { + "epoch": 0.2364950899164501, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 27117 + }, + { + "epoch": 0.23650381120161867, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 27118 + }, + { + "epoch": 0.23651253248678725, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 27119 + }, + { + "epoch": 0.23652125377195585, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 27120 + }, + { + "epoch": 0.23652997505712442, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 27121 + }, + { + "epoch": 0.236538696342293, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 27122 + }, + { + "epoch": 0.2365474176274616, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 27123 + }, + { + "epoch": 0.23655613891263017, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 27124 + }, + { + "epoch": 0.23656486019779874, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 27125 + }, + { + "epoch": 0.23657358148296734, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 27126 + }, + { + "epoch": 0.2365823027681359, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 27127 + }, + { + "epoch": 0.23659102405330448, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 27128 + }, + { + "epoch": 0.23659974533847308, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 27129 + }, + { + "epoch": 0.23660846662364166, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 27130 + }, + { + "epoch": 0.23661718790881026, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 27131 + }, + { + "epoch": 0.23662590919397883, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 27132 + }, + { + "epoch": 0.2366346304791474, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 27133 + }, + { + "epoch": 0.236643351764316, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 27134 + }, + { + "epoch": 0.23665207304948457, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 27135 + }, + { + "epoch": 0.23666079433465315, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 27136 + }, + { + "epoch": 0.23666951561982175, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 27137 + }, + { + "epoch": 0.23667823690499032, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 27138 + }, + { + "epoch": 0.2366869581901589, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 27139 + }, + { + "epoch": 0.2366956794753275, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 27140 + }, + { + "epoch": 0.23670440076049606, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 27141 + }, + { + "epoch": 0.23671312204566464, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 27142 + }, + { + "epoch": 0.23672184333083324, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 27143 + }, + { + "epoch": 0.2367305646160018, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 27144 + }, + { + "epoch": 0.2367392859011704, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 27145 + }, + { + "epoch": 0.23674800718633898, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 27146 + }, + { + "epoch": 0.23675672847150755, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 27147 + }, + { + "epoch": 0.23676544975667616, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 27148 + }, + { + "epoch": 0.23677417104184473, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 27149 + }, + { + "epoch": 0.2367828923270133, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 27150 + }, + { + "epoch": 0.2367916136121819, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 27151 + }, + { + "epoch": 0.23680033489735047, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 27152 + }, + { + "epoch": 0.23680905618251905, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 27153 + }, + { + "epoch": 0.23681777746768765, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 27154 + }, + { + "epoch": 0.23682649875285622, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 27155 + }, + { + "epoch": 0.2368352200380248, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 27156 + }, + { + "epoch": 0.2368439413231934, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 27157 + }, + { + "epoch": 0.23685266260836196, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 27158 + }, + { + "epoch": 0.23686138389353056, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 27159 + }, + { + "epoch": 0.23687010517869914, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 27160 + }, + { + "epoch": 0.2368788264638677, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 27161 + }, + { + "epoch": 0.2368875477490363, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 27162 + }, + { + "epoch": 0.23689626903420488, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 27163 + }, + { + "epoch": 0.23690499031937345, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 27164 + }, + { + "epoch": 0.23691371160454205, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 27165 + }, + { + "epoch": 0.23692243288971063, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 27166 + }, + { + "epoch": 0.2369311541748792, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 27167 + }, + { + "epoch": 0.2369398754600478, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 27168 + }, + { + "epoch": 0.23694859674521637, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 27169 + }, + { + "epoch": 0.23695731803038494, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 27170 + }, + { + "epoch": 0.23696603931555354, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 27171 + }, + { + "epoch": 0.23697476060072212, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 27172 + }, + { + "epoch": 0.23698348188589072, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 27173 + }, + { + "epoch": 0.2369922031710593, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 27174 + }, + { + "epoch": 0.23700092445622786, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 27175 + }, + { + "epoch": 0.23700964574139646, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 27176 + }, + { + "epoch": 0.23701836702656504, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 27177 + }, + { + "epoch": 0.2370270883117336, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 27178 + }, + { + "epoch": 0.2370358095969022, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 27179 + }, + { + "epoch": 0.23704453088207078, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 27180 + }, + { + "epoch": 0.23705325216723935, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 27181 + }, + { + "epoch": 0.23706197345240795, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 27182 + }, + { + "epoch": 0.23707069473757653, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 27183 + }, + { + "epoch": 0.2370794160227451, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 27184 + }, + { + "epoch": 0.2370881373079137, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 27185 + }, + { + "epoch": 0.23709685859308227, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 27186 + }, + { + "epoch": 0.23710557987825087, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 27187 + }, + { + "epoch": 0.23711430116341944, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 27188 + }, + { + "epoch": 0.23712302244858802, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 27189 + }, + { + "epoch": 0.23713174373375662, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 27190 + }, + { + "epoch": 0.2371404650189252, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 27191 + }, + { + "epoch": 0.23714918630409376, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 27192 + }, + { + "epoch": 0.23715790758926236, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 27193 + }, + { + "epoch": 0.23716662887443093, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 27194 + }, + { + "epoch": 0.2371753501595995, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 27195 + }, + { + "epoch": 0.2371840714447681, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 27196 + }, + { + "epoch": 0.23719279272993668, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 27197 + }, + { + "epoch": 0.23720151401510525, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 27198 + }, + { + "epoch": 0.23721023530027385, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 27199 + }, + { + "epoch": 0.23721895658544243, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 27200 + }, + { + "epoch": 0.23722767787061103, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 27201 + }, + { + "epoch": 0.2372363991557796, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 27202 + }, + { + "epoch": 0.23724512044094817, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 27203 + }, + { + "epoch": 0.23725384172611677, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 27204 + }, + { + "epoch": 0.23726256301128534, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 27205 + }, + { + "epoch": 0.23727128429645392, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 27206 + }, + { + "epoch": 0.23728000558162252, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 27207 + }, + { + "epoch": 0.2372887268667911, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 27208 + }, + { + "epoch": 0.23729744815195966, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 27209 + }, + { + "epoch": 0.23730616943712826, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9777, + "step": 27210 + }, + { + "epoch": 0.23731489072229683, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 27211 + }, + { + "epoch": 0.2373236120074654, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 27212 + }, + { + "epoch": 0.237332333292634, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 27213 + }, + { + "epoch": 0.23734105457780258, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 27214 + }, + { + "epoch": 0.23734977586297118, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 27215 + }, + { + "epoch": 0.23735849714813975, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 27216 + }, + { + "epoch": 0.23736721843330832, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 27217 + }, + { + "epoch": 0.23737593971847692, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 27218 + }, + { + "epoch": 0.2373846610036455, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 27219 + }, + { + "epoch": 0.23739338228881407, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 27220 + }, + { + "epoch": 0.23740210357398267, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 27221 + }, + { + "epoch": 0.23741082485915124, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 27222 + }, + { + "epoch": 0.23741954614431982, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 27223 + }, + { + "epoch": 0.23742826742948842, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 27224 + }, + { + "epoch": 0.237436988714657, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 27225 + }, + { + "epoch": 0.23744570999982556, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 27226 + }, + { + "epoch": 0.23745443128499416, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 27227 + }, + { + "epoch": 0.23746315257016273, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 27228 + }, + { + "epoch": 0.23747187385533133, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 27229 + }, + { + "epoch": 0.2374805951404999, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 27230 + }, + { + "epoch": 0.23748931642566848, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 27231 + }, + { + "epoch": 0.23749803771083708, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 27232 + }, + { + "epoch": 0.23750675899600565, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 27233 + }, + { + "epoch": 0.23751548028117422, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 27234 + }, + { + "epoch": 0.23752420156634282, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 27235 + }, + { + "epoch": 0.2375329228515114, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 27236 + }, + { + "epoch": 0.23754164413667997, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 27237 + }, + { + "epoch": 0.23755036542184857, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 27238 + }, + { + "epoch": 0.23755908670701714, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 27239 + }, + { + "epoch": 0.23756780799218574, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 27240 + }, + { + "epoch": 0.23757652927735431, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 27241 + }, + { + "epoch": 0.2375852505625229, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 27242 + }, + { + "epoch": 0.2375939718476915, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 27243 + }, + { + "epoch": 0.23760269313286006, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 27244 + }, + { + "epoch": 0.23761141441802863, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 27245 + }, + { + "epoch": 0.23762013570319723, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 27246 + }, + { + "epoch": 0.2376288569883658, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 27247 + }, + { + "epoch": 0.23763757827353438, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 27248 + }, + { + "epoch": 0.23764629955870298, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 27249 + }, + { + "epoch": 0.23765502084387155, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 27250 + }, + { + "epoch": 0.23766374212904012, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 27251 + }, + { + "epoch": 0.23767246341420872, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 27252 + }, + { + "epoch": 0.2376811846993773, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 27253 + }, + { + "epoch": 0.2376899059845459, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 27254 + }, + { + "epoch": 0.23769862726971447, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 27255 + }, + { + "epoch": 0.23770734855488304, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 27256 + }, + { + "epoch": 0.23771606984005164, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 27257 + }, + { + "epoch": 0.23772479112522021, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 27258 + }, + { + "epoch": 0.2377335124103888, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 27259 + }, + { + "epoch": 0.2377422336955574, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 27260 + }, + { + "epoch": 0.23775095498072596, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 27261 + }, + { + "epoch": 0.23775967626589453, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 27262 + }, + { + "epoch": 0.23776839755106313, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 27263 + }, + { + "epoch": 0.2377771188362317, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 27264 + }, + { + "epoch": 0.23778584012140028, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 27265 + }, + { + "epoch": 0.23779456140656888, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 27266 + }, + { + "epoch": 0.23780328269173745, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 27267 + }, + { + "epoch": 0.23781200397690605, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 27268 + }, + { + "epoch": 0.23782072526207462, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 27269 + }, + { + "epoch": 0.2378294465472432, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 27270 + }, + { + "epoch": 0.2378381678324118, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 27271 + }, + { + "epoch": 0.23784688911758037, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 27272 + }, + { + "epoch": 0.23785561040274894, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 27273 + }, + { + "epoch": 0.23786433168791754, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 27274 + }, + { + "epoch": 0.2378730529730861, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 27275 + }, + { + "epoch": 0.23788177425825469, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 27276 + }, + { + "epoch": 0.23789049554342329, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 27277 + }, + { + "epoch": 0.23789921682859186, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 27278 + }, + { + "epoch": 0.23790793811376043, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 27279 + }, + { + "epoch": 0.23791665939892903, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 27280 + }, + { + "epoch": 0.2379253806840976, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 27281 + }, + { + "epoch": 0.2379341019692662, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 27282 + }, + { + "epoch": 0.23794282325443478, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 27283 + }, + { + "epoch": 0.23795154453960335, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 27284 + }, + { + "epoch": 0.23796026582477195, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 27285 + }, + { + "epoch": 0.23796898710994052, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 27286 + }, + { + "epoch": 0.2379777083951091, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 27287 + }, + { + "epoch": 0.2379864296802777, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 27288 + }, + { + "epoch": 0.23799515096544627, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 27289 + }, + { + "epoch": 0.23800387225061484, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 27290 + }, + { + "epoch": 0.23801259353578344, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 27291 + }, + { + "epoch": 0.238021314820952, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 27292 + }, + { + "epoch": 0.23803003610612058, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 27293 + }, + { + "epoch": 0.23803875739128919, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 27294 + }, + { + "epoch": 0.23804747867645776, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 27295 + }, + { + "epoch": 0.23805619996162636, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 27296 + }, + { + "epoch": 0.23806492124679493, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 27297 + }, + { + "epoch": 0.2380736425319635, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 27298 + }, + { + "epoch": 0.2380823638171321, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 27299 + }, + { + "epoch": 0.23809108510230068, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 27300 + }, + { + "epoch": 0.23809980638746925, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 27301 + }, + { + "epoch": 0.23810852767263785, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 27302 + }, + { + "epoch": 0.23811724895780642, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 27303 + }, + { + "epoch": 0.238125970242975, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 27304 + }, + { + "epoch": 0.2381346915281436, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 27305 + }, + { + "epoch": 0.23814341281331217, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 27306 + }, + { + "epoch": 0.23815213409848074, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 27307 + }, + { + "epoch": 0.23816085538364934, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 27308 + }, + { + "epoch": 0.2381695766688179, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 27309 + }, + { + "epoch": 0.2381782979539865, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 27310 + }, + { + "epoch": 0.23818701923915508, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 27311 + }, + { + "epoch": 0.23819574052432366, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 27312 + }, + { + "epoch": 0.23820446180949226, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 27313 + }, + { + "epoch": 0.23821318309466083, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 27314 + }, + { + "epoch": 0.2382219043798294, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 27315 + }, + { + "epoch": 0.238230625664998, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 27316 + }, + { + "epoch": 0.23823934695016658, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 27317 + }, + { + "epoch": 0.23824806823533515, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 27318 + }, + { + "epoch": 0.23825678952050375, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 27319 + }, + { + "epoch": 0.23826551080567232, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 27320 + }, + { + "epoch": 0.2382742320908409, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 27321 + }, + { + "epoch": 0.2382829533760095, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 27322 + }, + { + "epoch": 0.23829167466117807, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 27323 + }, + { + "epoch": 0.23830039594634667, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 27324 + }, + { + "epoch": 0.23830911723151524, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 27325 + }, + { + "epoch": 0.2383178385166838, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 27326 + }, + { + "epoch": 0.2383265598018524, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 27327 + }, + { + "epoch": 0.23833528108702098, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 27328 + }, + { + "epoch": 0.23834400237218956, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 27329 + }, + { + "epoch": 0.23835272365735816, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 27330 + }, + { + "epoch": 0.23836144494252673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 27331 + }, + { + "epoch": 0.2383701662276953, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 27332 + }, + { + "epoch": 0.2383788875128639, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 27333 + }, + { + "epoch": 0.23838760879803247, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 27334 + }, + { + "epoch": 0.23839633008320105, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 27335 + }, + { + "epoch": 0.23840505136836965, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 27336 + }, + { + "epoch": 0.23841377265353822, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 27337 + }, + { + "epoch": 0.23842249393870682, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 27338 + }, + { + "epoch": 0.2384312152238754, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 27339 + }, + { + "epoch": 0.23843993650904396, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 27340 + }, + { + "epoch": 0.23844865779421257, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 27341 + }, + { + "epoch": 0.23845737907938114, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 27342 + }, + { + "epoch": 0.2384661003645497, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 27343 + }, + { + "epoch": 0.2384748216497183, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 27344 + }, + { + "epoch": 0.23848354293488688, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 27345 + }, + { + "epoch": 0.23849226422005546, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 27346 + }, + { + "epoch": 0.23850098550522406, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 0.9839, + "step": 27347 + }, + { + "epoch": 0.23850970679039263, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 27348 + }, + { + "epoch": 0.23851842807556123, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 27349 + }, + { + "epoch": 0.2385271493607298, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 27350 + }, + { + "epoch": 0.23853587064589837, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 27351 + }, + { + "epoch": 0.23854459193106697, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 27352 + }, + { + "epoch": 0.23855331321623555, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 27353 + }, + { + "epoch": 0.23856203450140412, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 27354 + }, + { + "epoch": 0.23857075578657272, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 27355 + }, + { + "epoch": 0.2385794770717413, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 27356 + }, + { + "epoch": 0.23858819835690986, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 27357 + }, + { + "epoch": 0.23859691964207846, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 27358 + }, + { + "epoch": 0.23860564092724704, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 27359 + }, + { + "epoch": 0.2386143622124156, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 27360 + }, + { + "epoch": 0.2386230834975842, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 27361 + }, + { + "epoch": 0.23863180478275278, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 27362 + }, + { + "epoch": 0.23864052606792138, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 27363 + }, + { + "epoch": 0.23864924735308995, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 27364 + }, + { + "epoch": 0.23865796863825853, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 27365 + }, + { + "epoch": 0.23866668992342713, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 27366 + }, + { + "epoch": 0.2386754112085957, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 27367 + }, + { + "epoch": 0.23868413249376427, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 27368 + }, + { + "epoch": 0.23869285377893287, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 27369 + }, + { + "epoch": 0.23870157506410145, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 27370 + }, + { + "epoch": 0.23871029634927002, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 27371 + }, + { + "epoch": 0.23871901763443862, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 27372 + }, + { + "epoch": 0.2387277389196072, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 27373 + }, + { + "epoch": 0.23873646020477576, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 27374 + }, + { + "epoch": 0.23874518148994436, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 27375 + }, + { + "epoch": 0.23875390277511294, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 27376 + }, + { + "epoch": 0.23876262406028154, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 27377 + }, + { + "epoch": 0.2387713453454501, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 27378 + }, + { + "epoch": 0.23878006663061868, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 27379 + }, + { + "epoch": 0.23878878791578728, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 27380 + }, + { + "epoch": 0.23879750920095585, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 27381 + }, + { + "epoch": 0.23880623048612443, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 27382 + }, + { + "epoch": 0.23881495177129303, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 27383 + }, + { + "epoch": 0.2388236730564616, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 27384 + }, + { + "epoch": 0.23883239434163017, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 27385 + }, + { + "epoch": 0.23884111562679877, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 27386 + }, + { + "epoch": 0.23884983691196734, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 27387 + }, + { + "epoch": 0.23885855819713592, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 27388 + }, + { + "epoch": 0.23886727948230452, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 27389 + }, + { + "epoch": 0.2388760007674731, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 27390 + }, + { + "epoch": 0.2388847220526417, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 27391 + }, + { + "epoch": 0.23889344333781026, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 27392 + }, + { + "epoch": 0.23890216462297884, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 27393 + }, + { + "epoch": 0.23891088590814744, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 27394 + }, + { + "epoch": 0.238919607193316, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 27395 + }, + { + "epoch": 0.23892832847848458, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 27396 + }, + { + "epoch": 0.23893704976365318, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 27397 + }, + { + "epoch": 0.23894577104882175, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 27398 + }, + { + "epoch": 0.23895449233399033, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 27399 + }, + { + "epoch": 0.23896321361915893, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 27400 + }, + { + "epoch": 0.2389719349043275, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 27401 + }, + { + "epoch": 0.23898065618949607, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 27402 + }, + { + "epoch": 0.23898937747466467, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 27403 + }, + { + "epoch": 0.23899809875983324, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 27404 + }, + { + "epoch": 0.23900682004500184, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 27405 + }, + { + "epoch": 0.23901554133017042, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 27406 + }, + { + "epoch": 0.239024262615339, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 27407 + }, + { + "epoch": 0.2390329839005076, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 27408 + }, + { + "epoch": 0.23904170518567616, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 27409 + }, + { + "epoch": 0.23905042647084473, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 27410 + }, + { + "epoch": 0.23905914775601333, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 27411 + }, + { + "epoch": 0.2390678690411819, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 27412 + }, + { + "epoch": 0.23907659032635048, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 27413 + }, + { + "epoch": 0.23908531161151908, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 27414 + }, + { + "epoch": 0.23909403289668765, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 27415 + }, + { + "epoch": 0.23910275418185623, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 27416 + }, + { + "epoch": 0.23911147546702483, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 27417 + }, + { + "epoch": 0.2391201967521934, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 27418 + }, + { + "epoch": 0.239128918037362, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 27419 + }, + { + "epoch": 0.23913763932253057, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 27420 + }, + { + "epoch": 0.23914636060769914, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 27421 + }, + { + "epoch": 0.23915508189286774, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 27422 + }, + { + "epoch": 0.23916380317803632, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 27423 + }, + { + "epoch": 0.2391725244632049, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 27424 + }, + { + "epoch": 0.2391812457483735, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 27425 + }, + { + "epoch": 0.23918996703354206, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 27426 + }, + { + "epoch": 0.23919868831871063, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 27427 + }, + { + "epoch": 0.23920740960387923, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 27428 + }, + { + "epoch": 0.2392161308890478, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 27429 + }, + { + "epoch": 0.23922485217421638, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 27430 + }, + { + "epoch": 0.23923357345938498, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 27431 + }, + { + "epoch": 0.23924229474455355, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 27432 + }, + { + "epoch": 0.23925101602972215, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 27433 + }, + { + "epoch": 0.23925973731489072, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 27434 + }, + { + "epoch": 0.2392684586000593, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 27435 + }, + { + "epoch": 0.2392771798852279, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 27436 + }, + { + "epoch": 0.23928590117039647, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 27437 + }, + { + "epoch": 0.23929462245556504, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 27438 + }, + { + "epoch": 0.23930334374073364, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 27439 + }, + { + "epoch": 0.23931206502590222, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 27440 + }, + { + "epoch": 0.2393207863110708, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 27441 + }, + { + "epoch": 0.2393295075962394, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 27442 + }, + { + "epoch": 0.23933822888140796, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 27443 + }, + { + "epoch": 0.23934695016657653, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 27444 + }, + { + "epoch": 0.23935567145174513, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 27445 + }, + { + "epoch": 0.2393643927369137, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 27446 + }, + { + "epoch": 0.2393731140220823, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 27447 + }, + { + "epoch": 0.23938183530725088, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 27448 + }, + { + "epoch": 0.23939055659241945, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 27449 + }, + { + "epoch": 0.23939927787758805, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 27450 + }, + { + "epoch": 0.23940799916275662, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 27451 + }, + { + "epoch": 0.2394167204479252, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 27452 + }, + { + "epoch": 0.2394254417330938, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 27453 + }, + { + "epoch": 0.23943416301826237, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 27454 + }, + { + "epoch": 0.23944288430343094, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 27455 + }, + { + "epoch": 0.23945160558859954, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 27456 + }, + { + "epoch": 0.23946032687376811, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 27457 + }, + { + "epoch": 0.2394690481589367, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 27458 + }, + { + "epoch": 0.2394777694441053, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 27459 + }, + { + "epoch": 0.23948649072927386, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 27460 + }, + { + "epoch": 0.23949521201444246, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 27461 + }, + { + "epoch": 0.23950393329961103, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 27462 + }, + { + "epoch": 0.2395126545847796, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 27463 + }, + { + "epoch": 0.2395213758699482, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 27464 + }, + { + "epoch": 0.23953009715511678, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 27465 + }, + { + "epoch": 0.23953881844028535, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 27466 + }, + { + "epoch": 0.23954753972545395, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 27467 + }, + { + "epoch": 0.23955626101062252, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 27468 + }, + { + "epoch": 0.2395649822957911, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 27469 + }, + { + "epoch": 0.2395737035809597, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 27470 + }, + { + "epoch": 0.23958242486612827, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 27471 + }, + { + "epoch": 0.23959114615129687, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 27472 + }, + { + "epoch": 0.23959986743646544, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 27473 + }, + { + "epoch": 0.239608588721634, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 27474 + }, + { + "epoch": 0.23961731000680261, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 27475 + }, + { + "epoch": 0.2396260312919712, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 27476 + }, + { + "epoch": 0.23963475257713976, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 27477 + }, + { + "epoch": 0.23964347386230836, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 27478 + }, + { + "epoch": 0.23965219514747693, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 27479 + }, + { + "epoch": 0.2396609164326455, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 27480 + }, + { + "epoch": 0.2396696377178141, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 27481 + }, + { + "epoch": 0.23967835900298268, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 27482 + }, + { + "epoch": 0.23968708028815125, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 27483 + }, + { + "epoch": 0.23969580157331985, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 27484 + }, + { + "epoch": 0.23970452285848842, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 27485 + }, + { + "epoch": 0.23971324414365702, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 27486 + }, + { + "epoch": 0.2397219654288256, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 27487 + }, + { + "epoch": 0.23973068671399417, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 27488 + }, + { + "epoch": 0.23973940799916277, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 27489 + }, + { + "epoch": 0.23974812928433134, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 27490 + }, + { + "epoch": 0.2397568505694999, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 27491 + }, + { + "epoch": 0.2397655718546685, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 27492 + }, + { + "epoch": 0.23977429313983709, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 27493 + }, + { + "epoch": 0.23978301442500566, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 27494 + }, + { + "epoch": 0.23979173571017426, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9814, + "step": 27495 + }, + { + "epoch": 0.23980045699534283, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 27496 + }, + { + "epoch": 0.2398091782805114, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 27497 + }, + { + "epoch": 0.23981789956568, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 27498 + }, + { + "epoch": 0.23982662085084858, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 27499 + }, + { + "epoch": 0.23983534213601718, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 27500 + }, + { + "epoch": 0.23984406342118575, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 27501 + }, + { + "epoch": 0.23985278470635432, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 27502 + }, + { + "epoch": 0.23986150599152292, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 27503 + }, + { + "epoch": 0.2398702272766915, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 27504 + }, + { + "epoch": 0.23987894856186007, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 27505 + }, + { + "epoch": 0.23988766984702867, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 27506 + }, + { + "epoch": 0.23989639113219724, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 27507 + }, + { + "epoch": 0.2399051124173658, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 27508 + }, + { + "epoch": 0.2399138337025344, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 27509 + }, + { + "epoch": 0.23992255498770299, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 27510 + }, + { + "epoch": 0.23993127627287156, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 27511 + }, + { + "epoch": 0.23993999755804016, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 27512 + }, + { + "epoch": 0.23994871884320873, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 27513 + }, + { + "epoch": 0.23995744012837733, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 27514 + }, + { + "epoch": 0.2399661614135459, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 27515 + }, + { + "epoch": 0.23997488269871448, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 27516 + }, + { + "epoch": 0.23998360398388308, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 27517 + }, + { + "epoch": 0.23999232526905165, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 27518 + }, + { + "epoch": 0.24000104655422022, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 27519 + }, + { + "epoch": 0.24000976783938882, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 27520 + }, + { + "epoch": 0.2400184891245574, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 27521 + }, + { + "epoch": 0.24002721040972597, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 27522 + }, + { + "epoch": 0.24003593169489457, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.978, + "step": 27523 + }, + { + "epoch": 0.24004465298006314, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 27524 + }, + { + "epoch": 0.2400533742652317, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 27525 + }, + { + "epoch": 0.2400620955504003, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 27526 + }, + { + "epoch": 0.24007081683556888, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 27527 + }, + { + "epoch": 0.24007953812073748, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 27528 + }, + { + "epoch": 0.24008825940590606, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 27529 + }, + { + "epoch": 0.24009698069107463, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 27530 + }, + { + "epoch": 0.24010570197624323, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 27531 + }, + { + "epoch": 0.2401144232614118, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 27532 + }, + { + "epoch": 0.24012314454658037, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 27533 + }, + { + "epoch": 0.24013186583174898, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 27534 + }, + { + "epoch": 0.24014058711691755, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 27535 + }, + { + "epoch": 0.24014930840208612, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 27536 + }, + { + "epoch": 0.24015802968725472, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 27537 + }, + { + "epoch": 0.2401667509724233, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 27538 + }, + { + "epoch": 0.24017547225759187, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 27539 + }, + { + "epoch": 0.24018419354276047, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 27540 + }, + { + "epoch": 0.24019291482792904, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 27541 + }, + { + "epoch": 0.24020163611309764, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 27542 + }, + { + "epoch": 0.2402103573982662, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 27543 + }, + { + "epoch": 0.24021907868343478, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 27544 + }, + { + "epoch": 0.24022779996860338, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 27545 + }, + { + "epoch": 0.24023652125377196, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 27546 + }, + { + "epoch": 0.24024524253894053, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 27547 + }, + { + "epoch": 0.24025396382410913, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 27548 + }, + { + "epoch": 0.2402626851092777, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 27549 + }, + { + "epoch": 0.24027140639444627, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 27550 + }, + { + "epoch": 0.24028012767961487, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 27551 + }, + { + "epoch": 0.24028884896478345, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 27552 + }, + { + "epoch": 0.24029757024995202, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 27553 + }, + { + "epoch": 0.24030629153512062, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 27554 + }, + { + "epoch": 0.2403150128202892, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 27555 + }, + { + "epoch": 0.2403237341054578, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 27556 + }, + { + "epoch": 0.24033245539062636, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 27557 + }, + { + "epoch": 0.24034117667579494, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 27558 + }, + { + "epoch": 0.24034989796096354, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 27559 + }, + { + "epoch": 0.2403586192461321, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 27560 + }, + { + "epoch": 0.24036734053130068, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 27561 + }, + { + "epoch": 0.24037606181646928, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 27562 + }, + { + "epoch": 0.24038478310163786, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 27563 + }, + { + "epoch": 0.24039350438680643, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 27564 + }, + { + "epoch": 0.24040222567197503, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 27565 + }, + { + "epoch": 0.2404109469571436, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 27566 + }, + { + "epoch": 0.24041966824231217, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 27567 + }, + { + "epoch": 0.24042838952748077, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 27568 + }, + { + "epoch": 0.24043711081264935, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 27569 + }, + { + "epoch": 0.24044583209781795, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 27570 + }, + { + "epoch": 0.24045455338298652, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 27571 + }, + { + "epoch": 0.2404632746681551, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 27572 + }, + { + "epoch": 0.2404719959533237, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 27573 + }, + { + "epoch": 0.24048071723849226, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 27574 + }, + { + "epoch": 0.24048943852366084, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 27575 + }, + { + "epoch": 0.24049815980882944, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 27576 + }, + { + "epoch": 0.240506881093998, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 27577 + }, + { + "epoch": 0.24051560237916658, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 27578 + }, + { + "epoch": 0.24052432366433518, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 27579 + }, + { + "epoch": 0.24053304494950375, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 27580 + }, + { + "epoch": 0.24054176623467236, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 27581 + }, + { + "epoch": 0.24055048751984093, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 27582 + }, + { + "epoch": 0.2405592088050095, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 27583 + }, + { + "epoch": 0.2405679300901781, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 27584 + }, + { + "epoch": 0.24057665137534667, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 27585 + }, + { + "epoch": 0.24058537266051525, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 27586 + }, + { + "epoch": 0.24059409394568385, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 27587 + }, + { + "epoch": 0.24060281523085242, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 27588 + }, + { + "epoch": 0.240611536516021, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 27589 + }, + { + "epoch": 0.2406202578011896, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 27590 + }, + { + "epoch": 0.24062897908635816, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 27591 + }, + { + "epoch": 0.24063770037152674, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 27592 + }, + { + "epoch": 0.24064642165669534, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 27593 + }, + { + "epoch": 0.2406551429418639, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 27594 + }, + { + "epoch": 0.2406638642270325, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 27595 + }, + { + "epoch": 0.24067258551220108, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 27596 + }, + { + "epoch": 0.24068130679736965, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 27597 + }, + { + "epoch": 0.24069002808253825, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 27598 + }, + { + "epoch": 0.24069874936770683, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 27599 + }, + { + "epoch": 0.2407074706528754, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 27600 + }, + { + "epoch": 0.240716191938044, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 27601 + }, + { + "epoch": 0.24072491322321257, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 27602 + }, + { + "epoch": 0.24073363450838114, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 27603 + }, + { + "epoch": 0.24074235579354974, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 27604 + }, + { + "epoch": 0.24075107707871832, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 27605 + }, + { + "epoch": 0.2407597983638869, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 27606 + }, + { + "epoch": 0.2407685196490555, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 27607 + }, + { + "epoch": 0.24077724093422406, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 27608 + }, + { + "epoch": 0.24078596221939266, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 27609 + }, + { + "epoch": 0.24079468350456124, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 27610 + }, + { + "epoch": 0.2408034047897298, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 27611 + }, + { + "epoch": 0.2408121260748984, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 27612 + }, + { + "epoch": 0.24082084736006698, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 27613 + }, + { + "epoch": 0.24082956864523555, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 27614 + }, + { + "epoch": 0.24083828993040415, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 27615 + }, + { + "epoch": 0.24084701121557273, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 27616 + }, + { + "epoch": 0.2408557325007413, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 27617 + }, + { + "epoch": 0.2408644537859099, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 27618 + }, + { + "epoch": 0.24087317507107847, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 27619 + }, + { + "epoch": 0.24088189635624704, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 27620 + }, + { + "epoch": 0.24089061764141564, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 27621 + }, + { + "epoch": 0.24089933892658422, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 27622 + }, + { + "epoch": 0.24090806021175282, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 27623 + }, + { + "epoch": 0.2409167814969214, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 27624 + }, + { + "epoch": 0.24092550278208996, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 27625 + }, + { + "epoch": 0.24093422406725856, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 27626 + }, + { + "epoch": 0.24094294535242713, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 27627 + }, + { + "epoch": 0.2409516666375957, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 27628 + }, + { + "epoch": 0.2409603879227643, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 27629 + }, + { + "epoch": 0.24096910920793288, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 27630 + }, + { + "epoch": 0.24097783049310145, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 27631 + }, + { + "epoch": 0.24098655177827005, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 27632 + }, + { + "epoch": 0.24099527306343863, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 27633 + }, + { + "epoch": 0.2410039943486072, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 27634 + }, + { + "epoch": 0.2410127156337758, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 27635 + }, + { + "epoch": 0.24102143691894437, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 27636 + }, + { + "epoch": 0.24103015820411297, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 27637 + }, + { + "epoch": 0.24103887948928154, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 27638 + }, + { + "epoch": 0.24104760077445012, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 27639 + }, + { + "epoch": 0.24105632205961872, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 27640 + }, + { + "epoch": 0.2410650433447873, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 27641 + }, + { + "epoch": 0.24107376462995586, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 27642 + }, + { + "epoch": 0.24108248591512446, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 27643 + }, + { + "epoch": 0.24109120720029303, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 27644 + }, + { + "epoch": 0.2410999284854616, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 27645 + }, + { + "epoch": 0.2411086497706302, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 27646 + }, + { + "epoch": 0.24111737105579878, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 27647 + }, + { + "epoch": 0.24112609234096735, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 27648 + }, + { + "epoch": 0.24113481362613595, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 27649 + }, + { + "epoch": 0.24114353491130452, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 27650 + }, + { + "epoch": 0.24115225619647312, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 27651 + }, + { + "epoch": 0.2411609774816417, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 27652 + }, + { + "epoch": 0.24116969876681027, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 27653 + }, + { + "epoch": 0.24117842005197887, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 27654 + }, + { + "epoch": 0.24118714133714744, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 27655 + }, + { + "epoch": 0.24119586262231602, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 27656 + }, + { + "epoch": 0.24120458390748462, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 27657 + }, + { + "epoch": 0.2412133051926532, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 27658 + }, + { + "epoch": 0.24122202647782176, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 27659 + }, + { + "epoch": 0.24123074776299036, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 27660 + }, + { + "epoch": 0.24123946904815893, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 27661 + }, + { + "epoch": 0.2412481903333275, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 27662 + }, + { + "epoch": 0.2412569116184961, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 27663 + }, + { + "epoch": 0.24126563290366468, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 27664 + }, + { + "epoch": 0.24127435418883328, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 27665 + }, + { + "epoch": 0.24128307547400185, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 27666 + }, + { + "epoch": 0.24129179675917042, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 27667 + }, + { + "epoch": 0.24130051804433902, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 27668 + }, + { + "epoch": 0.2413092393295076, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 27669 + }, + { + "epoch": 0.24131796061467617, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 27670 + }, + { + "epoch": 0.24132668189984477, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 27671 + }, + { + "epoch": 0.24133540318501334, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 27672 + }, + { + "epoch": 0.24134412447018191, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 27673 + }, + { + "epoch": 0.24135284575535051, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 27674 + }, + { + "epoch": 0.2413615670405191, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 27675 + }, + { + "epoch": 0.24137028832568766, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 27676 + }, + { + "epoch": 0.24137900961085626, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 27677 + }, + { + "epoch": 0.24138773089602483, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 27678 + }, + { + "epoch": 0.24139645218119343, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 27679 + }, + { + "epoch": 0.241405173466362, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 27680 + }, + { + "epoch": 0.24141389475153058, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 27681 + }, + { + "epoch": 0.24142261603669918, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 27682 + }, + { + "epoch": 0.24143133732186775, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 27683 + }, + { + "epoch": 0.24144005860703632, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 27684 + }, + { + "epoch": 0.24144877989220492, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 27685 + }, + { + "epoch": 0.2414575011773735, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 27686 + }, + { + "epoch": 0.24146622246254207, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 27687 + }, + { + "epoch": 0.24147494374771067, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 27688 + }, + { + "epoch": 0.24148366503287924, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 27689 + }, + { + "epoch": 0.24149238631804784, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 27690 + }, + { + "epoch": 0.24150110760321641, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 27691 + }, + { + "epoch": 0.241509828888385, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 27692 + }, + { + "epoch": 0.2415185501735536, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 27693 + }, + { + "epoch": 0.24152727145872216, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 27694 + }, + { + "epoch": 0.24153599274389073, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 27695 + }, + { + "epoch": 0.24154471402905933, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 27696 + }, + { + "epoch": 0.2415534353142279, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9786, + "step": 27697 + }, + { + "epoch": 0.24156215659939648, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 27698 + }, + { + "epoch": 0.24157087788456508, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 27699 + }, + { + "epoch": 0.24157959916973365, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 27700 + }, + { + "epoch": 0.24158832045490222, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 27701 + }, + { + "epoch": 0.24159704174007082, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 27702 + }, + { + "epoch": 0.2416057630252394, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 27703 + }, + { + "epoch": 0.241614484310408, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 27704 + }, + { + "epoch": 0.24162320559557657, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 27705 + }, + { + "epoch": 0.24163192688074514, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 27706 + }, + { + "epoch": 0.24164064816591374, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 27707 + }, + { + "epoch": 0.2416493694510823, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 27708 + }, + { + "epoch": 0.24165809073625089, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 27709 + }, + { + "epoch": 0.24166681202141949, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 27710 + }, + { + "epoch": 0.24167553330658806, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 27711 + }, + { + "epoch": 0.24168425459175663, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 27712 + }, + { + "epoch": 0.24169297587692523, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 27713 + }, + { + "epoch": 0.2417016971620938, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 27714 + }, + { + "epoch": 0.24171041844726238, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 27715 + }, + { + "epoch": 0.24171913973243098, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 27716 + }, + { + "epoch": 0.24172786101759955, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 27717 + }, + { + "epoch": 0.24173658230276815, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 27718 + }, + { + "epoch": 0.24174530358793672, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 27719 + }, + { + "epoch": 0.2417540248731053, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 27720 + }, + { + "epoch": 0.2417627461582739, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 27721 + }, + { + "epoch": 0.24177146744344247, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 27722 + }, + { + "epoch": 0.24178018872861104, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 27723 + }, + { + "epoch": 0.24178891001377964, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 27724 + }, + { + "epoch": 0.2417976312989482, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 27725 + }, + { + "epoch": 0.24180635258411678, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 27726 + }, + { + "epoch": 0.24181507386928539, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 27727 + }, + { + "epoch": 0.24182379515445396, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 27728 + }, + { + "epoch": 0.24183251643962253, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 27729 + }, + { + "epoch": 0.24184123772479113, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 27730 + }, + { + "epoch": 0.2418499590099597, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 27731 + }, + { + "epoch": 0.2418586802951283, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 27732 + }, + { + "epoch": 0.24186740158029688, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 27733 + }, + { + "epoch": 0.24187612286546545, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 27734 + }, + { + "epoch": 0.24188484415063405, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 27735 + }, + { + "epoch": 0.24189356543580262, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 27736 + }, + { + "epoch": 0.2419022867209712, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 27737 + }, + { + "epoch": 0.2419110080061398, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 27738 + }, + { + "epoch": 0.24191972929130837, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 27739 + }, + { + "epoch": 0.24192845057647694, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 27740 + }, + { + "epoch": 0.24193717186164554, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 27741 + }, + { + "epoch": 0.2419458931468141, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 27742 + }, + { + "epoch": 0.24195461443198268, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 27743 + }, + { + "epoch": 0.24196333571715128, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 27744 + }, + { + "epoch": 0.24197205700231986, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 27745 + }, + { + "epoch": 0.24198077828748846, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 27746 + }, + { + "epoch": 0.24198949957265703, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 27747 + }, + { + "epoch": 0.2419982208578256, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 27748 + }, + { + "epoch": 0.2420069421429942, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 27749 + }, + { + "epoch": 0.24201566342816277, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 27750 + }, + { + "epoch": 0.24202438471333135, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 27751 + }, + { + "epoch": 0.24203310599849995, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 27752 + }, + { + "epoch": 0.24204182728366852, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 27753 + }, + { + "epoch": 0.2420505485688371, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 27754 + }, + { + "epoch": 0.2420592698540057, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 27755 + }, + { + "epoch": 0.24206799113917427, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 27756 + }, + { + "epoch": 0.24207671242434284, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 27757 + }, + { + "epoch": 0.24208543370951144, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 27758 + }, + { + "epoch": 0.24209415499468, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 27759 + }, + { + "epoch": 0.2421028762798486, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 27760 + }, + { + "epoch": 0.24211159756501718, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 27761 + }, + { + "epoch": 0.24212031885018576, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 27762 + }, + { + "epoch": 0.24212904013535436, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 27763 + }, + { + "epoch": 0.24213776142052293, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 27764 + }, + { + "epoch": 0.2421464827056915, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 27765 + }, + { + "epoch": 0.2421552039908601, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 27766 + }, + { + "epoch": 0.24216392527602867, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 27767 + }, + { + "epoch": 0.24217264656119725, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 27768 + }, + { + "epoch": 0.24218136784636585, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 27769 + }, + { + "epoch": 0.24219008913153442, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 27770 + }, + { + "epoch": 0.242198810416703, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 27771 + }, + { + "epoch": 0.2422075317018716, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 27772 + }, + { + "epoch": 0.24221625298704016, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 27773 + }, + { + "epoch": 0.24222497427220877, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 27774 + }, + { + "epoch": 0.24223369555737734, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 27775 + }, + { + "epoch": 0.2422424168425459, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 27776 + }, + { + "epoch": 0.2422511381277145, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 27777 + }, + { + "epoch": 0.24225985941288308, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 27778 + }, + { + "epoch": 0.24226858069805166, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 27779 + }, + { + "epoch": 0.24227730198322026, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 27780 + }, + { + "epoch": 0.24228602326838883, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 27781 + }, + { + "epoch": 0.2422947445535574, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 27782 + }, + { + "epoch": 0.242303465838726, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 27783 + }, + { + "epoch": 0.24231218712389457, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 27784 + }, + { + "epoch": 0.24232090840906315, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 27785 + }, + { + "epoch": 0.24232962969423175, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 27786 + }, + { + "epoch": 0.24233835097940032, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 27787 + }, + { + "epoch": 0.24234707226456892, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 27788 + }, + { + "epoch": 0.2423557935497375, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 27789 + }, + { + "epoch": 0.24236451483490606, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 27790 + }, + { + "epoch": 0.24237323612007466, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 27791 + }, + { + "epoch": 0.24238195740524324, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 27792 + }, + { + "epoch": 0.2423906786904118, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 27793 + }, + { + "epoch": 0.2423993999755804, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 27794 + }, + { + "epoch": 0.24240812126074898, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 27795 + }, + { + "epoch": 0.24241684254591755, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 27796 + }, + { + "epoch": 0.24242556383108615, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 27797 + }, + { + "epoch": 0.24243428511625473, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 27798 + }, + { + "epoch": 0.2424430064014233, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 27799 + }, + { + "epoch": 0.2424517276865919, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 27800 + }, + { + "epoch": 0.24246044897176047, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 27801 + }, + { + "epoch": 0.24246917025692907, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 27802 + }, + { + "epoch": 0.24247789154209765, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 27803 + }, + { + "epoch": 0.24248661282726622, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 27804 + }, + { + "epoch": 0.24249533411243482, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 27805 + }, + { + "epoch": 0.2425040553976034, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 27806 + }, + { + "epoch": 0.24251277668277196, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 27807 + }, + { + "epoch": 0.24252149796794056, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 27808 + }, + { + "epoch": 0.24253021925310914, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 27809 + }, + { + "epoch": 0.2425389405382777, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 27810 + }, + { + "epoch": 0.2425476618234463, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 27811 + }, + { + "epoch": 0.24255638310861488, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 27812 + }, + { + "epoch": 0.24256510439378348, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 27813 + }, + { + "epoch": 0.24257382567895205, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 27814 + }, + { + "epoch": 0.24258254696412063, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 27815 + }, + { + "epoch": 0.24259126824928923, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 27816 + }, + { + "epoch": 0.2425999895344578, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 27817 + }, + { + "epoch": 0.24260871081962637, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 27818 + }, + { + "epoch": 0.24261743210479497, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 27819 + }, + { + "epoch": 0.24262615338996354, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 27820 + }, + { + "epoch": 0.24263487467513212, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 27821 + }, + { + "epoch": 0.24264359596030072, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 27822 + }, + { + "epoch": 0.2426523172454693, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 27823 + }, + { + "epoch": 0.24266103853063786, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 27824 + }, + { + "epoch": 0.24266975981580646, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 27825 + }, + { + "epoch": 0.24267848110097504, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 27826 + }, + { + "epoch": 0.24268720238614364, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 27827 + }, + { + "epoch": 0.2426959236713122, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 27828 + }, + { + "epoch": 0.24270464495648078, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 27829 + }, + { + "epoch": 0.24271336624164938, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 27830 + }, + { + "epoch": 0.24272208752681795, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 27831 + }, + { + "epoch": 0.24273080881198653, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 27832 + }, + { + "epoch": 0.24273953009715513, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 27833 + }, + { + "epoch": 0.2427482513823237, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 27834 + }, + { + "epoch": 0.24275697266749227, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 27835 + }, + { + "epoch": 0.24276569395266087, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 27836 + }, + { + "epoch": 0.24277441523782944, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 27837 + }, + { + "epoch": 0.24278313652299802, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 27838 + }, + { + "epoch": 0.24279185780816662, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 27839 + }, + { + "epoch": 0.2428005790933352, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 27840 + }, + { + "epoch": 0.2428093003785038, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 27841 + }, + { + "epoch": 0.24281802166367236, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 27842 + }, + { + "epoch": 0.24282674294884093, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 27843 + }, + { + "epoch": 0.24283546423400953, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 27844 + }, + { + "epoch": 0.2428441855191781, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 27845 + }, + { + "epoch": 0.24285290680434668, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 27846 + }, + { + "epoch": 0.24286162808951528, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 27847 + }, + { + "epoch": 0.24287034937468385, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 27848 + }, + { + "epoch": 0.24287907065985243, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 27849 + }, + { + "epoch": 0.24288779194502103, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 27850 + }, + { + "epoch": 0.2428965132301896, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 27851 + }, + { + "epoch": 0.24290523451535817, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 27852 + }, + { + "epoch": 0.24291395580052677, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 27853 + }, + { + "epoch": 0.24292267708569534, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 27854 + }, + { + "epoch": 0.24293139837086394, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 27855 + }, + { + "epoch": 0.24294011965603252, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 27856 + }, + { + "epoch": 0.2429488409412011, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 27857 + }, + { + "epoch": 0.2429575622263697, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 27858 + }, + { + "epoch": 0.24296628351153826, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 27859 + }, + { + "epoch": 0.24297500479670683, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 27860 + }, + { + "epoch": 0.24298372608187543, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 27861 + }, + { + "epoch": 0.242992447367044, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 27862 + }, + { + "epoch": 0.24300116865221258, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 27863 + }, + { + "epoch": 0.24300988993738118, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 27864 + }, + { + "epoch": 0.24301861122254975, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 27865 + }, + { + "epoch": 0.24302733250771832, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 27866 + }, + { + "epoch": 0.24303605379288692, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 27867 + }, + { + "epoch": 0.2430447750780555, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 27868 + }, + { + "epoch": 0.2430534963632241, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 27869 + }, + { + "epoch": 0.24306221764839267, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 27870 + }, + { + "epoch": 0.24307093893356124, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 27871 + }, + { + "epoch": 0.24307966021872984, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 27872 + }, + { + "epoch": 0.24308838150389842, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 27873 + }, + { + "epoch": 0.243097102789067, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 27874 + }, + { + "epoch": 0.2431058240742356, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 27875 + }, + { + "epoch": 0.24311454535940416, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 27876 + }, + { + "epoch": 0.24312326664457273, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 27877 + }, + { + "epoch": 0.24313198792974133, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 27878 + }, + { + "epoch": 0.2431407092149099, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 27879 + }, + { + "epoch": 0.24314943050007848, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 27880 + }, + { + "epoch": 0.24315815178524708, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 27881 + }, + { + "epoch": 0.24316687307041565, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 27882 + }, + { + "epoch": 0.24317559435558425, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 27883 + }, + { + "epoch": 0.24318431564075282, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 27884 + }, + { + "epoch": 0.2431930369259214, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 27885 + }, + { + "epoch": 0.24320175821109, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 27886 + }, + { + "epoch": 0.24321047949625857, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 27887 + }, + { + "epoch": 0.24321920078142714, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 27888 + }, + { + "epoch": 0.24322792206659574, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 27889 + }, + { + "epoch": 0.24323664335176431, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 27890 + }, + { + "epoch": 0.2432453646369329, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 27891 + }, + { + "epoch": 0.2432540859221015, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 27892 + }, + { + "epoch": 0.24326280720727006, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 27893 + }, + { + "epoch": 0.24327152849243863, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 27894 + }, + { + "epoch": 0.24328024977760723, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 27895 + }, + { + "epoch": 0.2432889710627758, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 27896 + }, + { + "epoch": 0.2432976923479444, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 27897 + }, + { + "epoch": 0.24330641363311298, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 27898 + }, + { + "epoch": 0.24331513491828155, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 27899 + }, + { + "epoch": 0.24332385620345015, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 27900 + }, + { + "epoch": 0.24333257748861872, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 27901 + }, + { + "epoch": 0.2433412987737873, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 27902 + }, + { + "epoch": 0.2433500200589559, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 27903 + }, + { + "epoch": 0.24335874134412447, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 27904 + }, + { + "epoch": 0.24336746262929304, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 27905 + }, + { + "epoch": 0.24337618391446164, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 27906 + }, + { + "epoch": 0.2433849051996302, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 27907 + }, + { + "epoch": 0.2433936264847988, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 27908 + }, + { + "epoch": 0.2434023477699674, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 27909 + }, + { + "epoch": 0.24341106905513596, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 27910 + }, + { + "epoch": 0.24341979034030456, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 27911 + }, + { + "epoch": 0.24342851162547313, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 27912 + }, + { + "epoch": 0.2434372329106417, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 27913 + }, + { + "epoch": 0.2434459541958103, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 27914 + }, + { + "epoch": 0.24345467548097888, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 27915 + }, + { + "epoch": 0.24346339676614745, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 27916 + }, + { + "epoch": 0.24347211805131605, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 27917 + }, + { + "epoch": 0.24348083933648462, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 27918 + }, + { + "epoch": 0.2434895606216532, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 27919 + }, + { + "epoch": 0.2434982819068218, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 27920 + }, + { + "epoch": 0.24350700319199037, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 27921 + }, + { + "epoch": 0.24351572447715897, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 27922 + }, + { + "epoch": 0.24352444576232754, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 27923 + }, + { + "epoch": 0.2435331670474961, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 27924 + }, + { + "epoch": 0.2435418883326647, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 27925 + }, + { + "epoch": 0.24355060961783329, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 27926 + }, + { + "epoch": 0.24355933090300186, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 27927 + }, + { + "epoch": 0.24356805218817046, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 27928 + }, + { + "epoch": 0.24357677347333903, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 27929 + }, + { + "epoch": 0.2435854947585076, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 27930 + }, + { + "epoch": 0.2435942160436762, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 27931 + }, + { + "epoch": 0.24360293732884478, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 27932 + }, + { + "epoch": 0.24361165861401335, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 27933 + }, + { + "epoch": 0.24362037989918195, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 27934 + }, + { + "epoch": 0.24362910118435052, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 27935 + }, + { + "epoch": 0.24363782246951912, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 27936 + }, + { + "epoch": 0.2436465437546877, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 27937 + }, + { + "epoch": 0.24365526503985627, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 27938 + }, + { + "epoch": 0.24366398632502487, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 27939 + }, + { + "epoch": 0.24367270761019344, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 27940 + }, + { + "epoch": 0.243681428895362, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 27941 + }, + { + "epoch": 0.2436901501805306, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 27942 + }, + { + "epoch": 0.24369887146569919, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 27943 + }, + { + "epoch": 0.24370759275086776, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 27944 + }, + { + "epoch": 0.24371631403603636, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 27945 + }, + { + "epoch": 0.24372503532120493, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 27946 + }, + { + "epoch": 0.2437337566063735, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 27947 + }, + { + "epoch": 0.2437424778915421, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 27948 + }, + { + "epoch": 0.24375119917671068, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 27949 + }, + { + "epoch": 0.24375992046187928, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 27950 + }, + { + "epoch": 0.24376864174704785, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 27951 + }, + { + "epoch": 0.24377736303221642, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 27952 + }, + { + "epoch": 0.24378608431738502, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 27953 + }, + { + "epoch": 0.2437948056025536, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 27954 + }, + { + "epoch": 0.24380352688772217, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 27955 + }, + { + "epoch": 0.24381224817289077, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 27956 + }, + { + "epoch": 0.24382096945805934, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 27957 + }, + { + "epoch": 0.2438296907432279, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 27958 + }, + { + "epoch": 0.2438384120283965, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 27959 + }, + { + "epoch": 0.24384713331356508, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 27960 + }, + { + "epoch": 0.24385585459873366, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 27961 + }, + { + "epoch": 0.24386457588390226, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 27962 + }, + { + "epoch": 0.24387329716907083, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 27963 + }, + { + "epoch": 0.24388201845423943, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 27964 + }, + { + "epoch": 0.243890739739408, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 27965 + }, + { + "epoch": 0.24389946102457657, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 27966 + }, + { + "epoch": 0.24390818230974518, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 27967 + }, + { + "epoch": 0.24391690359491375, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 27968 + }, + { + "epoch": 0.24392562488008232, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 27969 + }, + { + "epoch": 0.24393434616525092, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 27970 + }, + { + "epoch": 0.2439430674504195, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 27971 + }, + { + "epoch": 0.24395178873558807, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 27972 + }, + { + "epoch": 0.24396051002075667, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 27973 + }, + { + "epoch": 0.24396923130592524, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 27974 + }, + { + "epoch": 0.2439779525910938, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 27975 + }, + { + "epoch": 0.2439866738762624, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 27976 + }, + { + "epoch": 0.24399539516143098, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 27977 + }, + { + "epoch": 0.24400411644659958, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 27978 + }, + { + "epoch": 0.24401283773176816, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 27979 + }, + { + "epoch": 0.24402155901693673, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 27980 + }, + { + "epoch": 0.24403028030210533, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 27981 + }, + { + "epoch": 0.2440390015872739, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 27982 + }, + { + "epoch": 0.24404772287244247, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 27983 + }, + { + "epoch": 0.24405644415761107, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 27984 + }, + { + "epoch": 0.24406516544277965, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 27985 + }, + { + "epoch": 0.24407388672794822, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 27986 + }, + { + "epoch": 0.24408260801311682, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 27987 + }, + { + "epoch": 0.2440913292982854, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 27988 + }, + { + "epoch": 0.24410005058345396, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 27989 + }, + { + "epoch": 0.24410877186862256, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 27990 + }, + { + "epoch": 0.24411749315379114, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 27991 + }, + { + "epoch": 0.24412621443895974, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 27992 + }, + { + "epoch": 0.2441349357241283, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 27993 + }, + { + "epoch": 0.24414365700929688, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 27994 + }, + { + "epoch": 0.24415237829446548, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 27995 + }, + { + "epoch": 0.24416109957963406, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 27996 + }, + { + "epoch": 0.24416982086480263, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 27997 + }, + { + "epoch": 0.24417854214997123, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 27998 + }, + { + "epoch": 0.2441872634351398, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 27999 + }, + { + "epoch": 0.24419598472030837, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 28000 + }, + { + "epoch": 0.24420470600547697, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 28001 + }, + { + "epoch": 0.24421342729064555, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 28002 + }, + { + "epoch": 0.24422214857581412, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 28003 + }, + { + "epoch": 0.24423086986098272, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 28004 + }, + { + "epoch": 0.2442395911461513, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 28005 + }, + { + "epoch": 0.2442483124313199, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 28006 + }, + { + "epoch": 0.24425703371648846, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 28007 + }, + { + "epoch": 0.24426575500165704, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 28008 + }, + { + "epoch": 0.24427447628682564, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 28009 + }, + { + "epoch": 0.2442831975719942, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 28010 + }, + { + "epoch": 0.24429191885716278, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 28011 + }, + { + "epoch": 0.24430064014233138, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 28012 + }, + { + "epoch": 0.24430936142749995, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 28013 + }, + { + "epoch": 0.24431808271266853, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 28014 + }, + { + "epoch": 0.24432680399783713, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 28015 + }, + { + "epoch": 0.2443355252830057, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 28016 + }, + { + "epoch": 0.24434424656817427, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 28017 + }, + { + "epoch": 0.24435296785334287, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 28018 + }, + { + "epoch": 0.24436168913851145, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 28019 + }, + { + "epoch": 0.24437041042368005, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 28020 + }, + { + "epoch": 0.24437913170884862, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 28021 + }, + { + "epoch": 0.2443878529940172, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 28022 + }, + { + "epoch": 0.2443965742791858, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 28023 + }, + { + "epoch": 0.24440529556435436, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 28024 + }, + { + "epoch": 0.24441401684952294, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 28025 + }, + { + "epoch": 0.24442273813469154, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 28026 + }, + { + "epoch": 0.2444314594198601, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 28027 + }, + { + "epoch": 0.24444018070502868, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 28028 + }, + { + "epoch": 0.24444890199019728, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 28029 + }, + { + "epoch": 0.24445762327536585, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 28030 + }, + { + "epoch": 0.24446634456053443, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 28031 + }, + { + "epoch": 0.24447506584570303, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 28032 + }, + { + "epoch": 0.2444837871308716, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 28033 + }, + { + "epoch": 0.2444925084160402, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 28034 + }, + { + "epoch": 0.24450122970120877, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 28035 + }, + { + "epoch": 0.24450995098637734, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 28036 + }, + { + "epoch": 0.24451867227154594, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 28037 + }, + { + "epoch": 0.24452739355671452, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 28038 + }, + { + "epoch": 0.2445361148418831, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 28039 + }, + { + "epoch": 0.2445448361270517, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 28040 + }, + { + "epoch": 0.24455355741222026, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 28041 + }, + { + "epoch": 0.24456227869738884, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 28042 + }, + { + "epoch": 0.24457099998255744, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 28043 + }, + { + "epoch": 0.244579721267726, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 28044 + }, + { + "epoch": 0.2445884425528946, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 28045 + }, + { + "epoch": 0.24459716383806318, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 28046 + }, + { + "epoch": 0.24460588512323175, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 28047 + }, + { + "epoch": 0.24461460640840035, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 28048 + }, + { + "epoch": 0.24462332769356893, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 28049 + }, + { + "epoch": 0.2446320489787375, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 28050 + }, + { + "epoch": 0.2446407702639061, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 28051 + }, + { + "epoch": 0.24464949154907467, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 28052 + }, + { + "epoch": 0.24465821283424324, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 28053 + }, + { + "epoch": 0.24466693411941184, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 28054 + }, + { + "epoch": 0.24467565540458042, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 28055 + }, + { + "epoch": 0.244684376689749, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 28056 + }, + { + "epoch": 0.2446930979749176, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 28057 + }, + { + "epoch": 0.24470181926008616, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 28058 + }, + { + "epoch": 0.24471054054525476, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 28059 + }, + { + "epoch": 0.24471926183042333, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 28060 + }, + { + "epoch": 0.2447279831155919, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 28061 + }, + { + "epoch": 0.2447367044007605, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 28062 + }, + { + "epoch": 0.24474542568592908, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 28063 + }, + { + "epoch": 0.24475414697109765, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 28064 + }, + { + "epoch": 0.24476286825626625, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 28065 + }, + { + "epoch": 0.24477158954143483, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 28066 + }, + { + "epoch": 0.2447803108266034, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 28067 + }, + { + "epoch": 0.244789032111772, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 28068 + }, + { + "epoch": 0.24479775339694057, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 28069 + }, + { + "epoch": 0.24480647468210914, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 28070 + }, + { + "epoch": 0.24481519596727774, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 28071 + }, + { + "epoch": 0.24482391725244632, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 28072 + }, + { + "epoch": 0.24483263853761492, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 28073 + }, + { + "epoch": 0.2448413598227835, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 28074 + }, + { + "epoch": 0.24485008110795206, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 28075 + }, + { + "epoch": 0.24485880239312066, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 28076 + }, + { + "epoch": 0.24486752367828923, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 28077 + }, + { + "epoch": 0.2448762449634578, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 28078 + }, + { + "epoch": 0.2448849662486264, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 28079 + }, + { + "epoch": 0.24489368753379498, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 28080 + }, + { + "epoch": 0.24490240881896355, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 28081 + }, + { + "epoch": 0.24491113010413215, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 28082 + }, + { + "epoch": 0.24491985138930072, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 28083 + }, + { + "epoch": 0.2449285726744693, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 28084 + }, + { + "epoch": 0.2449372939596379, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 28085 + }, + { + "epoch": 0.24494601524480647, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 28086 + }, + { + "epoch": 0.24495473652997507, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 28087 + }, + { + "epoch": 0.24496345781514364, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 28088 + }, + { + "epoch": 0.24497217910031222, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 28089 + }, + { + "epoch": 0.24498090038548082, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 28090 + }, + { + "epoch": 0.2449896216706494, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 28091 + }, + { + "epoch": 0.24499834295581796, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 28092 + }, + { + "epoch": 0.24500706424098656, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 28093 + }, + { + "epoch": 0.24501578552615513, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 28094 + }, + { + "epoch": 0.2450245068113237, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 28095 + }, + { + "epoch": 0.2450332280964923, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 28096 + }, + { + "epoch": 0.24504194938166088, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 28097 + }, + { + "epoch": 0.24505067066682945, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 28098 + }, + { + "epoch": 0.24505939195199805, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 28099 + }, + { + "epoch": 0.24506811323716662, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 28100 + }, + { + "epoch": 0.24507683452233522, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 28101 + }, + { + "epoch": 0.2450855558075038, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 28102 + }, + { + "epoch": 0.24509427709267237, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 28103 + }, + { + "epoch": 0.24510299837784097, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 28104 + }, + { + "epoch": 0.24511171966300954, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 28105 + }, + { + "epoch": 0.24512044094817811, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 28106 + }, + { + "epoch": 0.24512916223334671, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 28107 + }, + { + "epoch": 0.2451378835185153, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 28108 + }, + { + "epoch": 0.24514660480368386, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 28109 + }, + { + "epoch": 0.24515532608885246, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 28110 + }, + { + "epoch": 0.24516404737402103, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 28111 + }, + { + "epoch": 0.2451727686591896, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 28112 + }, + { + "epoch": 0.2451814899443582, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 28113 + }, + { + "epoch": 0.24519021122952678, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 28114 + }, + { + "epoch": 0.24519893251469538, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 28115 + }, + { + "epoch": 0.24520765379986395, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 28116 + }, + { + "epoch": 0.24521637508503252, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 28117 + }, + { + "epoch": 0.24522509637020112, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 28118 + }, + { + "epoch": 0.2452338176553697, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 28119 + }, + { + "epoch": 0.24524253894053827, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 28120 + }, + { + "epoch": 0.24525126022570687, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 28121 + }, + { + "epoch": 0.24525998151087544, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 28122 + }, + { + "epoch": 0.245268702796044, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 28123 + }, + { + "epoch": 0.2452774240812126, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 28124 + }, + { + "epoch": 0.2452861453663812, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 28125 + }, + { + "epoch": 0.24529486665154976, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 28126 + }, + { + "epoch": 0.24530358793671836, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 28127 + }, + { + "epoch": 0.24531230922188693, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 28128 + }, + { + "epoch": 0.24532103050705553, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 28129 + }, + { + "epoch": 0.2453297517922241, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 28130 + }, + { + "epoch": 0.24533847307739268, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 28131 + }, + { + "epoch": 0.24534719436256128, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 28132 + }, + { + "epoch": 0.24535591564772985, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 28133 + }, + { + "epoch": 0.24536463693289842, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 28134 + }, + { + "epoch": 0.24537335821806702, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 28135 + }, + { + "epoch": 0.2453820795032356, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 28136 + }, + { + "epoch": 0.24539080078840417, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 28137 + }, + { + "epoch": 0.24539952207357277, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 28138 + }, + { + "epoch": 0.24540824335874134, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 28139 + }, + { + "epoch": 0.2454169646439099, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 28140 + }, + { + "epoch": 0.2454256859290785, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 28141 + }, + { + "epoch": 0.24543440721424709, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 28142 + }, + { + "epoch": 0.24544312849941569, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 28143 + }, + { + "epoch": 0.24545184978458426, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 28144 + }, + { + "epoch": 0.24546057106975283, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 28145 + }, + { + "epoch": 0.24546929235492143, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 28146 + }, + { + "epoch": 0.24547801364009, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 28147 + }, + { + "epoch": 0.24548673492525858, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 28148 + }, + { + "epoch": 0.24549545621042718, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 28149 + }, + { + "epoch": 0.24550417749559575, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 28150 + }, + { + "epoch": 0.24551289878076432, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 28151 + }, + { + "epoch": 0.24552162006593292, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 28152 + }, + { + "epoch": 0.2455303413511015, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 28153 + }, + { + "epoch": 0.2455390626362701, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 28154 + }, + { + "epoch": 0.24554778392143867, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 28155 + }, + { + "epoch": 0.24555650520660724, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 28156 + }, + { + "epoch": 0.24556522649177584, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 28157 + }, + { + "epoch": 0.2455739477769444, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 28158 + }, + { + "epoch": 0.24558266906211298, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 28159 + }, + { + "epoch": 0.24559139034728159, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 28160 + }, + { + "epoch": 0.24560011163245016, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 28161 + }, + { + "epoch": 0.24560883291761873, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 28162 + }, + { + "epoch": 0.24561755420278733, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 28163 + }, + { + "epoch": 0.2456262754879559, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 28164 + }, + { + "epoch": 0.24563499677312448, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 28165 + }, + { + "epoch": 0.24564371805829308, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 28166 + }, + { + "epoch": 0.24565243934346165, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 28167 + }, + { + "epoch": 0.24566116062863025, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 28168 + }, + { + "epoch": 0.24566988191379882, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 28169 + }, + { + "epoch": 0.2456786031989674, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 28170 + }, + { + "epoch": 0.245687324484136, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 28171 + }, + { + "epoch": 0.24569604576930457, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 28172 + }, + { + "epoch": 0.24570476705447314, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 28173 + }, + { + "epoch": 0.24571348833964174, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 28174 + }, + { + "epoch": 0.2457222096248103, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 28175 + }, + { + "epoch": 0.24573093090997888, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 28176 + }, + { + "epoch": 0.24573965219514748, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 28177 + }, + { + "epoch": 0.24574837348031606, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 28178 + }, + { + "epoch": 0.24575709476548463, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 28179 + }, + { + "epoch": 0.24576581605065323, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 28180 + }, + { + "epoch": 0.2457745373358218, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 28181 + }, + { + "epoch": 0.2457832586209904, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 28182 + }, + { + "epoch": 0.24579197990615897, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 28183 + }, + { + "epoch": 0.24580070119132755, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9873, + "step": 28184 + }, + { + "epoch": 0.24580942247649615, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 28185 + }, + { + "epoch": 0.24581814376166472, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 28186 + }, + { + "epoch": 0.2458268650468333, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 28187 + }, + { + "epoch": 0.2458355863320019, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 28188 + }, + { + "epoch": 0.24584430761717047, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 28189 + }, + { + "epoch": 0.24585302890233904, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 28190 + }, + { + "epoch": 0.24586175018750764, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 28191 + }, + { + "epoch": 0.2458704714726762, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 28192 + }, + { + "epoch": 0.24587919275784478, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 28193 + }, + { + "epoch": 0.24588791404301338, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 28194 + }, + { + "epoch": 0.24589663532818196, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 28195 + }, + { + "epoch": 0.24590535661335056, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 28196 + }, + { + "epoch": 0.24591407789851913, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 28197 + }, + { + "epoch": 0.2459227991836877, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 28198 + }, + { + "epoch": 0.2459315204688563, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 28199 + }, + { + "epoch": 0.24594024175402487, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 28200 + }, + { + "epoch": 0.24594896303919345, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 28201 + }, + { + "epoch": 0.24595768432436205, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 28202 + }, + { + "epoch": 0.24596640560953062, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 28203 + }, + { + "epoch": 0.2459751268946992, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 28204 + }, + { + "epoch": 0.2459838481798678, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 28205 + }, + { + "epoch": 0.24599256946503636, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 28206 + }, + { + "epoch": 0.24600129075020494, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 28207 + }, + { + "epoch": 0.24601001203537354, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 28208 + }, + { + "epoch": 0.2460187333205421, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 28209 + }, + { + "epoch": 0.2460274546057107, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 28210 + }, + { + "epoch": 0.24603617589087928, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 28211 + }, + { + "epoch": 0.24604489717604786, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 28212 + }, + { + "epoch": 0.24605361846121646, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 28213 + }, + { + "epoch": 0.24606233974638503, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 28214 + }, + { + "epoch": 0.2460710610315536, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 28215 + }, + { + "epoch": 0.2460797823167222, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 28216 + }, + { + "epoch": 0.24608850360189077, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 28217 + }, + { + "epoch": 0.24609722488705935, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 28218 + }, + { + "epoch": 0.24610594617222795, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 28219 + }, + { + "epoch": 0.24611466745739652, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 28220 + }, + { + "epoch": 0.2461233887425651, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 28221 + }, + { + "epoch": 0.2461321100277337, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 28222 + }, + { + "epoch": 0.24614083131290226, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9841, + "step": 28223 + }, + { + "epoch": 0.24614955259807086, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 28224 + }, + { + "epoch": 0.24615827388323944, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 28225 + }, + { + "epoch": 0.246166995168408, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 28226 + }, + { + "epoch": 0.2461757164535766, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 28227 + }, + { + "epoch": 0.24618443773874518, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 28228 + }, + { + "epoch": 0.24619315902391375, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 28229 + }, + { + "epoch": 0.24620188030908235, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 28230 + }, + { + "epoch": 0.24621060159425093, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 28231 + }, + { + "epoch": 0.2462193228794195, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 28232 + }, + { + "epoch": 0.2462280441645881, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 28233 + }, + { + "epoch": 0.24623676544975667, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 28234 + }, + { + "epoch": 0.24624548673492525, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 28235 + }, + { + "epoch": 0.24625420802009385, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 28236 + }, + { + "epoch": 0.24626292930526242, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 28237 + }, + { + "epoch": 0.24627165059043102, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 28238 + }, + { + "epoch": 0.2462803718755996, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 28239 + }, + { + "epoch": 0.24628909316076816, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 28240 + }, + { + "epoch": 0.24629781444593676, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 28241 + }, + { + "epoch": 0.24630653573110534, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 28242 + }, + { + "epoch": 0.2463152570162739, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 28243 + }, + { + "epoch": 0.2463239783014425, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 28244 + }, + { + "epoch": 0.24633269958661108, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 28245 + }, + { + "epoch": 0.24634142087177965, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 28246 + }, + { + "epoch": 0.24635014215694825, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 28247 + }, + { + "epoch": 0.24635886344211683, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 28248 + }, + { + "epoch": 0.2463675847272854, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 28249 + }, + { + "epoch": 0.246376306012454, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 28250 + }, + { + "epoch": 0.24638502729762257, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 28251 + }, + { + "epoch": 0.24639374858279117, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 28252 + }, + { + "epoch": 0.24640246986795974, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 28253 + }, + { + "epoch": 0.24641119115312832, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 28254 + }, + { + "epoch": 0.24641991243829692, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 28255 + }, + { + "epoch": 0.2464286337234655, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 28256 + }, + { + "epoch": 0.24643735500863406, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 28257 + }, + { + "epoch": 0.24644607629380266, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 28258 + }, + { + "epoch": 0.24645479757897124, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 28259 + }, + { + "epoch": 0.2464635188641398, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 28260 + }, + { + "epoch": 0.2464722401493084, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 28261 + }, + { + "epoch": 0.24648096143447698, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 28262 + }, + { + "epoch": 0.24648968271964555, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 28263 + }, + { + "epoch": 0.24649840400481415, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 28264 + }, + { + "epoch": 0.24650712528998273, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 28265 + }, + { + "epoch": 0.24651584657515133, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 28266 + }, + { + "epoch": 0.2465245678603199, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 28267 + }, + { + "epoch": 0.24653328914548847, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 28268 + }, + { + "epoch": 0.24654201043065707, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 28269 + }, + { + "epoch": 0.24655073171582564, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 28270 + }, + { + "epoch": 0.24655945300099422, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 28271 + }, + { + "epoch": 0.24656817428616282, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 28272 + }, + { + "epoch": 0.2465768955713314, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 28273 + }, + { + "epoch": 0.24658561685649996, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 28274 + }, + { + "epoch": 0.24659433814166856, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 28275 + }, + { + "epoch": 0.24660305942683713, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 28276 + }, + { + "epoch": 0.24661178071200573, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 28277 + }, + { + "epoch": 0.2466205019971743, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 28278 + }, + { + "epoch": 0.24662922328234288, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 28279 + }, + { + "epoch": 0.24663794456751148, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 28280 + }, + { + "epoch": 0.24664666585268005, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 28281 + }, + { + "epoch": 0.24665538713784863, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 28282 + }, + { + "epoch": 0.24666410842301723, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 28283 + }, + { + "epoch": 0.2466728297081858, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 28284 + }, + { + "epoch": 0.24668155099335437, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 28285 + }, + { + "epoch": 0.24669027227852297, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 28286 + }, + { + "epoch": 0.24669899356369154, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 28287 + }, + { + "epoch": 0.24670771484886012, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 28288 + }, + { + "epoch": 0.24671643613402872, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 28289 + }, + { + "epoch": 0.2467251574191973, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 28290 + }, + { + "epoch": 0.2467338787043659, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 28291 + }, + { + "epoch": 0.24674259998953446, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 28292 + }, + { + "epoch": 0.24675132127470303, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 28293 + }, + { + "epoch": 0.24676004255987163, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 28294 + }, + { + "epoch": 0.2467687638450402, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 28295 + }, + { + "epoch": 0.24677748513020878, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 28296 + }, + { + "epoch": 0.24678620641537738, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 28297 + }, + { + "epoch": 0.24679492770054595, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 28298 + }, + { + "epoch": 0.24680364898571452, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 28299 + }, + { + "epoch": 0.24681237027088312, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 28300 + }, + { + "epoch": 0.2468210915560517, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 28301 + }, + { + "epoch": 0.24682981284122027, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 28302 + }, + { + "epoch": 0.24683853412638887, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 28303 + }, + { + "epoch": 0.24684725541155744, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 28304 + }, + { + "epoch": 0.24685597669672604, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 28305 + }, + { + "epoch": 0.24686469798189462, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 28306 + }, + { + "epoch": 0.2468734192670632, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 28307 + }, + { + "epoch": 0.2468821405522318, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 28308 + }, + { + "epoch": 0.24689086183740036, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 28309 + }, + { + "epoch": 0.24689958312256893, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 28310 + }, + { + "epoch": 0.24690830440773753, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 28311 + }, + { + "epoch": 0.2469170256929061, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 28312 + }, + { + "epoch": 0.24692574697807468, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 28313 + }, + { + "epoch": 0.24693446826324328, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 28314 + }, + { + "epoch": 0.24694318954841185, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 28315 + }, + { + "epoch": 0.24695191083358042, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 28316 + }, + { + "epoch": 0.24696063211874902, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 28317 + }, + { + "epoch": 0.2469693534039176, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 28318 + }, + { + "epoch": 0.2469780746890862, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 28319 + }, + { + "epoch": 0.24698679597425477, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 28320 + }, + { + "epoch": 0.24699551725942334, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 28321 + }, + { + "epoch": 0.24700423854459194, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 28322 + }, + { + "epoch": 0.24701295982976051, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 28323 + }, + { + "epoch": 0.2470216811149291, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 28324 + }, + { + "epoch": 0.2470304024000977, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 28325 + }, + { + "epoch": 0.24703912368526626, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 28326 + }, + { + "epoch": 0.24704784497043483, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 28327 + }, + { + "epoch": 0.24705656625560343, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 28328 + }, + { + "epoch": 0.247065287540772, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 28329 + }, + { + "epoch": 0.24707400882594058, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 28330 + }, + { + "epoch": 0.24708273011110918, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 28331 + }, + { + "epoch": 0.24709145139627775, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 28332 + }, + { + "epoch": 0.24710017268144635, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 28333 + }, + { + "epoch": 0.24710889396661492, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 28334 + }, + { + "epoch": 0.2471176152517835, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 28335 + }, + { + "epoch": 0.2471263365369521, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 28336 + }, + { + "epoch": 0.24713505782212067, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 28337 + }, + { + "epoch": 0.24714377910728924, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 28338 + }, + { + "epoch": 0.24715250039245784, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 28339 + }, + { + "epoch": 0.2471612216776264, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 28340 + }, + { + "epoch": 0.247169942962795, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 28341 + }, + { + "epoch": 0.2471786642479636, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 28342 + }, + { + "epoch": 0.24718738553313216, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 28343 + }, + { + "epoch": 0.24719610681830073, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 28344 + }, + { + "epoch": 0.24720482810346933, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 28345 + }, + { + "epoch": 0.2472135493886379, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 28346 + }, + { + "epoch": 0.2472222706738065, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 28347 + }, + { + "epoch": 0.24723099195897508, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 28348 + }, + { + "epoch": 0.24723971324414365, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 28349 + }, + { + "epoch": 0.24724843452931225, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 28350 + }, + { + "epoch": 0.24725715581448082, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 28351 + }, + { + "epoch": 0.2472658770996494, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 28352 + }, + { + "epoch": 0.247274598384818, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 28353 + }, + { + "epoch": 0.24728331966998657, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 28354 + }, + { + "epoch": 0.24729204095515514, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 28355 + }, + { + "epoch": 0.24730076224032374, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 28356 + }, + { + "epoch": 0.2473094835254923, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 28357 + }, + { + "epoch": 0.24731820481066089, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 28358 + }, + { + "epoch": 0.24732692609582949, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 28359 + }, + { + "epoch": 0.24733564738099806, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 28360 + }, + { + "epoch": 0.24734436866616666, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 28361 + }, + { + "epoch": 0.24735308995133523, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 28362 + }, + { + "epoch": 0.2473618112365038, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 28363 + }, + { + "epoch": 0.2473705325216724, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 28364 + }, + { + "epoch": 0.24737925380684098, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 28365 + }, + { + "epoch": 0.24738797509200955, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 28366 + }, + { + "epoch": 0.24739669637717815, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 28367 + }, + { + "epoch": 0.24740541766234672, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 28368 + }, + { + "epoch": 0.2474141389475153, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 28369 + }, + { + "epoch": 0.2474228602326839, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 28370 + }, + { + "epoch": 0.24743158151785247, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 28371 + }, + { + "epoch": 0.24744030280302104, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 28372 + }, + { + "epoch": 0.24744902408818964, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 28373 + }, + { + "epoch": 0.2474577453733582, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 28374 + }, + { + "epoch": 0.2474664666585268, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 28375 + }, + { + "epoch": 0.24747518794369538, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 28376 + }, + { + "epoch": 0.24748390922886396, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 28377 + }, + { + "epoch": 0.24749263051403256, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 28378 + }, + { + "epoch": 0.24750135179920113, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 28379 + }, + { + "epoch": 0.2475100730843697, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 28380 + }, + { + "epoch": 0.2475187943695383, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 28381 + }, + { + "epoch": 0.24752751565470688, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 28382 + }, + { + "epoch": 0.24753623693987545, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 28383 + }, + { + "epoch": 0.24754495822504405, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 28384 + }, + { + "epoch": 0.24755367951021262, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 28385 + }, + { + "epoch": 0.24756240079538122, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 28386 + }, + { + "epoch": 0.2475711220805498, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 28387 + }, + { + "epoch": 0.24757984336571837, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 28388 + }, + { + "epoch": 0.24758856465088697, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 28389 + }, + { + "epoch": 0.24759728593605554, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 28390 + }, + { + "epoch": 0.2476060072212241, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 28391 + }, + { + "epoch": 0.2476147285063927, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 28392 + }, + { + "epoch": 0.24762344979156128, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 28393 + }, + { + "epoch": 0.24763217107672986, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 28394 + }, + { + "epoch": 0.24764089236189846, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 28395 + }, + { + "epoch": 0.24764961364706703, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 28396 + }, + { + "epoch": 0.2476583349322356, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 28397 + }, + { + "epoch": 0.2476670562174042, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 28398 + }, + { + "epoch": 0.24767577750257277, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 28399 + }, + { + "epoch": 0.24768449878774138, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 28400 + }, + { + "epoch": 0.24769322007290995, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 28401 + }, + { + "epoch": 0.24770194135807852, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 28402 + }, + { + "epoch": 0.24771066264324712, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 28403 + }, + { + "epoch": 0.2477193839284157, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 28404 + }, + { + "epoch": 0.24772810521358427, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 28405 + }, + { + "epoch": 0.24773682649875287, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 28406 + }, + { + "epoch": 0.24774554778392144, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 28407 + }, + { + "epoch": 0.24775426906909, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 28408 + }, + { + "epoch": 0.2477629903542586, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 28409 + }, + { + "epoch": 0.24777171163942718, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 28410 + }, + { + "epoch": 0.24778043292459576, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 28411 + }, + { + "epoch": 0.24778915420976436, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 28412 + }, + { + "epoch": 0.24779787549493293, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 28413 + }, + { + "epoch": 0.24780659678010153, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 28414 + }, + { + "epoch": 0.2478153180652701, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 28415 + }, + { + "epoch": 0.24782403935043867, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 28416 + }, + { + "epoch": 0.24783276063560727, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 28417 + }, + { + "epoch": 0.24784148192077585, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 28418 + }, + { + "epoch": 0.24785020320594442, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 28419 + }, + { + "epoch": 0.24785892449111302, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 28420 + }, + { + "epoch": 0.2478676457762816, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 28421 + }, + { + "epoch": 0.24787636706145016, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 28422 + }, + { + "epoch": 0.24788508834661876, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 28423 + }, + { + "epoch": 0.24789380963178734, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 28424 + }, + { + "epoch": 0.2479025309169559, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 28425 + }, + { + "epoch": 0.2479112522021245, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 28426 + }, + { + "epoch": 0.24791997348729308, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 28427 + }, + { + "epoch": 0.24792869477246168, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 28428 + }, + { + "epoch": 0.24793741605763026, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 28429 + }, + { + "epoch": 0.24794613734279883, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 28430 + }, + { + "epoch": 0.24795485862796743, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 28431 + }, + { + "epoch": 0.247963579913136, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 28432 + }, + { + "epoch": 0.24797230119830457, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 28433 + }, + { + "epoch": 0.24798102248347317, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 28434 + }, + { + "epoch": 0.24798974376864175, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 28435 + }, + { + "epoch": 0.24799846505381032, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 28436 + }, + { + "epoch": 0.24800718633897892, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 28437 + }, + { + "epoch": 0.2480159076241475, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 28438 + }, + { + "epoch": 0.24802462890931606, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 28439 + }, + { + "epoch": 0.24803335019448466, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 28440 + }, + { + "epoch": 0.24804207147965324, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 28441 + }, + { + "epoch": 0.24805079276482184, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 28442 + }, + { + "epoch": 0.2480595140499904, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 28443 + }, + { + "epoch": 0.24806823533515898, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 28444 + }, + { + "epoch": 0.24807695662032758, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 28445 + }, + { + "epoch": 0.24808567790549615, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 28446 + }, + { + "epoch": 0.24809439919066473, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 28447 + }, + { + "epoch": 0.24810312047583333, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 28448 + }, + { + "epoch": 0.2481118417610019, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9857, + "step": 28449 + }, + { + "epoch": 0.24812056304617047, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 28450 + }, + { + "epoch": 0.24812928433133907, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 28451 + }, + { + "epoch": 0.24813800561650765, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 28452 + }, + { + "epoch": 0.24814672690167622, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 28453 + }, + { + "epoch": 0.24815544818684482, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 28454 + }, + { + "epoch": 0.2481641694720134, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 28455 + }, + { + "epoch": 0.248172890757182, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 28456 + }, + { + "epoch": 0.24818161204235056, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 28457 + }, + { + "epoch": 0.24819033332751914, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 28458 + }, + { + "epoch": 0.24819905461268774, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 28459 + }, + { + "epoch": 0.2482077758978563, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 28460 + }, + { + "epoch": 0.24821649718302488, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 28461 + }, + { + "epoch": 0.24822521846819348, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 28462 + }, + { + "epoch": 0.24823393975336205, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 28463 + }, + { + "epoch": 0.24824266103853063, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 28464 + }, + { + "epoch": 0.24825138232369923, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 28465 + }, + { + "epoch": 0.2482601036088678, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 28466 + }, + { + "epoch": 0.24826882489403637, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 28467 + }, + { + "epoch": 0.24827754617920497, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 28468 + }, + { + "epoch": 0.24828626746437354, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 28469 + }, + { + "epoch": 0.24829498874954214, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 28470 + }, + { + "epoch": 0.24830371003471072, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 28471 + }, + { + "epoch": 0.2483124313198793, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 28472 + }, + { + "epoch": 0.2483211526050479, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 28473 + }, + { + "epoch": 0.24832987389021646, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 28474 + }, + { + "epoch": 0.24833859517538504, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 28475 + }, + { + "epoch": 0.24834731646055364, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 28476 + }, + { + "epoch": 0.2483560377457222, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 28477 + }, + { + "epoch": 0.24836475903089078, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 28478 + }, + { + "epoch": 0.24837348031605938, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 28479 + }, + { + "epoch": 0.24838220160122795, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 28480 + }, + { + "epoch": 0.24839092288639653, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 28481 + }, + { + "epoch": 0.24839964417156513, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 28482 + }, + { + "epoch": 0.2484083654567337, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 28483 + }, + { + "epoch": 0.2484170867419023, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 28484 + }, + { + "epoch": 0.24842580802707087, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 28485 + }, + { + "epoch": 0.24843452931223944, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 28486 + }, + { + "epoch": 0.24844325059740804, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 28487 + }, + { + "epoch": 0.24845197188257662, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 28488 + }, + { + "epoch": 0.2484606931677452, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 28489 + }, + { + "epoch": 0.2484694144529138, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 28490 + }, + { + "epoch": 0.24847813573808236, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 28491 + }, + { + "epoch": 0.24848685702325093, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 28492 + }, + { + "epoch": 0.24849557830841953, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 28493 + }, + { + "epoch": 0.2485042995935881, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 28494 + }, + { + "epoch": 0.2485130208787567, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 28495 + }, + { + "epoch": 0.24852174216392528, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 28496 + }, + { + "epoch": 0.24853046344909385, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 28497 + }, + { + "epoch": 0.24853918473426245, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 28498 + }, + { + "epoch": 0.24854790601943103, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 28499 + }, + { + "epoch": 0.2485566273045996, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 28500 + }, + { + "epoch": 0.2485653485897682, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 28501 + }, + { + "epoch": 0.24857406987493677, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 28502 + }, + { + "epoch": 0.24858279116010534, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 28503 + }, + { + "epoch": 0.24859151244527394, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 28504 + }, + { + "epoch": 0.24860023373044252, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 28505 + }, + { + "epoch": 0.2486089550156111, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 28506 + }, + { + "epoch": 0.2486176763007797, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 28507 + }, + { + "epoch": 0.24862639758594826, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 28508 + }, + { + "epoch": 0.24863511887111686, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 28509 + }, + { + "epoch": 0.24864384015628543, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 28510 + }, + { + "epoch": 0.248652561441454, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 28511 + }, + { + "epoch": 0.2486612827266226, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 28512 + }, + { + "epoch": 0.24867000401179118, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 28513 + }, + { + "epoch": 0.24867872529695975, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 28514 + }, + { + "epoch": 0.24868744658212835, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 28515 + }, + { + "epoch": 0.24869616786729692, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 28516 + }, + { + "epoch": 0.2487048891524655, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 28517 + }, + { + "epoch": 0.2487136104376341, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 28518 + }, + { + "epoch": 0.24872233172280267, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 28519 + }, + { + "epoch": 0.24873105300797124, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 28520 + }, + { + "epoch": 0.24873977429313984, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 28521 + }, + { + "epoch": 0.24874849557830842, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 28522 + }, + { + "epoch": 0.24875721686347702, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 28523 + }, + { + "epoch": 0.2487659381486456, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 28524 + }, + { + "epoch": 0.24877465943381416, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 28525 + }, + { + "epoch": 0.24878338071898276, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 28526 + }, + { + "epoch": 0.24879210200415133, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 28527 + }, + { + "epoch": 0.2488008232893199, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 28528 + }, + { + "epoch": 0.2488095445744885, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 28529 + }, + { + "epoch": 0.24881826585965708, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 28530 + }, + { + "epoch": 0.24882698714482565, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 28531 + }, + { + "epoch": 0.24883570842999425, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 28532 + }, + { + "epoch": 0.24884442971516282, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 28533 + }, + { + "epoch": 0.2488531510003314, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 28534 + }, + { + "epoch": 0.2488618722855, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 28535 + }, + { + "epoch": 0.24887059357066857, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 28536 + }, + { + "epoch": 0.24887931485583717, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 28537 + }, + { + "epoch": 0.24888803614100574, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 28538 + }, + { + "epoch": 0.24889675742617431, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 28539 + }, + { + "epoch": 0.24890547871134291, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 28540 + }, + { + "epoch": 0.2489141999965115, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 28541 + }, + { + "epoch": 0.24892292128168006, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 28542 + }, + { + "epoch": 0.24893164256684866, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 28543 + }, + { + "epoch": 0.24894036385201723, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 28544 + }, + { + "epoch": 0.2489490851371858, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 28545 + }, + { + "epoch": 0.2489578064223544, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 28546 + }, + { + "epoch": 0.24896652770752298, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 28547 + }, + { + "epoch": 0.24897524899269155, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 28548 + }, + { + "epoch": 0.24898397027786015, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 28549 + }, + { + "epoch": 0.24899269156302872, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 28550 + }, + { + "epoch": 0.24900141284819732, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 28551 + }, + { + "epoch": 0.2490101341333659, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 28552 + }, + { + "epoch": 0.24901885541853447, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 28553 + }, + { + "epoch": 0.24902757670370307, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 28554 + }, + { + "epoch": 0.24903629798887164, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 28555 + }, + { + "epoch": 0.2490450192740402, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 28556 + }, + { + "epoch": 0.2490537405592088, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 28557 + }, + { + "epoch": 0.2490624618443774, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 28558 + }, + { + "epoch": 0.24907118312954596, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 28559 + }, + { + "epoch": 0.24907990441471456, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 28560 + }, + { + "epoch": 0.24908862569988313, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 28561 + }, + { + "epoch": 0.2490973469850517, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 28562 + }, + { + "epoch": 0.2491060682702203, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 28563 + }, + { + "epoch": 0.24911478955538888, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 28564 + }, + { + "epoch": 0.24912351084055748, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 28565 + }, + { + "epoch": 0.24913223212572605, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 28566 + }, + { + "epoch": 0.24914095341089462, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 28567 + }, + { + "epoch": 0.24914967469606322, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 28568 + }, + { + "epoch": 0.2491583959812318, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 28569 + }, + { + "epoch": 0.24916711726640037, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 28570 + }, + { + "epoch": 0.24917583855156897, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 28571 + }, + { + "epoch": 0.24918455983673754, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 28572 + }, + { + "epoch": 0.2491932811219061, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 28573 + }, + { + "epoch": 0.2492020024070747, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 28574 + }, + { + "epoch": 0.24921072369224329, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 28575 + }, + { + "epoch": 0.24921944497741186, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 28576 + }, + { + "epoch": 0.24922816626258046, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 28577 + }, + { + "epoch": 0.24923688754774903, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 28578 + }, + { + "epoch": 0.24924560883291763, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 28579 + }, + { + "epoch": 0.2492543301180862, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 28580 + }, + { + "epoch": 0.24926305140325478, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 28581 + }, + { + "epoch": 0.24927177268842338, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 28582 + }, + { + "epoch": 0.24928049397359195, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 28583 + }, + { + "epoch": 0.24928921525876052, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 28584 + }, + { + "epoch": 0.24929793654392912, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 28585 + }, + { + "epoch": 0.2493066578290977, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 28586 + }, + { + "epoch": 0.24931537911426627, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 28587 + }, + { + "epoch": 0.24932410039943487, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 28588 + }, + { + "epoch": 0.24933282168460344, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 28589 + }, + { + "epoch": 0.249341542969772, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 28590 + }, + { + "epoch": 0.2493502642549406, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 28591 + }, + { + "epoch": 0.24935898554010918, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 28592 + }, + { + "epoch": 0.24936770682527779, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 28593 + }, + { + "epoch": 0.24937642811044636, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 28594 + }, + { + "epoch": 0.24938514939561493, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 28595 + }, + { + "epoch": 0.24939387068078353, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 28596 + }, + { + "epoch": 0.2494025919659521, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 28597 + }, + { + "epoch": 0.24941131325112068, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 28598 + }, + { + "epoch": 0.24942003453628928, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 28599 + }, + { + "epoch": 0.24942875582145785, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 28600 + }, + { + "epoch": 0.24943747710662642, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 28601 + }, + { + "epoch": 0.24944619839179502, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 28602 + }, + { + "epoch": 0.2494549196769636, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 28603 + }, + { + "epoch": 0.24946364096213217, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 28604 + }, + { + "epoch": 0.24947236224730077, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 28605 + }, + { + "epoch": 0.24948108353246934, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 28606 + }, + { + "epoch": 0.24948980481763794, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 28607 + }, + { + "epoch": 0.2494985261028065, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 28608 + }, + { + "epoch": 0.24950724738797508, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 28609 + }, + { + "epoch": 0.24951596867314368, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 28610 + }, + { + "epoch": 0.24952468995831226, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 28611 + }, + { + "epoch": 0.24953341124348083, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 28612 + }, + { + "epoch": 0.24954213252864943, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 28613 + }, + { + "epoch": 0.249550853813818, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 28614 + }, + { + "epoch": 0.24955957509898657, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 28615 + }, + { + "epoch": 0.24956829638415517, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 28616 + }, + { + "epoch": 0.24957701766932375, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 28617 + }, + { + "epoch": 0.24958573895449235, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 28618 + }, + { + "epoch": 0.24959446023966092, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 28619 + }, + { + "epoch": 0.2496031815248295, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 28620 + }, + { + "epoch": 0.2496119028099981, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 28621 + }, + { + "epoch": 0.24962062409516667, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 28622 + }, + { + "epoch": 0.24962934538033524, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 28623 + }, + { + "epoch": 0.24963806666550384, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 28624 + }, + { + "epoch": 0.2496467879506724, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 28625 + }, + { + "epoch": 0.24965550923584098, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 28626 + }, + { + "epoch": 0.24966423052100958, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 28627 + }, + { + "epoch": 0.24967295180617816, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 28628 + }, + { + "epoch": 0.24968167309134673, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 28629 + }, + { + "epoch": 0.24969039437651533, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 28630 + }, + { + "epoch": 0.2496991156616839, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 28631 + }, + { + "epoch": 0.2497078369468525, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 28632 + }, + { + "epoch": 0.24971655823202107, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 28633 + }, + { + "epoch": 0.24972527951718965, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 28634 + }, + { + "epoch": 0.24973400080235825, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 28635 + }, + { + "epoch": 0.24974272208752682, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 28636 + }, + { + "epoch": 0.2497514433726954, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 28637 + }, + { + "epoch": 0.249760164657864, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 28638 + }, + { + "epoch": 0.24976888594303256, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 28639 + }, + { + "epoch": 0.24977760722820114, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 28640 + }, + { + "epoch": 0.24978632851336974, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 28641 + }, + { + "epoch": 0.2497950497985383, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 28642 + }, + { + "epoch": 0.24980377108370688, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 28643 + }, + { + "epoch": 0.24981249236887548, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 28644 + }, + { + "epoch": 0.24982121365404406, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 28645 + }, + { + "epoch": 0.24982993493921266, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 28646 + }, + { + "epoch": 0.24983865622438123, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 28647 + }, + { + "epoch": 0.2498473775095498, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 28648 + }, + { + "epoch": 0.2498560987947184, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 28649 + }, + { + "epoch": 0.24986482007988697, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 28650 + }, + { + "epoch": 0.24987354136505555, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 28651 + }, + { + "epoch": 0.24988226265022415, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 28652 + }, + { + "epoch": 0.24989098393539272, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 28653 + }, + { + "epoch": 0.2498997052205613, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 28654 + }, + { + "epoch": 0.2499084265057299, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 28655 + }, + { + "epoch": 0.24991714779089846, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 28656 + }, + { + "epoch": 0.24992586907606704, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 28657 + }, + { + "epoch": 0.24993459036123564, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 28658 + }, + { + "epoch": 0.2499433116464042, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 28659 + }, + { + "epoch": 0.2499520329315728, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 28660 + }, + { + "epoch": 0.24996075421674138, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 28661 + }, + { + "epoch": 0.24996947550190995, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 28662 + }, + { + "epoch": 0.24997819678707855, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 28663 + }, + { + "epoch": 0.24998691807224713, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 28664 + }, + { + "epoch": 0.2499956393574157, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 28665 + }, + { + "epoch": 0.2500043606425843, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 28666 + }, + { + "epoch": 0.2500130819277529, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 28667 + }, + { + "epoch": 0.25002180321292145, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 28668 + }, + { + "epoch": 0.25003052449809005, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 28669 + }, + { + "epoch": 0.25003924578325865, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 28670 + }, + { + "epoch": 0.2500479670684272, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 28671 + }, + { + "epoch": 0.2500566883535958, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 28672 + }, + { + "epoch": 0.2500654096387644, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 28673 + }, + { + "epoch": 0.25007413092393294, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 28674 + }, + { + "epoch": 0.25008285220910154, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 28675 + }, + { + "epoch": 0.25009157349427014, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 28676 + }, + { + "epoch": 0.2501002947794387, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 28677 + }, + { + "epoch": 0.2501090160646073, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 28678 + }, + { + "epoch": 0.2501177373497759, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 28679 + }, + { + "epoch": 0.2501264586349444, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 28680 + }, + { + "epoch": 0.250135179920113, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 28681 + }, + { + "epoch": 0.2501439012052816, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 28682 + }, + { + "epoch": 0.25015262249045017, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 28683 + }, + { + "epoch": 0.25016134377561877, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 28684 + }, + { + "epoch": 0.25017006506078737, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 28685 + }, + { + "epoch": 0.2501787863459559, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 28686 + }, + { + "epoch": 0.2501875076311245, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 28687 + }, + { + "epoch": 0.2501962289162931, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 28688 + }, + { + "epoch": 0.25020495020146166, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 28689 + }, + { + "epoch": 0.25021367148663026, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 28690 + }, + { + "epoch": 0.25022239277179886, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 28691 + }, + { + "epoch": 0.2502311140569674, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 28692 + }, + { + "epoch": 0.250239835342136, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 28693 + }, + { + "epoch": 0.2502485566273046, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 28694 + }, + { + "epoch": 0.2502572779124732, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 28695 + }, + { + "epoch": 0.25026599919764175, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 28696 + }, + { + "epoch": 0.25027472048281035, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 28697 + }, + { + "epoch": 0.25028344176797895, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 28698 + }, + { + "epoch": 0.2502921630531475, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 28699 + }, + { + "epoch": 0.2503008843383161, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 28700 + }, + { + "epoch": 0.2503096056234847, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 28701 + }, + { + "epoch": 0.25031832690865324, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 28702 + }, + { + "epoch": 0.25032704819382184, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 28703 + }, + { + "epoch": 0.25033576947899044, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 28704 + }, + { + "epoch": 0.250344490764159, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 28705 + }, + { + "epoch": 0.2503532120493276, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 28706 + }, + { + "epoch": 0.2503619333344962, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 28707 + }, + { + "epoch": 0.25037065461966473, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 28708 + }, + { + "epoch": 0.25037937590483333, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 28709 + }, + { + "epoch": 0.25038809719000193, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 28710 + }, + { + "epoch": 0.2503968184751705, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 28711 + }, + { + "epoch": 0.2504055397603391, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 28712 + }, + { + "epoch": 0.2504142610455077, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 28713 + }, + { + "epoch": 0.2504229823306762, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 28714 + }, + { + "epoch": 0.2504317036158448, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 28715 + }, + { + "epoch": 0.2504404249010134, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 28716 + }, + { + "epoch": 0.25044914618618197, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 28717 + }, + { + "epoch": 0.25045786747135057, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 28718 + }, + { + "epoch": 0.25046658875651917, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 28719 + }, + { + "epoch": 0.2504753100416877, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 28720 + }, + { + "epoch": 0.2504840313268563, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 28721 + }, + { + "epoch": 0.2504927526120249, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 28722 + }, + { + "epoch": 0.2505014738971935, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 28723 + }, + { + "epoch": 0.25051019518236206, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 28724 + }, + { + "epoch": 0.25051891646753066, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 28725 + }, + { + "epoch": 0.25052763775269926, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 28726 + }, + { + "epoch": 0.2505363590378678, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 28727 + }, + { + "epoch": 0.2505450803230364, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 28728 + }, + { + "epoch": 0.250553801608205, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 28729 + }, + { + "epoch": 0.25056252289337355, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 28730 + }, + { + "epoch": 0.25057124417854215, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 28731 + }, + { + "epoch": 0.25057996546371075, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 28732 + }, + { + "epoch": 0.2505886867488793, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 28733 + }, + { + "epoch": 0.2505974080340479, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 28734 + }, + { + "epoch": 0.2506061293192165, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 28735 + }, + { + "epoch": 0.25061485060438504, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 28736 + }, + { + "epoch": 0.25062357188955364, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 28737 + }, + { + "epoch": 0.25063229317472224, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 28738 + }, + { + "epoch": 0.2506410144598908, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 28739 + }, + { + "epoch": 0.2506497357450594, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 28740 + }, + { + "epoch": 0.250658457030228, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 28741 + }, + { + "epoch": 0.25066717831539653, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 28742 + }, + { + "epoch": 0.25067589960056513, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 28743 + }, + { + "epoch": 0.25068462088573373, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 28744 + }, + { + "epoch": 0.2506933421709023, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 28745 + }, + { + "epoch": 0.2507020634560709, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 28746 + }, + { + "epoch": 0.2507107847412395, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 28747 + }, + { + "epoch": 0.2507195060264081, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 28748 + }, + { + "epoch": 0.2507282273115766, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 28749 + }, + { + "epoch": 0.2507369485967452, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 28750 + }, + { + "epoch": 0.2507456698819138, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 28751 + }, + { + "epoch": 0.25075439116708237, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 28752 + }, + { + "epoch": 0.25076311245225097, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 28753 + }, + { + "epoch": 0.25077183373741957, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 28754 + }, + { + "epoch": 0.2507805550225881, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 28755 + }, + { + "epoch": 0.2507892763077567, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 28756 + }, + { + "epoch": 0.2507979975929253, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 28757 + }, + { + "epoch": 0.25080671887809386, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 28758 + }, + { + "epoch": 0.25081544016326246, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 28759 + }, + { + "epoch": 0.25082416144843106, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 28760 + }, + { + "epoch": 0.2508328827335996, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 28761 + }, + { + "epoch": 0.2508416040187682, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 28762 + }, + { + "epoch": 0.2508503253039368, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 28763 + }, + { + "epoch": 0.25085904658910535, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 28764 + }, + { + "epoch": 0.25086776787427395, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 28765 + }, + { + "epoch": 0.25087648915944255, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 28766 + }, + { + "epoch": 0.2508852104446111, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 28767 + }, + { + "epoch": 0.2508939317297797, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 28768 + }, + { + "epoch": 0.2509026530149483, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 28769 + }, + { + "epoch": 0.25091137430011684, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 28770 + }, + { + "epoch": 0.25092009558528544, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 28771 + }, + { + "epoch": 0.25092881687045404, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 28772 + }, + { + "epoch": 0.2509375381556226, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 28773 + }, + { + "epoch": 0.2509462594407912, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 28774 + }, + { + "epoch": 0.2509549807259598, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 28775 + }, + { + "epoch": 0.2509637020111284, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 28776 + }, + { + "epoch": 0.25097242329629693, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 28777 + }, + { + "epoch": 0.25098114458146553, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 28778 + }, + { + "epoch": 0.25098986586663413, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 28779 + }, + { + "epoch": 0.2509985871518027, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 28780 + }, + { + "epoch": 0.2510073084369713, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 28781 + }, + { + "epoch": 0.2510160297221399, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 28782 + }, + { + "epoch": 0.2510247510073084, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 28783 + }, + { + "epoch": 0.251033472292477, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 28784 + }, + { + "epoch": 0.2510421935776456, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 28785 + }, + { + "epoch": 0.25105091486281417, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 28786 + }, + { + "epoch": 0.25105963614798277, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 28787 + }, + { + "epoch": 0.25106835743315137, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 28788 + }, + { + "epoch": 0.2510770787183199, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 28789 + }, + { + "epoch": 0.2510858000034885, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 28790 + }, + { + "epoch": 0.2510945212886571, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 28791 + }, + { + "epoch": 0.25110324257382566, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 28792 + }, + { + "epoch": 0.25111196385899426, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 28793 + }, + { + "epoch": 0.25112068514416286, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 28794 + }, + { + "epoch": 0.2511294064293314, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 28795 + }, + { + "epoch": 0.2511381277145, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 28796 + }, + { + "epoch": 0.2511468489996686, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 28797 + }, + { + "epoch": 0.25115557028483715, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 28798 + }, + { + "epoch": 0.25116429157000575, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 28799 + }, + { + "epoch": 0.25117301285517435, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 28800 + }, + { + "epoch": 0.2511817341403429, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 28801 + }, + { + "epoch": 0.2511904554255115, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 28802 + }, + { + "epoch": 0.2511991767106801, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 28803 + }, + { + "epoch": 0.2512078979958487, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 28804 + }, + { + "epoch": 0.25121661928101724, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 28805 + }, + { + "epoch": 0.25122534056618584, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 28806 + }, + { + "epoch": 0.25123406185135444, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 28807 + }, + { + "epoch": 0.251242783136523, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 28808 + }, + { + "epoch": 0.2512515044216916, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 28809 + }, + { + "epoch": 0.2512602257068602, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 28810 + }, + { + "epoch": 0.25126894699202873, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 28811 + }, + { + "epoch": 0.25127766827719733, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 28812 + }, + { + "epoch": 0.25128638956236593, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 28813 + }, + { + "epoch": 0.2512951108475345, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 28814 + }, + { + "epoch": 0.2513038321327031, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 28815 + }, + { + "epoch": 0.2513125534178717, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 28816 + }, + { + "epoch": 0.2513212747030402, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 28817 + }, + { + "epoch": 0.2513299959882088, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 28818 + }, + { + "epoch": 0.2513387172733774, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 28819 + }, + { + "epoch": 0.25134743855854597, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 28820 + }, + { + "epoch": 0.25135615984371457, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 28821 + }, + { + "epoch": 0.25136488112888317, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 28822 + }, + { + "epoch": 0.2513736024140517, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 28823 + }, + { + "epoch": 0.2513823236992203, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 28824 + }, + { + "epoch": 0.2513910449843889, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 28825 + }, + { + "epoch": 0.25139976626955746, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 28826 + }, + { + "epoch": 0.25140848755472606, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 28827 + }, + { + "epoch": 0.25141720883989466, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 28828 + }, + { + "epoch": 0.2514259301250632, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 28829 + }, + { + "epoch": 0.2514346514102318, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 28830 + }, + { + "epoch": 0.2514433726954004, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 28831 + }, + { + "epoch": 0.251452093980569, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 28832 + }, + { + "epoch": 0.25146081526573755, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 28833 + }, + { + "epoch": 0.25146953655090615, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 28834 + }, + { + "epoch": 0.25147825783607475, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 28835 + }, + { + "epoch": 0.2514869791212433, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 28836 + }, + { + "epoch": 0.2514957004064119, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 28837 + }, + { + "epoch": 0.2515044216915805, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 28838 + }, + { + "epoch": 0.25151314297674904, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 28839 + }, + { + "epoch": 0.25152186426191764, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 28840 + }, + { + "epoch": 0.25153058554708624, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 28841 + }, + { + "epoch": 0.2515393068322548, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 28842 + }, + { + "epoch": 0.2515480281174234, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 28843 + }, + { + "epoch": 0.251556749402592, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 28844 + }, + { + "epoch": 0.25156547068776053, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 28845 + }, + { + "epoch": 0.25157419197292913, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 28846 + }, + { + "epoch": 0.25158291325809773, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 28847 + }, + { + "epoch": 0.2515916345432663, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 28848 + }, + { + "epoch": 0.2516003558284349, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 28849 + }, + { + "epoch": 0.2516090771136035, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 28850 + }, + { + "epoch": 0.251617798398772, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 28851 + }, + { + "epoch": 0.2516265196839406, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 28852 + }, + { + "epoch": 0.2516352409691092, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 28853 + }, + { + "epoch": 0.25164396225427776, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 28854 + }, + { + "epoch": 0.25165268353944636, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 28855 + }, + { + "epoch": 0.25166140482461496, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 28856 + }, + { + "epoch": 0.25167012610978357, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 28857 + }, + { + "epoch": 0.2516788473949521, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 28858 + }, + { + "epoch": 0.2516875686801207, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 28859 + }, + { + "epoch": 0.2516962899652893, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 28860 + }, + { + "epoch": 0.25170501125045786, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 28861 + }, + { + "epoch": 0.25171373253562646, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 28862 + }, + { + "epoch": 0.25172245382079506, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 28863 + }, + { + "epoch": 0.2517311751059636, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 28864 + }, + { + "epoch": 0.2517398963911322, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 28865 + }, + { + "epoch": 0.2517486176763008, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 28866 + }, + { + "epoch": 0.25175733896146935, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 28867 + }, + { + "epoch": 0.25176606024663795, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 28868 + }, + { + "epoch": 0.25177478153180655, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 28869 + }, + { + "epoch": 0.2517835028169751, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 28870 + }, + { + "epoch": 0.2517922241021437, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 28871 + }, + { + "epoch": 0.2518009453873123, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 28872 + }, + { + "epoch": 0.25180966667248084, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 28873 + }, + { + "epoch": 0.25181838795764944, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 28874 + }, + { + "epoch": 0.25182710924281804, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 28875 + }, + { + "epoch": 0.2518358305279866, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 28876 + }, + { + "epoch": 0.2518445518131552, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 28877 + }, + { + "epoch": 0.2518532730983238, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 28878 + }, + { + "epoch": 0.2518619943834923, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 28879 + }, + { + "epoch": 0.2518707156686609, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 28880 + }, + { + "epoch": 0.2518794369538295, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 28881 + }, + { + "epoch": 0.25188815823899807, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 28882 + }, + { + "epoch": 0.2518968795241667, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 28883 + }, + { + "epoch": 0.2519056008093353, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 28884 + }, + { + "epoch": 0.2519143220945039, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 28885 + }, + { + "epoch": 0.2519230433796724, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 28886 + }, + { + "epoch": 0.251931764664841, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 28887 + }, + { + "epoch": 0.2519404859500096, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 28888 + }, + { + "epoch": 0.25194920723517816, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 28889 + }, + { + "epoch": 0.25195792852034676, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 28890 + }, + { + "epoch": 0.25196664980551536, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 28891 + }, + { + "epoch": 0.2519753710906839, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 28892 + }, + { + "epoch": 0.2519840923758525, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 28893 + }, + { + "epoch": 0.2519928136610211, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 28894 + }, + { + "epoch": 0.25200153494618965, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 28895 + }, + { + "epoch": 0.25201025623135825, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 28896 + }, + { + "epoch": 0.25201897751652685, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 28897 + }, + { + "epoch": 0.2520276988016954, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 28898 + }, + { + "epoch": 0.252036420086864, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 28899 + }, + { + "epoch": 0.2520451413720326, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 28900 + }, + { + "epoch": 0.25205386265720114, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 28901 + }, + { + "epoch": 0.25206258394236974, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 28902 + }, + { + "epoch": 0.25207130522753834, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 28903 + }, + { + "epoch": 0.2520800265127069, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 28904 + }, + { + "epoch": 0.2520887477978755, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 28905 + }, + { + "epoch": 0.2520974690830441, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 28906 + }, + { + "epoch": 0.25210619036821263, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 28907 + }, + { + "epoch": 0.25211491165338124, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 28908 + }, + { + "epoch": 0.25212363293854984, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 28909 + }, + { + "epoch": 0.2521323542237184, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 28910 + }, + { + "epoch": 0.252141075508887, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 28911 + }, + { + "epoch": 0.2521497967940556, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 28912 + }, + { + "epoch": 0.2521585180792242, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 28913 + }, + { + "epoch": 0.2521672393643927, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 28914 + }, + { + "epoch": 0.2521759606495613, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 28915 + }, + { + "epoch": 0.2521846819347299, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 28916 + }, + { + "epoch": 0.25219340321989847, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 28917 + }, + { + "epoch": 0.25220212450506707, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 28918 + }, + { + "epoch": 0.25221084579023567, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 28919 + }, + { + "epoch": 0.2522195670754042, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 28920 + }, + { + "epoch": 0.2522282883605728, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 28921 + }, + { + "epoch": 0.2522370096457414, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 28922 + }, + { + "epoch": 0.25224573093090996, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 28923 + }, + { + "epoch": 0.25225445221607856, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 28924 + }, + { + "epoch": 0.25226317350124716, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 28925 + }, + { + "epoch": 0.2522718947864157, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 28926 + }, + { + "epoch": 0.2522806160715843, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 28927 + }, + { + "epoch": 0.2522893373567529, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 28928 + }, + { + "epoch": 0.25229805864192145, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 28929 + }, + { + "epoch": 0.25230677992709005, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 28930 + }, + { + "epoch": 0.25231550121225865, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 28931 + }, + { + "epoch": 0.2523242224974272, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 28932 + }, + { + "epoch": 0.2523329437825958, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 28933 + }, + { + "epoch": 0.2523416650677644, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 28934 + }, + { + "epoch": 0.25235038635293294, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 28935 + }, + { + "epoch": 0.25235910763810154, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 28936 + }, + { + "epoch": 0.25236782892327014, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 28937 + }, + { + "epoch": 0.2523765502084387, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 28938 + }, + { + "epoch": 0.2523852714936073, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 28939 + }, + { + "epoch": 0.2523939927787759, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 28940 + }, + { + "epoch": 0.2524027140639445, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 28941 + }, + { + "epoch": 0.25241143534911303, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 28942 + }, + { + "epoch": 0.25242015663428163, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 28943 + }, + { + "epoch": 0.25242887791945023, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 28944 + }, + { + "epoch": 0.2524375992046188, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 28945 + }, + { + "epoch": 0.2524463204897874, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 28946 + }, + { + "epoch": 0.252455041774956, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 28947 + }, + { + "epoch": 0.2524637630601245, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 28948 + }, + { + "epoch": 0.2524724843452931, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 28949 + }, + { + "epoch": 0.2524812056304617, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 28950 + }, + { + "epoch": 0.25248992691563027, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 28951 + }, + { + "epoch": 0.25249864820079887, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 28952 + }, + { + "epoch": 0.25250736948596747, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 28953 + }, + { + "epoch": 0.252516090771136, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 28954 + }, + { + "epoch": 0.2525248120563046, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 28955 + }, + { + "epoch": 0.2525335333414732, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 28956 + }, + { + "epoch": 0.25254225462664176, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 28957 + }, + { + "epoch": 0.25255097591181036, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 28958 + }, + { + "epoch": 0.25255969719697896, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 28959 + }, + { + "epoch": 0.2525684184821475, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 28960 + }, + { + "epoch": 0.2525771397673161, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 28961 + }, + { + "epoch": 0.2525858610524847, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 28962 + }, + { + "epoch": 0.25259458233765325, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 28963 + }, + { + "epoch": 0.25260330362282185, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 28964 + }, + { + "epoch": 0.25261202490799045, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 28965 + }, + { + "epoch": 0.25262074619315905, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 28966 + }, + { + "epoch": 0.2526294674783276, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 28967 + }, + { + "epoch": 0.2526381887634962, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 28968 + }, + { + "epoch": 0.2526469100486648, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 28969 + }, + { + "epoch": 0.25265563133383334, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 28970 + }, + { + "epoch": 0.25266435261900194, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 28971 + }, + { + "epoch": 0.25267307390417054, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 28972 + }, + { + "epoch": 0.2526817951893391, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 28973 + }, + { + "epoch": 0.2526905164745077, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 28974 + }, + { + "epoch": 0.2526992377596763, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 28975 + }, + { + "epoch": 0.25270795904484483, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 28976 + }, + { + "epoch": 0.25271668033001343, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 28977 + }, + { + "epoch": 0.25272540161518203, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 28978 + }, + { + "epoch": 0.2527341229003506, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 28979 + }, + { + "epoch": 0.2527428441855192, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 28980 + }, + { + "epoch": 0.2527515654706878, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 28981 + }, + { + "epoch": 0.2527602867558563, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 28982 + }, + { + "epoch": 0.2527690080410249, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 28983 + }, + { + "epoch": 0.2527777293261935, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 28984 + }, + { + "epoch": 0.25278645061136207, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 28985 + }, + { + "epoch": 0.25279517189653067, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 28986 + }, + { + "epoch": 0.25280389318169927, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 28987 + }, + { + "epoch": 0.2528126144668678, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 28988 + }, + { + "epoch": 0.2528213357520364, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 28989 + }, + { + "epoch": 0.252830057037205, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 28990 + }, + { + "epoch": 0.25283877832237356, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 28991 + }, + { + "epoch": 0.25284749960754216, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 28992 + }, + { + "epoch": 0.25285622089271076, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 28993 + }, + { + "epoch": 0.25286494217787936, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 28994 + }, + { + "epoch": 0.2528736634630479, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 28995 + }, + { + "epoch": 0.2528823847482165, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 28996 + }, + { + "epoch": 0.2528911060333851, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 28997 + }, + { + "epoch": 0.25289982731855365, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 28998 + }, + { + "epoch": 0.25290854860372225, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 28999 + }, + { + "epoch": 0.25291726988889085, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 29000 + }, + { + "epoch": 0.2529259911740594, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 29001 + }, + { + "epoch": 0.252934712459228, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 29002 + }, + { + "epoch": 0.2529434337443966, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 29003 + }, + { + "epoch": 0.25295215502956514, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 29004 + }, + { + "epoch": 0.25296087631473374, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 29005 + }, + { + "epoch": 0.25296959759990234, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 29006 + }, + { + "epoch": 0.2529783188850709, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 29007 + }, + { + "epoch": 0.2529870401702395, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 29008 + }, + { + "epoch": 0.2529957614554081, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 29009 + }, + { + "epoch": 0.25300448274057663, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 29010 + }, + { + "epoch": 0.25301320402574523, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 29011 + }, + { + "epoch": 0.25302192531091383, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 29012 + }, + { + "epoch": 0.2530306465960824, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 29013 + }, + { + "epoch": 0.253039367881251, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 29014 + }, + { + "epoch": 0.2530480891664196, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 29015 + }, + { + "epoch": 0.2530568104515881, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 29016 + }, + { + "epoch": 0.2530655317367567, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 29017 + }, + { + "epoch": 0.2530742530219253, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 29018 + }, + { + "epoch": 0.25308297430709387, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 29019 + }, + { + "epoch": 0.25309169559226247, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 29020 + }, + { + "epoch": 0.25310041687743107, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 29021 + }, + { + "epoch": 0.25310913816259967, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 29022 + }, + { + "epoch": 0.2531178594477682, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 29023 + }, + { + "epoch": 0.2531265807329368, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 29024 + }, + { + "epoch": 0.2531353020181054, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 29025 + }, + { + "epoch": 0.25314402330327396, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 29026 + }, + { + "epoch": 0.25315274458844256, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 29027 + }, + { + "epoch": 0.25316146587361116, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 29028 + }, + { + "epoch": 0.2531701871587797, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 29029 + }, + { + "epoch": 0.2531789084439483, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 29030 + }, + { + "epoch": 0.2531876297291169, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 29031 + }, + { + "epoch": 0.25319635101428545, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 29032 + }, + { + "epoch": 0.25320507229945405, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 29033 + }, + { + "epoch": 0.25321379358462265, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 29034 + }, + { + "epoch": 0.2532225148697912, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 29035 + }, + { + "epoch": 0.2532312361549598, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 29036 + }, + { + "epoch": 0.2532399574401284, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 29037 + }, + { + "epoch": 0.25324867872529694, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 29038 + }, + { + "epoch": 0.25325740001046554, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 29039 + }, + { + "epoch": 0.25326612129563414, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 29040 + }, + { + "epoch": 0.2532748425808027, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 29041 + }, + { + "epoch": 0.2532835638659713, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 29042 + }, + { + "epoch": 0.2532922851511399, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 29043 + }, + { + "epoch": 0.25330100643630843, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 29044 + }, + { + "epoch": 0.25330972772147703, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 29045 + }, + { + "epoch": 0.25331844900664563, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 29046 + }, + { + "epoch": 0.2533271702918142, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 29047 + }, + { + "epoch": 0.2533358915769828, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 29048 + }, + { + "epoch": 0.2533446128621514, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 29049 + }, + { + "epoch": 0.25335333414732, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 29050 + }, + { + "epoch": 0.2533620554324885, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 29051 + }, + { + "epoch": 0.2533707767176571, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 29052 + }, + { + "epoch": 0.2533794980028257, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 29053 + }, + { + "epoch": 0.25338821928799427, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 29054 + }, + { + "epoch": 0.25339694057316287, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 29055 + }, + { + "epoch": 0.25340566185833147, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 29056 + }, + { + "epoch": 0.2534143831435, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 29057 + }, + { + "epoch": 0.2534231044286686, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 29058 + }, + { + "epoch": 0.2534318257138372, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 29059 + }, + { + "epoch": 0.25344054699900576, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 29060 + }, + { + "epoch": 0.25344926828417436, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 29061 + }, + { + "epoch": 0.25345798956934296, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 29062 + }, + { + "epoch": 0.2534667108545115, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 29063 + }, + { + "epoch": 0.2534754321396801, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 29064 + }, + { + "epoch": 0.2534841534248487, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 29065 + }, + { + "epoch": 0.25349287471001725, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 29066 + }, + { + "epoch": 0.25350159599518585, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 29067 + }, + { + "epoch": 0.25351031728035445, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 29068 + }, + { + "epoch": 0.253519038565523, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 29069 + }, + { + "epoch": 0.2535277598506916, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 29070 + }, + { + "epoch": 0.2535364811358602, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 29071 + }, + { + "epoch": 0.25354520242102874, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 29072 + }, + { + "epoch": 0.25355392370619734, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 29073 + }, + { + "epoch": 0.25356264499136594, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 29074 + }, + { + "epoch": 0.25357136627653454, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 29075 + }, + { + "epoch": 0.2535800875617031, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 29076 + }, + { + "epoch": 0.2535888088468717, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 29077 + }, + { + "epoch": 0.2535975301320403, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 29078 + }, + { + "epoch": 0.25360625141720883, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 29079 + }, + { + "epoch": 0.25361497270237743, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 29080 + }, + { + "epoch": 0.25362369398754603, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 29081 + }, + { + "epoch": 0.2536324152727146, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 29082 + }, + { + "epoch": 0.2536411365578832, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 29083 + }, + { + "epoch": 0.2536498578430518, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 29084 + }, + { + "epoch": 0.2536585791282203, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 29085 + }, + { + "epoch": 0.2536673004133889, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 29086 + }, + { + "epoch": 0.2536760216985575, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 29087 + }, + { + "epoch": 0.25368474298372606, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 29088 + }, + { + "epoch": 0.25369346426889466, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 29089 + }, + { + "epoch": 0.25370218555406326, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 29090 + }, + { + "epoch": 0.2537109068392318, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 29091 + }, + { + "epoch": 0.2537196281244004, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 29092 + }, + { + "epoch": 0.253728349409569, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 29093 + }, + { + "epoch": 0.25373707069473755, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 29094 + }, + { + "epoch": 0.25374579197990615, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 29095 + }, + { + "epoch": 0.25375451326507475, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 29096 + }, + { + "epoch": 0.2537632345502433, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 29097 + }, + { + "epoch": 0.2537719558354119, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 29098 + }, + { + "epoch": 0.2537806771205805, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 29099 + }, + { + "epoch": 0.25378939840574904, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 29100 + }, + { + "epoch": 0.25379811969091765, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 29101 + }, + { + "epoch": 0.25380684097608625, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 29102 + }, + { + "epoch": 0.25381556226125485, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 29103 + }, + { + "epoch": 0.2538242835464234, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 29104 + }, + { + "epoch": 0.253833004831592, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 29105 + }, + { + "epoch": 0.2538417261167606, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 29106 + }, + { + "epoch": 0.25385044740192914, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 29107 + }, + { + "epoch": 0.25385916868709774, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 29108 + }, + { + "epoch": 0.25386788997226634, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 29109 + }, + { + "epoch": 0.2538766112574349, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 29110 + }, + { + "epoch": 0.2538853325426035, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 29111 + }, + { + "epoch": 0.2538940538277721, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 29112 + }, + { + "epoch": 0.2539027751129406, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 29113 + }, + { + "epoch": 0.2539114963981092, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 29114 + }, + { + "epoch": 0.2539202176832778, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 29115 + }, + { + "epoch": 0.25392893896844637, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 29116 + }, + { + "epoch": 0.25393766025361497, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 29117 + }, + { + "epoch": 0.25394638153878357, + "grad_norm": 0.0703125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 29118 + }, + { + "epoch": 0.2539551028239521, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 29119 + }, + { + "epoch": 0.2539638241091207, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 29120 + }, + { + "epoch": 0.2539725453942893, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 29121 + }, + { + "epoch": 0.25398126667945786, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 29122 + }, + { + "epoch": 0.25398998796462646, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 29123 + }, + { + "epoch": 0.25399870924979506, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 29124 + }, + { + "epoch": 0.2540074305349636, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 29125 + }, + { + "epoch": 0.2540161518201322, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 29126 + }, + { + "epoch": 0.2540248731053008, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 29127 + }, + { + "epoch": 0.25403359439046935, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 29128 + }, + { + "epoch": 0.25404231567563795, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 29129 + }, + { + "epoch": 0.25405103696080655, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 29130 + }, + { + "epoch": 0.25405975824597515, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 29131 + }, + { + "epoch": 0.2540684795311437, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 29132 + }, + { + "epoch": 0.2540772008163123, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 29133 + }, + { + "epoch": 0.2540859221014809, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 29134 + }, + { + "epoch": 0.25409464338664944, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 29135 + }, + { + "epoch": 0.25410336467181804, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 29136 + }, + { + "epoch": 0.25411208595698664, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 29137 + }, + { + "epoch": 0.2541208072421552, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 29138 + }, + { + "epoch": 0.2541295285273238, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 29139 + }, + { + "epoch": 0.2541382498124924, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 29140 + }, + { + "epoch": 0.25414697109766093, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 29141 + }, + { + "epoch": 0.25415569238282953, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 29142 + }, + { + "epoch": 0.25416441366799813, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 29143 + }, + { + "epoch": 0.2541731349531667, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 29144 + }, + { + "epoch": 0.2541818562383353, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 29145 + }, + { + "epoch": 0.2541905775235039, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 29146 + }, + { + "epoch": 0.2541992988086724, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 29147 + }, + { + "epoch": 0.254208020093841, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 29148 + }, + { + "epoch": 0.2542167413790096, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 29149 + }, + { + "epoch": 0.25422546266417817, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 29150 + }, + { + "epoch": 0.25423418394934677, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 29151 + }, + { + "epoch": 0.25424290523451537, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 29152 + }, + { + "epoch": 0.2542516265196839, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 29153 + }, + { + "epoch": 0.2542603478048525, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 29154 + }, + { + "epoch": 0.2542690690900211, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 29155 + }, + { + "epoch": 0.25427779037518966, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 29156 + }, + { + "epoch": 0.25428651166035826, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 29157 + }, + { + "epoch": 0.25429523294552686, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 29158 + }, + { + "epoch": 0.25430395423069546, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 29159 + }, + { + "epoch": 0.254312675515864, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 29160 + }, + { + "epoch": 0.2543213968010326, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 29161 + }, + { + "epoch": 0.2543301180862012, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 29162 + }, + { + "epoch": 0.25433883937136975, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 29163 + }, + { + "epoch": 0.25434756065653835, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 29164 + }, + { + "epoch": 0.25435628194170695, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 29165 + }, + { + "epoch": 0.2543650032268755, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 29166 + }, + { + "epoch": 0.2543737245120441, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 29167 + }, + { + "epoch": 0.2543824457972127, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 29168 + }, + { + "epoch": 0.25439116708238124, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 29169 + }, + { + "epoch": 0.25439988836754984, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 29170 + }, + { + "epoch": 0.25440860965271844, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 29171 + }, + { + "epoch": 0.254417330937887, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 29172 + }, + { + "epoch": 0.2544260522230556, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 29173 + }, + { + "epoch": 0.2544347735082242, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 29174 + }, + { + "epoch": 0.25444349479339273, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 29175 + }, + { + "epoch": 0.25445221607856133, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 29176 + }, + { + "epoch": 0.25446093736372993, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 29177 + }, + { + "epoch": 0.2544696586488985, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 29178 + }, + { + "epoch": 0.2544783799340671, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 29179 + }, + { + "epoch": 0.2544871012192357, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 29180 + }, + { + "epoch": 0.2544958225044042, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 29181 + }, + { + "epoch": 0.2545045437895728, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 29182 + }, + { + "epoch": 0.2545132650747414, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 29183 + }, + { + "epoch": 0.25452198635991, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 29184 + }, + { + "epoch": 0.25453070764507857, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 29185 + }, + { + "epoch": 0.25453942893024717, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 29186 + }, + { + "epoch": 0.25454815021541577, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 29187 + }, + { + "epoch": 0.2545568715005843, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 29188 + }, + { + "epoch": 0.2545655927857529, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 29189 + }, + { + "epoch": 0.2545743140709215, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 29190 + }, + { + "epoch": 0.25458303535609006, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 29191 + }, + { + "epoch": 0.25459175664125866, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 29192 + }, + { + "epoch": 0.25460047792642726, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 29193 + }, + { + "epoch": 0.2546091992115958, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 29194 + }, + { + "epoch": 0.2546179204967644, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 29195 + }, + { + "epoch": 0.254626641781933, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 29196 + }, + { + "epoch": 0.25463536306710155, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 29197 + }, + { + "epoch": 0.25464408435227015, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 29198 + }, + { + "epoch": 0.25465280563743875, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 29199 + }, + { + "epoch": 0.2546615269226073, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 29200 + }, + { + "epoch": 0.2546702482077759, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 29201 + }, + { + "epoch": 0.2546789694929445, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 29202 + }, + { + "epoch": 0.25468769077811304, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 29203 + }, + { + "epoch": 0.25469641206328164, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 29204 + }, + { + "epoch": 0.25470513334845024, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 29205 + }, + { + "epoch": 0.2547138546336188, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 29206 + }, + { + "epoch": 0.2547225759187874, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 29207 + }, + { + "epoch": 0.254731297203956, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 29208 + }, + { + "epoch": 0.25474001848912453, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 29209 + }, + { + "epoch": 0.25474873977429313, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 29210 + }, + { + "epoch": 0.25475746105946173, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 29211 + }, + { + "epoch": 0.25476618234463033, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 29212 + }, + { + "epoch": 0.2547749036297989, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 29213 + }, + { + "epoch": 0.2547836249149675, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 29214 + }, + { + "epoch": 0.2547923462001361, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 29215 + }, + { + "epoch": 0.2548010674853046, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 29216 + }, + { + "epoch": 0.2548097887704732, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 29217 + }, + { + "epoch": 0.2548185100556418, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 29218 + }, + { + "epoch": 0.25482723134081037, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 29219 + }, + { + "epoch": 0.25483595262597897, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 29220 + }, + { + "epoch": 0.25484467391114757, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 29221 + }, + { + "epoch": 0.2548533951963161, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 29222 + }, + { + "epoch": 0.2548621164814847, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 29223 + }, + { + "epoch": 0.2548708377666533, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 29224 + }, + { + "epoch": 0.25487955905182186, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 29225 + }, + { + "epoch": 0.25488828033699046, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 29226 + }, + { + "epoch": 0.25489700162215906, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 29227 + }, + { + "epoch": 0.2549057229073276, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 29228 + }, + { + "epoch": 0.2549144441924962, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 29229 + }, + { + "epoch": 0.2549231654776648, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 29230 + }, + { + "epoch": 0.25493188676283335, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 29231 + }, + { + "epoch": 0.25494060804800195, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 29232 + }, + { + "epoch": 0.25494932933317055, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 29233 + }, + { + "epoch": 0.2549580506183391, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 29234 + }, + { + "epoch": 0.2549667719035077, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 29235 + }, + { + "epoch": 0.2549754931886763, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 29236 + }, + { + "epoch": 0.25498421447384484, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 29237 + }, + { + "epoch": 0.25499293575901344, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 29238 + }, + { + "epoch": 0.25500165704418204, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 29239 + }, + { + "epoch": 0.25501037832935064, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 29240 + }, + { + "epoch": 0.2550190996145192, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 29241 + }, + { + "epoch": 0.2550278208996878, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 29242 + }, + { + "epoch": 0.2550365421848564, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 29243 + }, + { + "epoch": 0.25504526347002493, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 29244 + }, + { + "epoch": 0.25505398475519353, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 29245 + }, + { + "epoch": 0.25506270604036213, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 29246 + }, + { + "epoch": 0.2550714273255307, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 29247 + }, + { + "epoch": 0.2550801486106993, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 29248 + }, + { + "epoch": 0.2550888698958679, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 29249 + }, + { + "epoch": 0.2550975911810364, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 29250 + }, + { + "epoch": 0.255106312466205, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 29251 + }, + { + "epoch": 0.2551150337513736, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 29252 + }, + { + "epoch": 0.25512375503654217, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 29253 + }, + { + "epoch": 0.25513247632171077, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 29254 + }, + { + "epoch": 0.25514119760687937, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 29255 + }, + { + "epoch": 0.2551499188920479, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 29256 + }, + { + "epoch": 0.2551586401772165, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 29257 + }, + { + "epoch": 0.2551673614623851, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 29258 + }, + { + "epoch": 0.25517608274755366, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 29259 + }, + { + "epoch": 0.25518480403272226, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 29260 + }, + { + "epoch": 0.25519352531789086, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 29261 + }, + { + "epoch": 0.2552022466030594, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 29262 + }, + { + "epoch": 0.255210967888228, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 29263 + }, + { + "epoch": 0.2552196891733966, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 29264 + }, + { + "epoch": 0.25522841045856515, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 29265 + }, + { + "epoch": 0.25523713174373375, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 29266 + }, + { + "epoch": 0.25524585302890235, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 29267 + }, + { + "epoch": 0.25525457431407095, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 29268 + }, + { + "epoch": 0.2552632955992395, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 29269 + }, + { + "epoch": 0.2552720168844081, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 29270 + }, + { + "epoch": 0.2552807381695767, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 29271 + }, + { + "epoch": 0.25528945945474524, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 29272 + }, + { + "epoch": 0.25529818073991384, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 29273 + }, + { + "epoch": 0.25530690202508244, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 29274 + }, + { + "epoch": 0.255315623310251, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 29275 + }, + { + "epoch": 0.2553243445954196, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 29276 + }, + { + "epoch": 0.2553330658805882, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 29277 + }, + { + "epoch": 0.25534178716575673, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 29278 + }, + { + "epoch": 0.25535050845092533, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 29279 + }, + { + "epoch": 0.25535922973609393, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 29280 + }, + { + "epoch": 0.2553679510212625, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 29281 + }, + { + "epoch": 0.2553766723064311, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 29282 + }, + { + "epoch": 0.2553853935915997, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 29283 + }, + { + "epoch": 0.2553941148767682, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 29284 + }, + { + "epoch": 0.2554028361619368, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 29285 + }, + { + "epoch": 0.2554115574471054, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 29286 + }, + { + "epoch": 0.25542027873227396, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 29287 + }, + { + "epoch": 0.25542900001744256, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 29288 + }, + { + "epoch": 0.25543772130261116, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 29289 + }, + { + "epoch": 0.2554464425877797, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 29290 + }, + { + "epoch": 0.2554551638729483, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 29291 + }, + { + "epoch": 0.2554638851581169, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 29292 + }, + { + "epoch": 0.25547260644328545, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 29293 + }, + { + "epoch": 0.25548132772845406, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 29294 + }, + { + "epoch": 0.25549004901362266, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 29295 + }, + { + "epoch": 0.25549877029879126, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 29296 + }, + { + "epoch": 0.2555074915839598, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 29297 + }, + { + "epoch": 0.2555162128691284, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 29298 + }, + { + "epoch": 0.255524934154297, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 29299 + }, + { + "epoch": 0.25553365543946555, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 29300 + }, + { + "epoch": 0.25554237672463415, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 29301 + }, + { + "epoch": 0.25555109800980275, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 29302 + }, + { + "epoch": 0.2555598192949713, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 29303 + }, + { + "epoch": 0.2555685405801399, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 29304 + }, + { + "epoch": 0.2555772618653085, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 29305 + }, + { + "epoch": 0.25558598315047704, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 29306 + }, + { + "epoch": 0.25559470443564564, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 29307 + }, + { + "epoch": 0.25560342572081424, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 29308 + }, + { + "epoch": 0.2556121470059828, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 29309 + }, + { + "epoch": 0.2556208682911514, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 29310 + }, + { + "epoch": 0.25562958957632, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 29311 + }, + { + "epoch": 0.2556383108614885, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 29312 + }, + { + "epoch": 0.2556470321466571, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 29313 + }, + { + "epoch": 0.2556557534318257, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 29314 + }, + { + "epoch": 0.25566447471699427, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 29315 + }, + { + "epoch": 0.2556731960021629, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 29316 + }, + { + "epoch": 0.2556819172873315, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 29317 + }, + { + "epoch": 0.2556906385725, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 29318 + }, + { + "epoch": 0.2556993598576686, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 29319 + }, + { + "epoch": 0.2557080811428372, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 29320 + }, + { + "epoch": 0.2557168024280058, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 29321 + }, + { + "epoch": 0.25572552371317436, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 29322 + }, + { + "epoch": 0.25573424499834296, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 29323 + }, + { + "epoch": 0.25574296628351156, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 29324 + }, + { + "epoch": 0.2557516875686801, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 29325 + }, + { + "epoch": 0.2557604088538487, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 29326 + }, + { + "epoch": 0.2557691301390173, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 29327 + }, + { + "epoch": 0.25577785142418585, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 29328 + }, + { + "epoch": 0.25578657270935445, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 29329 + }, + { + "epoch": 0.25579529399452305, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 29330 + }, + { + "epoch": 0.2558040152796916, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 29331 + }, + { + "epoch": 0.2558127365648602, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 29332 + }, + { + "epoch": 0.2558214578500288, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 29333 + }, + { + "epoch": 0.25583017913519734, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 29334 + }, + { + "epoch": 0.25583890042036594, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 29335 + }, + { + "epoch": 0.25584762170553454, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 29336 + }, + { + "epoch": 0.2558563429907031, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 29337 + }, + { + "epoch": 0.2558650642758717, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 29338 + }, + { + "epoch": 0.2558737855610403, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 29339 + }, + { + "epoch": 0.25588250684620883, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 29340 + }, + { + "epoch": 0.25589122813137744, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 29341 + }, + { + "epoch": 0.25589994941654604, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 29342 + }, + { + "epoch": 0.2559086707017146, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 29343 + }, + { + "epoch": 0.2559173919868832, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 29344 + }, + { + "epoch": 0.2559261132720518, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 29345 + }, + { + "epoch": 0.2559348345572203, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 29346 + }, + { + "epoch": 0.2559435558423889, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 29347 + }, + { + "epoch": 0.2559522771275575, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 29348 + }, + { + "epoch": 0.2559609984127261, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 29349 + }, + { + "epoch": 0.25596971969789467, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 29350 + }, + { + "epoch": 0.25597844098306327, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 29351 + }, + { + "epoch": 0.25598716226823187, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 29352 + }, + { + "epoch": 0.2559958835534004, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 29353 + }, + { + "epoch": 0.256004604838569, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 29354 + }, + { + "epoch": 0.2560133261237376, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 29355 + }, + { + "epoch": 0.25602204740890616, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 29356 + }, + { + "epoch": 0.25603076869407476, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 29357 + }, + { + "epoch": 0.25603948997924336, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 29358 + }, + { + "epoch": 0.2560482112644119, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 29359 + }, + { + "epoch": 0.2560569325495805, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 29360 + }, + { + "epoch": 0.2560656538347491, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 29361 + }, + { + "epoch": 0.25607437511991765, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 29362 + }, + { + "epoch": 0.25608309640508625, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 29363 + }, + { + "epoch": 0.25609181769025485, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 29364 + }, + { + "epoch": 0.2561005389754234, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 29365 + }, + { + "epoch": 0.256109260260592, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 29366 + }, + { + "epoch": 0.2561179815457606, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 29367 + }, + { + "epoch": 0.25612670283092914, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 29368 + }, + { + "epoch": 0.25613542411609774, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 29369 + }, + { + "epoch": 0.25614414540126634, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 29370 + }, + { + "epoch": 0.2561528666864349, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 29371 + }, + { + "epoch": 0.2561615879716035, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 29372 + }, + { + "epoch": 0.2561703092567721, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 29373 + }, + { + "epoch": 0.25617903054194063, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 29374 + }, + { + "epoch": 0.25618775182710923, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 29375 + }, + { + "epoch": 0.25619647311227783, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 29376 + }, + { + "epoch": 0.25620519439744643, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 29377 + }, + { + "epoch": 0.256213915682615, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 29378 + }, + { + "epoch": 0.2562226369677836, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 29379 + }, + { + "epoch": 0.2562313582529522, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 29380 + }, + { + "epoch": 0.2562400795381207, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 29381 + }, + { + "epoch": 0.2562488008232893, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 29382 + }, + { + "epoch": 0.2562575221084579, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 29383 + }, + { + "epoch": 0.25626624339362647, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 29384 + }, + { + "epoch": 0.25627496467879507, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 29385 + }, + { + "epoch": 0.25628368596396367, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 29386 + }, + { + "epoch": 0.2562924072491322, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 29387 + }, + { + "epoch": 0.2563011285343008, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 29388 + }, + { + "epoch": 0.2563098498194694, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 29389 + }, + { + "epoch": 0.25631857110463796, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 29390 + }, + { + "epoch": 0.25632729238980656, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 29391 + }, + { + "epoch": 0.25633601367497516, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 29392 + }, + { + "epoch": 0.2563447349601437, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 29393 + }, + { + "epoch": 0.2563534562453123, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 29394 + }, + { + "epoch": 0.2563621775304809, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 29395 + }, + { + "epoch": 0.25637089881564945, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 29396 + }, + { + "epoch": 0.25637962010081805, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 29397 + }, + { + "epoch": 0.25638834138598665, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 29398 + }, + { + "epoch": 0.2563970626711552, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 29399 + }, + { + "epoch": 0.2564057839563238, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 29400 + }, + { + "epoch": 0.2564145052414924, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 29401 + }, + { + "epoch": 0.25642322652666094, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 29402 + }, + { + "epoch": 0.25643194781182954, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 29403 + }, + { + "epoch": 0.25644066909699814, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 29404 + }, + { + "epoch": 0.25644939038216674, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 29405 + }, + { + "epoch": 0.2564581116673353, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 29406 + }, + { + "epoch": 0.2564668329525039, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 29407 + }, + { + "epoch": 0.2564755542376725, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 29408 + }, + { + "epoch": 0.25648427552284103, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 29409 + }, + { + "epoch": 0.25649299680800963, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 29410 + }, + { + "epoch": 0.25650171809317823, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 29411 + }, + { + "epoch": 0.2565104393783468, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 29412 + }, + { + "epoch": 0.2565191606635154, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 29413 + }, + { + "epoch": 0.256527881948684, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 29414 + }, + { + "epoch": 0.2565366032338525, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 29415 + }, + { + "epoch": 0.2565453245190211, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 29416 + }, + { + "epoch": 0.2565540458041897, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 29417 + }, + { + "epoch": 0.25656276708935827, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 29418 + }, + { + "epoch": 0.25657148837452687, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 29419 + }, + { + "epoch": 0.25658020965969547, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 29420 + }, + { + "epoch": 0.256588930944864, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 29421 + }, + { + "epoch": 0.2565976522300326, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 29422 + }, + { + "epoch": 0.2566063735152012, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 29423 + }, + { + "epoch": 0.25661509480036976, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 29424 + }, + { + "epoch": 0.25662381608553836, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 29425 + }, + { + "epoch": 0.25663253737070696, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 29426 + }, + { + "epoch": 0.2566412586558755, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 29427 + }, + { + "epoch": 0.2566499799410441, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 29428 + }, + { + "epoch": 0.2566587012262127, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 29429 + }, + { + "epoch": 0.2566674225113813, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 29430 + }, + { + "epoch": 0.25667614379654985, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 29431 + }, + { + "epoch": 0.25668486508171845, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 29432 + }, + { + "epoch": 0.25669358636688705, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 29433 + }, + { + "epoch": 0.2567023076520556, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 29434 + }, + { + "epoch": 0.2567110289372242, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 29435 + }, + { + "epoch": 0.2567197502223928, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 29436 + }, + { + "epoch": 0.25672847150756134, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 29437 + }, + { + "epoch": 0.25673719279272994, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 29438 + }, + { + "epoch": 0.25674591407789854, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 29439 + }, + { + "epoch": 0.2567546353630671, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 29440 + }, + { + "epoch": 0.2567633566482357, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 29441 + }, + { + "epoch": 0.2567720779334043, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 29442 + }, + { + "epoch": 0.25678079921857283, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 29443 + }, + { + "epoch": 0.25678952050374143, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 29444 + }, + { + "epoch": 0.25679824178891003, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 29445 + }, + { + "epoch": 0.2568069630740786, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 29446 + }, + { + "epoch": 0.2568156843592472, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 29447 + }, + { + "epoch": 0.2568244056444158, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 29448 + }, + { + "epoch": 0.2568331269295843, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 29449 + }, + { + "epoch": 0.2568418482147529, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 29450 + }, + { + "epoch": 0.2568505694999215, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 29451 + }, + { + "epoch": 0.25685929078509007, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 29452 + }, + { + "epoch": 0.25686801207025867, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 29453 + }, + { + "epoch": 0.25687673335542727, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 29454 + }, + { + "epoch": 0.2568854546405958, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 29455 + }, + { + "epoch": 0.2568941759257644, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 29456 + }, + { + "epoch": 0.256902897210933, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 29457 + }, + { + "epoch": 0.2569116184961016, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 29458 + }, + { + "epoch": 0.25692033978127016, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 29459 + }, + { + "epoch": 0.25692906106643876, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 29460 + }, + { + "epoch": 0.25693778235160736, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 29461 + }, + { + "epoch": 0.2569465036367759, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 29462 + }, + { + "epoch": 0.2569552249219445, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 29463 + }, + { + "epoch": 0.2569639462071131, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 29464 + }, + { + "epoch": 0.25697266749228165, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 29465 + }, + { + "epoch": 0.25698138877745025, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 29466 + }, + { + "epoch": 0.25699011006261885, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 29467 + }, + { + "epoch": 0.2569988313477874, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 29468 + }, + { + "epoch": 0.257007552632956, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 29469 + }, + { + "epoch": 0.2570162739181246, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 29470 + }, + { + "epoch": 0.25702499520329314, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 29471 + }, + { + "epoch": 0.25703371648846174, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 29472 + }, + { + "epoch": 0.25704243777363034, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 29473 + }, + { + "epoch": 0.2570511590587989, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 29474 + }, + { + "epoch": 0.2570598803439675, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 29475 + }, + { + "epoch": 0.2570686016291361, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 29476 + }, + { + "epoch": 0.25707732291430463, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 29477 + }, + { + "epoch": 0.25708604419947323, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 29478 + }, + { + "epoch": 0.25709476548464183, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 29479 + }, + { + "epoch": 0.2571034867698104, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 29480 + }, + { + "epoch": 0.257112208054979, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 29481 + }, + { + "epoch": 0.2571209293401476, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 29482 + }, + { + "epoch": 0.2571296506253161, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 29483 + }, + { + "epoch": 0.2571383719104847, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 29484 + }, + { + "epoch": 0.2571470931956533, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 29485 + }, + { + "epoch": 0.2571558144808219, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 29486 + }, + { + "epoch": 0.25716453576599047, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 29487 + }, + { + "epoch": 0.25717325705115907, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 29488 + }, + { + "epoch": 0.25718197833632767, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 29489 + }, + { + "epoch": 0.2571906996214962, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 29490 + }, + { + "epoch": 0.2571994209066648, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 29491 + }, + { + "epoch": 0.2572081421918334, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 29492 + }, + { + "epoch": 0.25721686347700196, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 29493 + }, + { + "epoch": 0.25722558476217056, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 29494 + }, + { + "epoch": 0.25723430604733916, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 29495 + }, + { + "epoch": 0.2572430273325077, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 29496 + }, + { + "epoch": 0.2572517486176763, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 29497 + }, + { + "epoch": 0.2572604699028449, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 29498 + }, + { + "epoch": 0.25726919118801345, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 29499 + }, + { + "epoch": 0.25727791247318205, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 29500 + }, + { + "epoch": 0.25728663375835065, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 29501 + }, + { + "epoch": 0.2572953550435192, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 29502 + }, + { + "epoch": 0.2573040763286878, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 29503 + }, + { + "epoch": 0.2573127976138564, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 29504 + }, + { + "epoch": 0.25732151889902494, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 29505 + }, + { + "epoch": 0.25733024018419354, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 29506 + }, + { + "epoch": 0.25733896146936214, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9802, + "step": 29507 + }, + { + "epoch": 0.2573476827545307, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 29508 + }, + { + "epoch": 0.2573564040396993, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 29509 + }, + { + "epoch": 0.2573651253248679, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 29510 + }, + { + "epoch": 0.2573738466100364, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 29511 + }, + { + "epoch": 0.25738256789520503, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 29512 + }, + { + "epoch": 0.25739128918037363, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 29513 + }, + { + "epoch": 0.25740001046554223, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 29514 + }, + { + "epoch": 0.2574087317507108, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 29515 + }, + { + "epoch": 0.2574174530358794, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 29516 + }, + { + "epoch": 0.257426174321048, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 29517 + }, + { + "epoch": 0.2574348956062165, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 29518 + }, + { + "epoch": 0.2574436168913851, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 29519 + }, + { + "epoch": 0.2574523381765537, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 29520 + }, + { + "epoch": 0.25746105946172226, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 29521 + }, + { + "epoch": 0.25746978074689086, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 29522 + }, + { + "epoch": 0.25747850203205946, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 29523 + }, + { + "epoch": 0.257487223317228, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 29524 + }, + { + "epoch": 0.2574959446023966, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 29525 + }, + { + "epoch": 0.2575046658875652, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 29526 + }, + { + "epoch": 0.25751338717273375, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 29527 + }, + { + "epoch": 0.25752210845790235, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 29528 + }, + { + "epoch": 0.25753082974307095, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 29529 + }, + { + "epoch": 0.2575395510282395, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 29530 + }, + { + "epoch": 0.2575482723134081, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 29531 + }, + { + "epoch": 0.2575569935985767, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 29532 + }, + { + "epoch": 0.25756571488374524, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 29533 + }, + { + "epoch": 0.25757443616891385, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 29534 + }, + { + "epoch": 0.25758315745408245, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 29535 + }, + { + "epoch": 0.257591878739251, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 29536 + }, + { + "epoch": 0.2576006000244196, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 29537 + }, + { + "epoch": 0.2576093213095882, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 29538 + }, + { + "epoch": 0.2576180425947568, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 29539 + }, + { + "epoch": 0.25762676387992534, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 29540 + }, + { + "epoch": 0.25763548516509394, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 29541 + }, + { + "epoch": 0.25764420645026254, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 29542 + }, + { + "epoch": 0.2576529277354311, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 29543 + }, + { + "epoch": 0.2576616490205997, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 29544 + }, + { + "epoch": 0.2576703703057683, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 29545 + }, + { + "epoch": 0.2576790915909368, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 29546 + }, + { + "epoch": 0.2576878128761054, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 29547 + }, + { + "epoch": 0.257696534161274, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 29548 + }, + { + "epoch": 0.25770525544644257, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 29549 + }, + { + "epoch": 0.25771397673161117, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 29550 + }, + { + "epoch": 0.25772269801677977, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 29551 + }, + { + "epoch": 0.2577314193019483, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 29552 + }, + { + "epoch": 0.2577401405871169, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9853, + "step": 29553 + }, + { + "epoch": 0.2577488618722855, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 29554 + }, + { + "epoch": 0.25775758315745406, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 29555 + }, + { + "epoch": 0.25776630444262266, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 29556 + }, + { + "epoch": 0.25777502572779126, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 29557 + }, + { + "epoch": 0.2577837470129598, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 29558 + }, + { + "epoch": 0.2577924682981284, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 29559 + }, + { + "epoch": 0.257801189583297, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 29560 + }, + { + "epoch": 0.25780991086846555, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 29561 + }, + { + "epoch": 0.25781863215363415, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 29562 + }, + { + "epoch": 0.25782735343880275, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 29563 + }, + { + "epoch": 0.2578360747239713, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 29564 + }, + { + "epoch": 0.2578447960091399, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 29565 + }, + { + "epoch": 0.2578535172943085, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 29566 + }, + { + "epoch": 0.2578622385794771, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 29567 + }, + { + "epoch": 0.25787095986464564, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 29568 + }, + { + "epoch": 0.25787968114981424, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 29569 + }, + { + "epoch": 0.25788840243498284, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 29570 + }, + { + "epoch": 0.2578971237201514, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 29571 + }, + { + "epoch": 0.25790584500532, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 29572 + }, + { + "epoch": 0.2579145662904886, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 29573 + }, + { + "epoch": 0.25792328757565713, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 29574 + }, + { + "epoch": 0.25793200886082573, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 29575 + }, + { + "epoch": 0.25794073014599433, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 29576 + }, + { + "epoch": 0.2579494514311629, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 29577 + }, + { + "epoch": 0.2579581727163315, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 29578 + }, + { + "epoch": 0.2579668940015001, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 29579 + }, + { + "epoch": 0.2579756152866686, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 29580 + }, + { + "epoch": 0.2579843365718372, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 29581 + }, + { + "epoch": 0.2579930578570058, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 29582 + }, + { + "epoch": 0.25800177914217437, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 29583 + }, + { + "epoch": 0.25801050042734297, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 29584 + }, + { + "epoch": 0.25801922171251157, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 29585 + }, + { + "epoch": 0.2580279429976801, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 29586 + }, + { + "epoch": 0.2580366642828487, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 29587 + }, + { + "epoch": 0.2580453855680173, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 29588 + }, + { + "epoch": 0.25805410685318586, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 29589 + }, + { + "epoch": 0.25806282813835446, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 29590 + }, + { + "epoch": 0.25807154942352306, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 29591 + }, + { + "epoch": 0.2580802707086916, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 29592 + }, + { + "epoch": 0.2580889919938602, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 29593 + }, + { + "epoch": 0.2580977132790288, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 29594 + }, + { + "epoch": 0.2581064345641974, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 29595 + }, + { + "epoch": 0.25811515584936595, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 29596 + }, + { + "epoch": 0.25812387713453455, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 29597 + }, + { + "epoch": 0.25813259841970315, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 29598 + }, + { + "epoch": 0.2581413197048717, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 29599 + }, + { + "epoch": 0.2581500409900403, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 29600 + }, + { + "epoch": 0.2581587622752089, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 29601 + }, + { + "epoch": 0.25816748356037744, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 29602 + }, + { + "epoch": 0.25817620484554604, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 29603 + }, + { + "epoch": 0.25818492613071464, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 29604 + }, + { + "epoch": 0.2581936474158832, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 29605 + }, + { + "epoch": 0.2582023687010518, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 29606 + }, + { + "epoch": 0.2582110899862204, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 29607 + }, + { + "epoch": 0.25821981127138893, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 29608 + }, + { + "epoch": 0.25822853255655753, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 29609 + }, + { + "epoch": 0.25823725384172613, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 29610 + }, + { + "epoch": 0.2582459751268947, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 29611 + }, + { + "epoch": 0.2582546964120633, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 29612 + }, + { + "epoch": 0.2582634176972319, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 29613 + }, + { + "epoch": 0.2582721389824004, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 29614 + }, + { + "epoch": 0.258280860267569, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 29615 + }, + { + "epoch": 0.2582895815527376, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 29616 + }, + { + "epoch": 0.25829830283790617, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 29617 + }, + { + "epoch": 0.25830702412307477, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 29618 + }, + { + "epoch": 0.25831574540824337, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 29619 + }, + { + "epoch": 0.2583244666934119, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 29620 + }, + { + "epoch": 0.2583331879785805, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 29621 + }, + { + "epoch": 0.2583419092637491, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 29622 + }, + { + "epoch": 0.2583506305489177, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 29623 + }, + { + "epoch": 0.25835935183408626, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 29624 + }, + { + "epoch": 0.25836807311925486, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 29625 + }, + { + "epoch": 0.25837679440442346, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 29626 + }, + { + "epoch": 0.258385515689592, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 29627 + }, + { + "epoch": 0.2583942369747606, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 29628 + }, + { + "epoch": 0.2584029582599292, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 29629 + }, + { + "epoch": 0.25841167954509775, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 29630 + }, + { + "epoch": 0.25842040083026635, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 29631 + }, + { + "epoch": 0.25842912211543495, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 29632 + }, + { + "epoch": 0.2584378434006035, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 29633 + }, + { + "epoch": 0.2584465646857721, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 29634 + }, + { + "epoch": 0.2584552859709407, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 29635 + }, + { + "epoch": 0.25846400725610924, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 29636 + }, + { + "epoch": 0.25847272854127784, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 29637 + }, + { + "epoch": 0.25848144982644644, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 29638 + }, + { + "epoch": 0.258490171111615, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 29639 + }, + { + "epoch": 0.2584988923967836, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 29640 + }, + { + "epoch": 0.2585076136819522, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 29641 + }, + { + "epoch": 0.25851633496712073, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 29642 + }, + { + "epoch": 0.25852505625228933, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 29643 + }, + { + "epoch": 0.25853377753745793, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 29644 + }, + { + "epoch": 0.2585424988226265, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 29645 + }, + { + "epoch": 0.2585512201077951, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 29646 + }, + { + "epoch": 0.2585599413929637, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 29647 + }, + { + "epoch": 0.2585686626781323, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 29648 + }, + { + "epoch": 0.2585773839633008, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 29649 + }, + { + "epoch": 0.2585861052484694, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 29650 + }, + { + "epoch": 0.258594826533638, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 29651 + }, + { + "epoch": 0.25860354781880657, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 29652 + }, + { + "epoch": 0.25861226910397517, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 29653 + }, + { + "epoch": 0.25862099038914377, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 29654 + }, + { + "epoch": 0.2586297116743123, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 29655 + }, + { + "epoch": 0.2586384329594809, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 29656 + }, + { + "epoch": 0.2586471542446495, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 29657 + }, + { + "epoch": 0.25865587552981806, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 29658 + }, + { + "epoch": 0.25866459681498666, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 29659 + }, + { + "epoch": 0.25867331810015526, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 29660 + }, + { + "epoch": 0.2586820393853238, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 29661 + }, + { + "epoch": 0.2586907606704924, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 29662 + }, + { + "epoch": 0.258699481955661, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 29663 + }, + { + "epoch": 0.25870820324082955, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 29664 + }, + { + "epoch": 0.25871692452599815, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 29665 + }, + { + "epoch": 0.25872564581116675, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 29666 + }, + { + "epoch": 0.2587343670963353, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 29667 + }, + { + "epoch": 0.2587430883815039, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 29668 + }, + { + "epoch": 0.2587518096666725, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 29669 + }, + { + "epoch": 0.25876053095184104, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 29670 + }, + { + "epoch": 0.25876925223700964, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 29671 + }, + { + "epoch": 0.25877797352217824, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 29672 + }, + { + "epoch": 0.2587866948073468, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 29673 + }, + { + "epoch": 0.2587954160925154, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 29674 + }, + { + "epoch": 0.258804137377684, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 29675 + }, + { + "epoch": 0.2588128586628526, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 29676 + }, + { + "epoch": 0.25882157994802113, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 29677 + }, + { + "epoch": 0.25883030123318973, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 29678 + }, + { + "epoch": 0.25883902251835833, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 29679 + }, + { + "epoch": 0.2588477438035269, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 29680 + }, + { + "epoch": 0.2588564650886955, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 29681 + }, + { + "epoch": 0.2588651863738641, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 29682 + }, + { + "epoch": 0.2588739076590326, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 29683 + }, + { + "epoch": 0.2588826289442012, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 29684 + }, + { + "epoch": 0.2588913502293698, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 29685 + }, + { + "epoch": 0.25890007151453837, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 29686 + }, + { + "epoch": 0.25890879279970697, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 29687 + }, + { + "epoch": 0.25891751408487557, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 29688 + }, + { + "epoch": 0.2589262353700441, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 29689 + }, + { + "epoch": 0.2589349566552127, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 29690 + }, + { + "epoch": 0.2589436779403813, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 29691 + }, + { + "epoch": 0.25895239922554986, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 29692 + }, + { + "epoch": 0.25896112051071846, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 29693 + }, + { + "epoch": 0.25896984179588706, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 29694 + }, + { + "epoch": 0.2589785630810556, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 29695 + }, + { + "epoch": 0.2589872843662242, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 29696 + }, + { + "epoch": 0.2589960056513928, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 29697 + }, + { + "epoch": 0.25900472693656135, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 29698 + }, + { + "epoch": 0.25901344822172995, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 29699 + }, + { + "epoch": 0.25902216950689855, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 29700 + }, + { + "epoch": 0.2590308907920671, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 29701 + }, + { + "epoch": 0.2590396120772357, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 29702 + }, + { + "epoch": 0.2590483333624043, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 29703 + }, + { + "epoch": 0.2590570546475729, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 29704 + }, + { + "epoch": 0.25906577593274144, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 29705 + }, + { + "epoch": 0.25907449721791004, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 29706 + }, + { + "epoch": 0.25908321850307864, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 29707 + }, + { + "epoch": 0.2590919397882472, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 29708 + }, + { + "epoch": 0.2591006610734158, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 29709 + }, + { + "epoch": 0.2591093823585844, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 29710 + }, + { + "epoch": 0.25911810364375293, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 29711 + }, + { + "epoch": 0.25912682492892153, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 29712 + }, + { + "epoch": 0.25913554621409013, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 29713 + }, + { + "epoch": 0.2591442674992587, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 29714 + }, + { + "epoch": 0.2591529887844273, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 29715 + }, + { + "epoch": 0.2591617100695959, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 29716 + }, + { + "epoch": 0.2591704313547644, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 29717 + }, + { + "epoch": 0.259179152639933, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 29718 + }, + { + "epoch": 0.2591878739251016, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 29719 + }, + { + "epoch": 0.25919659521027016, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 29720 + }, + { + "epoch": 0.25920531649543876, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 29721 + }, + { + "epoch": 0.25921403778060736, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 29722 + }, + { + "epoch": 0.2592227590657759, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 29723 + }, + { + "epoch": 0.2592314803509445, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 29724 + }, + { + "epoch": 0.2592402016361131, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 29725 + }, + { + "epoch": 0.25924892292128165, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 29726 + }, + { + "epoch": 0.25925764420645026, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 29727 + }, + { + "epoch": 0.25926636549161886, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 29728 + }, + { + "epoch": 0.2592750867767874, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 29729 + }, + { + "epoch": 0.259283808061956, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 29730 + }, + { + "epoch": 0.2592925293471246, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 29731 + }, + { + "epoch": 0.2593012506322932, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 29732 + }, + { + "epoch": 0.25930997191746175, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 29733 + }, + { + "epoch": 0.25931869320263035, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 29734 + }, + { + "epoch": 0.25932741448779895, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 29735 + }, + { + "epoch": 0.2593361357729675, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 29736 + }, + { + "epoch": 0.2593448570581361, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 29737 + }, + { + "epoch": 0.2593535783433047, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 29738 + }, + { + "epoch": 0.25936229962847324, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 29739 + }, + { + "epoch": 0.25937102091364184, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 29740 + }, + { + "epoch": 0.25937974219881044, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 29741 + }, + { + "epoch": 0.259388463483979, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 29742 + }, + { + "epoch": 0.2593971847691476, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 29743 + }, + { + "epoch": 0.2594059060543162, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 29744 + }, + { + "epoch": 0.2594146273394847, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 29745 + }, + { + "epoch": 0.2594233486246533, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 29746 + }, + { + "epoch": 0.2594320699098219, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 29747 + }, + { + "epoch": 0.25944079119499047, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 29748 + }, + { + "epoch": 0.2594495124801591, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 29749 + }, + { + "epoch": 0.2594582337653277, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 29750 + }, + { + "epoch": 0.2594669550504962, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 29751 + }, + { + "epoch": 0.2594756763356648, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 29752 + }, + { + "epoch": 0.2594843976208334, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 29753 + }, + { + "epoch": 0.25949311890600196, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 29754 + }, + { + "epoch": 0.25950184019117056, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 29755 + }, + { + "epoch": 0.25951056147633916, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 29756 + }, + { + "epoch": 0.25951928276150776, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 29757 + }, + { + "epoch": 0.2595280040466763, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 29758 + }, + { + "epoch": 0.2595367253318449, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 29759 + }, + { + "epoch": 0.2595454466170135, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 29760 + }, + { + "epoch": 0.25955416790218205, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 29761 + }, + { + "epoch": 0.25956288918735065, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 29762 + }, + { + "epoch": 0.25957161047251925, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 29763 + }, + { + "epoch": 0.2595803317576878, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 29764 + }, + { + "epoch": 0.2595890530428564, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 29765 + }, + { + "epoch": 0.259597774328025, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 29766 + }, + { + "epoch": 0.25960649561319354, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 29767 + }, + { + "epoch": 0.25961521689836214, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 29768 + }, + { + "epoch": 0.25962393818353074, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 29769 + }, + { + "epoch": 0.2596326594686993, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 29770 + }, + { + "epoch": 0.2596413807538679, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 29771 + }, + { + "epoch": 0.2596501020390365, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 29772 + }, + { + "epoch": 0.25965882332420503, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 29773 + }, + { + "epoch": 0.25966754460937364, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 29774 + }, + { + "epoch": 0.25967626589454224, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 29775 + }, + { + "epoch": 0.2596849871797108, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 29776 + }, + { + "epoch": 0.2596937084648794, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 29777 + }, + { + "epoch": 0.259702429750048, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 29778 + }, + { + "epoch": 0.2597111510352165, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 29779 + }, + { + "epoch": 0.2597198723203851, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 29780 + }, + { + "epoch": 0.2597285936055537, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 29781 + }, + { + "epoch": 0.25973731489072227, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 29782 + }, + { + "epoch": 0.25974603617589087, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 29783 + }, + { + "epoch": 0.25975475746105947, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 29784 + }, + { + "epoch": 0.25976347874622807, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 29785 + }, + { + "epoch": 0.2597722000313966, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 29786 + }, + { + "epoch": 0.2597809213165652, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 29787 + }, + { + "epoch": 0.2597896426017338, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 29788 + }, + { + "epoch": 0.25979836388690236, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 29789 + }, + { + "epoch": 0.25980708517207096, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 29790 + }, + { + "epoch": 0.25981580645723956, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 29791 + }, + { + "epoch": 0.2598245277424081, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 29792 + }, + { + "epoch": 0.2598332490275767, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 29793 + }, + { + "epoch": 0.2598419703127453, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 29794 + }, + { + "epoch": 0.25985069159791385, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 29795 + }, + { + "epoch": 0.25985941288308245, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 29796 + }, + { + "epoch": 0.25986813416825105, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 29797 + }, + { + "epoch": 0.2598768554534196, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 29798 + }, + { + "epoch": 0.2598855767385882, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 29799 + }, + { + "epoch": 0.2598942980237568, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 29800 + }, + { + "epoch": 0.25990301930892534, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 29801 + }, + { + "epoch": 0.25991174059409394, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 29802 + }, + { + "epoch": 0.25992046187926254, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 29803 + }, + { + "epoch": 0.2599291831644311, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 29804 + }, + { + "epoch": 0.2599379044495997, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 29805 + }, + { + "epoch": 0.2599466257347683, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 29806 + }, + { + "epoch": 0.25995534701993683, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 29807 + }, + { + "epoch": 0.25996406830510543, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 29808 + }, + { + "epoch": 0.25997278959027403, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 29809 + }, + { + "epoch": 0.2599815108754426, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 29810 + }, + { + "epoch": 0.2599902321606112, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 29811 + }, + { + "epoch": 0.2599989534457798, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 29812 + }, + { + "epoch": 0.2600076747309484, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 29813 + }, + { + "epoch": 0.2600163960161169, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 29814 + }, + { + "epoch": 0.2600251173012855, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 29815 + }, + { + "epoch": 0.2600338385864541, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 29816 + }, + { + "epoch": 0.26004255987162267, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 29817 + }, + { + "epoch": 0.26005128115679127, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 29818 + }, + { + "epoch": 0.26006000244195987, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 29819 + }, + { + "epoch": 0.2600687237271284, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 29820 + }, + { + "epoch": 0.260077445012297, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 29821 + }, + { + "epoch": 0.2600861662974656, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 29822 + }, + { + "epoch": 0.26009488758263416, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 29823 + }, + { + "epoch": 0.26010360886780276, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 29824 + }, + { + "epoch": 0.26011233015297136, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 29825 + }, + { + "epoch": 0.2601210514381399, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 29826 + }, + { + "epoch": 0.2601297727233085, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 29827 + }, + { + "epoch": 0.2601384940084771, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 29828 + }, + { + "epoch": 0.26014721529364565, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 29829 + }, + { + "epoch": 0.26015593657881425, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 29830 + }, + { + "epoch": 0.26016465786398285, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 29831 + }, + { + "epoch": 0.2601733791491514, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 29832 + }, + { + "epoch": 0.26018210043432, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 29833 + }, + { + "epoch": 0.2601908217194886, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 29834 + }, + { + "epoch": 0.26019954300465714, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 29835 + }, + { + "epoch": 0.26020826428982574, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 29836 + }, + { + "epoch": 0.26021698557499434, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 29837 + }, + { + "epoch": 0.2602257068601629, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 29838 + }, + { + "epoch": 0.2602344281453315, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 29839 + }, + { + "epoch": 0.2602431494305001, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 29840 + }, + { + "epoch": 0.2602518707156687, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 29841 + }, + { + "epoch": 0.26026059200083723, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 29842 + }, + { + "epoch": 0.26026931328600583, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 29843 + }, + { + "epoch": 0.26027803457117443, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 29844 + }, + { + "epoch": 0.260286755856343, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 29845 + }, + { + "epoch": 0.2602954771415116, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 29846 + }, + { + "epoch": 0.2603041984266802, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 29847 + }, + { + "epoch": 0.2603129197118487, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 29848 + }, + { + "epoch": 0.2603216409970173, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 29849 + }, + { + "epoch": 0.2603303622821859, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 29850 + }, + { + "epoch": 0.26033908356735447, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 29851 + }, + { + "epoch": 0.26034780485252307, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 29852 + }, + { + "epoch": 0.26035652613769167, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 29853 + }, + { + "epoch": 0.2603652474228602, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 29854 + }, + { + "epoch": 0.2603739687080288, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 29855 + }, + { + "epoch": 0.2603826899931974, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 29856 + }, + { + "epoch": 0.26039141127836596, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 29857 + }, + { + "epoch": 0.26040013256353456, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 29858 + }, + { + "epoch": 0.26040885384870316, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 29859 + }, + { + "epoch": 0.2604175751338717, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 29860 + }, + { + "epoch": 0.2604262964190403, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 29861 + }, + { + "epoch": 0.2604350177042089, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 29862 + }, + { + "epoch": 0.26044373898937745, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 29863 + }, + { + "epoch": 0.26045246027454605, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 29864 + }, + { + "epoch": 0.26046118155971465, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 29865 + }, + { + "epoch": 0.2604699028448832, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 29866 + }, + { + "epoch": 0.2604786241300518, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 29867 + }, + { + "epoch": 0.2604873454152204, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 29868 + }, + { + "epoch": 0.260496066700389, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 29869 + }, + { + "epoch": 0.26050478798555754, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 29870 + }, + { + "epoch": 0.26051350927072614, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 29871 + }, + { + "epoch": 0.26052223055589474, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 29872 + }, + { + "epoch": 0.2605309518410633, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 29873 + }, + { + "epoch": 0.2605396731262319, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 29874 + }, + { + "epoch": 0.2605483944114005, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 29875 + }, + { + "epoch": 0.26055711569656903, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 29876 + }, + { + "epoch": 0.26056583698173763, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 29877 + }, + { + "epoch": 0.26057455826690623, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 29878 + }, + { + "epoch": 0.2605832795520748, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 29879 + }, + { + "epoch": 0.2605920008372434, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 29880 + }, + { + "epoch": 0.260600722122412, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 29881 + }, + { + "epoch": 0.2606094434075805, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 29882 + }, + { + "epoch": 0.2606181646927491, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 29883 + }, + { + "epoch": 0.2606268859779177, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 29884 + }, + { + "epoch": 0.26063560726308627, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 29885 + }, + { + "epoch": 0.26064432854825487, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 29886 + }, + { + "epoch": 0.26065304983342347, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 29887 + }, + { + "epoch": 0.260661771118592, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 29888 + }, + { + "epoch": 0.2606704924037606, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 29889 + }, + { + "epoch": 0.2606792136889292, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 29890 + }, + { + "epoch": 0.26068793497409776, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 29891 + }, + { + "epoch": 0.26069665625926636, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 29892 + }, + { + "epoch": 0.26070537754443496, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 29893 + }, + { + "epoch": 0.26071409882960356, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 29894 + }, + { + "epoch": 0.2607228201147721, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 29895 + }, + { + "epoch": 0.2607315413999407, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 29896 + }, + { + "epoch": 0.2607402626851093, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 29897 + }, + { + "epoch": 0.26074898397027785, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 29898 + }, + { + "epoch": 0.26075770525544645, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 29899 + }, + { + "epoch": 0.26076642654061505, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 29900 + }, + { + "epoch": 0.2607751478257836, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 29901 + }, + { + "epoch": 0.2607838691109522, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9774, + "step": 29902 + }, + { + "epoch": 0.2607925903961208, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 29903 + }, + { + "epoch": 0.26080131168128934, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 29904 + }, + { + "epoch": 0.26081003296645794, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 29905 + }, + { + "epoch": 0.26081875425162654, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 29906 + }, + { + "epoch": 0.2608274755367951, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 29907 + }, + { + "epoch": 0.2608361968219637, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 29908 + }, + { + "epoch": 0.2608449181071323, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 29909 + }, + { + "epoch": 0.26085363939230083, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 29910 + }, + { + "epoch": 0.26086236067746943, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 29911 + }, + { + "epoch": 0.26087108196263803, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 29912 + }, + { + "epoch": 0.2608798032478066, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 29913 + }, + { + "epoch": 0.2608885245329752, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 29914 + }, + { + "epoch": 0.2608972458181438, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 29915 + }, + { + "epoch": 0.2609059671033123, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 29916 + }, + { + "epoch": 0.2609146883884809, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 29917 + }, + { + "epoch": 0.2609234096736495, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 29918 + }, + { + "epoch": 0.26093213095881806, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 29919 + }, + { + "epoch": 0.26094085224398667, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 29920 + }, + { + "epoch": 0.26094957352915527, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 29921 + }, + { + "epoch": 0.26095829481432387, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 29922 + }, + { + "epoch": 0.2609670160994924, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 29923 + }, + { + "epoch": 0.260975737384661, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 29924 + }, + { + "epoch": 0.2609844586698296, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 29925 + }, + { + "epoch": 0.26099317995499816, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 29926 + }, + { + "epoch": 0.26100190124016676, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 29927 + }, + { + "epoch": 0.26101062252533536, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 29928 + }, + { + "epoch": 0.2610193438105039, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 29929 + }, + { + "epoch": 0.2610280650956725, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 29930 + }, + { + "epoch": 0.2610367863808411, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 29931 + }, + { + "epoch": 0.26104550766600965, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 29932 + }, + { + "epoch": 0.26105422895117825, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 29933 + }, + { + "epoch": 0.26106295023634685, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 0.9792, + "step": 29934 + }, + { + "epoch": 0.2610716715215154, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 29935 + }, + { + "epoch": 0.261080392806684, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 29936 + }, + { + "epoch": 0.2610891140918526, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 29937 + }, + { + "epoch": 0.26109783537702114, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 29938 + }, + { + "epoch": 0.26110655666218974, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 29939 + }, + { + "epoch": 0.26111527794735834, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 29940 + }, + { + "epoch": 0.2611239992325269, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 29941 + }, + { + "epoch": 0.2611327205176955, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 29942 + }, + { + "epoch": 0.2611414418028641, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 29943 + }, + { + "epoch": 0.2611501630880326, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 29944 + }, + { + "epoch": 0.2611588843732012, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 29945 + }, + { + "epoch": 0.26116760565836983, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 29946 + }, + { + "epoch": 0.2611763269435384, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 29947 + }, + { + "epoch": 0.261185048228707, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 29948 + }, + { + "epoch": 0.2611937695138756, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 29949 + }, + { + "epoch": 0.2612024907990442, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 29950 + }, + { + "epoch": 0.2612112120842127, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 29951 + }, + { + "epoch": 0.2612199333693813, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 29952 + }, + { + "epoch": 0.2612286546545499, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 29953 + }, + { + "epoch": 0.26123737593971846, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 29954 + }, + { + "epoch": 0.26124609722488706, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 29955 + }, + { + "epoch": 0.26125481851005566, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 29956 + }, + { + "epoch": 0.2612635397952242, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 29957 + }, + { + "epoch": 0.2612722610803928, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 29958 + }, + { + "epoch": 0.2612809823655614, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 29959 + }, + { + "epoch": 0.26128970365072995, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 29960 + }, + { + "epoch": 0.26129842493589855, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 29961 + }, + { + "epoch": 0.26130714622106715, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 29962 + }, + { + "epoch": 0.2613158675062357, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 29963 + }, + { + "epoch": 0.2613245887914043, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 29964 + }, + { + "epoch": 0.2613333100765729, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 29965 + }, + { + "epoch": 0.26134203136174144, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 29966 + }, + { + "epoch": 0.26135075264691005, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 29967 + }, + { + "epoch": 0.26135947393207865, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 29968 + }, + { + "epoch": 0.2613681952172472, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 29969 + }, + { + "epoch": 0.2613769165024158, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 29970 + }, + { + "epoch": 0.2613856377875844, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 29971 + }, + { + "epoch": 0.26139435907275294, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 29972 + }, + { + "epoch": 0.26140308035792154, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 29973 + }, + { + "epoch": 0.26141180164309014, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 29974 + }, + { + "epoch": 0.2614205229282587, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 29975 + }, + { + "epoch": 0.2614292442134273, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 29976 + }, + { + "epoch": 0.2614379654985959, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 29977 + }, + { + "epoch": 0.2614466867837645, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 29978 + }, + { + "epoch": 0.261455408068933, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 29979 + }, + { + "epoch": 0.2614641293541016, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 29980 + }, + { + "epoch": 0.2614728506392702, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 29981 + }, + { + "epoch": 0.26148157192443877, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 29982 + }, + { + "epoch": 0.26149029320960737, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 29983 + }, + { + "epoch": 0.26149901449477597, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 29984 + }, + { + "epoch": 0.2615077357799445, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 29985 + }, + { + "epoch": 0.2615164570651131, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 29986 + }, + { + "epoch": 0.2615251783502817, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 29987 + }, + { + "epoch": 0.26153389963545026, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 29988 + }, + { + "epoch": 0.26154262092061886, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 29989 + }, + { + "epoch": 0.26155134220578746, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 29990 + }, + { + "epoch": 0.261560063490956, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 29991 + }, + { + "epoch": 0.2615687847761246, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 29992 + }, + { + "epoch": 0.2615775060612932, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 29993 + }, + { + "epoch": 0.26158622734646175, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 29994 + }, + { + "epoch": 0.26159494863163035, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 29995 + }, + { + "epoch": 0.26160366991679895, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 29996 + }, + { + "epoch": 0.2616123912019675, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 29997 + }, + { + "epoch": 0.2616211124871361, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 29998 + }, + { + "epoch": 0.2616298337723047, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 29999 + }, + { + "epoch": 0.26163855505747324, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 30000 + }, + { + "epoch": 0.26164727634264184, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 30001 + }, + { + "epoch": 0.26165599762781044, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 30002 + }, + { + "epoch": 0.26166471891297904, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 30003 + }, + { + "epoch": 0.2616734401981476, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 30004 + }, + { + "epoch": 0.2616821614833162, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 30005 + }, + { + "epoch": 0.2616908827684848, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 30006 + }, + { + "epoch": 0.26169960405365333, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 30007 + }, + { + "epoch": 0.26170832533882193, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 30008 + }, + { + "epoch": 0.26171704662399053, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 30009 + }, + { + "epoch": 0.2617257679091591, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 30010 + }, + { + "epoch": 0.2617344891943277, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 30011 + }, + { + "epoch": 0.2617432104794963, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 30012 + }, + { + "epoch": 0.2617519317646648, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 30013 + }, + { + "epoch": 0.2617606530498334, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 30014 + }, + { + "epoch": 0.261769374335002, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 30015 + }, + { + "epoch": 0.26177809562017057, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 30016 + }, + { + "epoch": 0.26178681690533917, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 30017 + }, + { + "epoch": 0.26179553819050777, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 30018 + }, + { + "epoch": 0.2618042594756763, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 30019 + }, + { + "epoch": 0.2618129807608449, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 30020 + }, + { + "epoch": 0.2618217020460135, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 30021 + }, + { + "epoch": 0.26183042333118206, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 30022 + }, + { + "epoch": 0.26183914461635066, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 30023 + }, + { + "epoch": 0.26184786590151926, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 30024 + }, + { + "epoch": 0.2618565871866878, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 30025 + }, + { + "epoch": 0.2618653084718564, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 30026 + }, + { + "epoch": 0.261874029757025, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 30027 + }, + { + "epoch": 0.26188275104219355, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 30028 + }, + { + "epoch": 0.26189147232736215, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 30029 + }, + { + "epoch": 0.26190019361253075, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 30030 + }, + { + "epoch": 0.26190891489769935, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 30031 + }, + { + "epoch": 0.2619176361828679, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 30032 + }, + { + "epoch": 0.2619263574680365, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 30033 + }, + { + "epoch": 0.2619350787532051, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 30034 + }, + { + "epoch": 0.26194380003837364, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 30035 + }, + { + "epoch": 0.26195252132354224, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 30036 + }, + { + "epoch": 0.26196124260871084, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 30037 + }, + { + "epoch": 0.2619699638938794, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 30038 + }, + { + "epoch": 0.261978685179048, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 30039 + }, + { + "epoch": 0.2619874064642166, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 30040 + }, + { + "epoch": 0.26199612774938513, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 30041 + }, + { + "epoch": 0.26200484903455373, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 30042 + }, + { + "epoch": 0.26201357031972233, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 30043 + }, + { + "epoch": 0.2620222916048909, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 30044 + }, + { + "epoch": 0.2620310128900595, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 30045 + }, + { + "epoch": 0.2620397341752281, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 30046 + }, + { + "epoch": 0.2620484554603966, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 30047 + }, + { + "epoch": 0.2620571767455652, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 30048 + }, + { + "epoch": 0.2620658980307338, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 30049 + }, + { + "epoch": 0.26207461931590237, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 30050 + }, + { + "epoch": 0.26208334060107097, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 30051 + }, + { + "epoch": 0.26209206188623957, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 30052 + }, + { + "epoch": 0.2621007831714081, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 30053 + }, + { + "epoch": 0.2621095044565767, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 30054 + }, + { + "epoch": 0.2621182257417453, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 30055 + }, + { + "epoch": 0.26212694702691386, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 30056 + }, + { + "epoch": 0.26213566831208246, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 30057 + }, + { + "epoch": 0.26214438959725106, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 30058 + }, + { + "epoch": 0.26215311088241966, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 30059 + }, + { + "epoch": 0.2621618321675882, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 30060 + }, + { + "epoch": 0.2621705534527568, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 30061 + }, + { + "epoch": 0.2621792747379254, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 30062 + }, + { + "epoch": 0.26218799602309395, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 30063 + }, + { + "epoch": 0.26219671730826255, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 30064 + }, + { + "epoch": 0.26220543859343115, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 30065 + }, + { + "epoch": 0.2622141598785997, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 30066 + }, + { + "epoch": 0.2622228811637683, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 30067 + }, + { + "epoch": 0.2622316024489369, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 30068 + }, + { + "epoch": 0.26224032373410544, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 30069 + }, + { + "epoch": 0.26224904501927404, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 30070 + }, + { + "epoch": 0.26225776630444264, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 30071 + }, + { + "epoch": 0.2622664875896112, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 30072 + }, + { + "epoch": 0.2622752088747798, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 30073 + }, + { + "epoch": 0.2622839301599484, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 30074 + }, + { + "epoch": 0.26229265144511693, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 30075 + }, + { + "epoch": 0.26230137273028553, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 30076 + }, + { + "epoch": 0.26231009401545413, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 30077 + }, + { + "epoch": 0.2623188153006227, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 30078 + }, + { + "epoch": 0.2623275365857913, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 30079 + }, + { + "epoch": 0.2623362578709599, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 30080 + }, + { + "epoch": 0.2623449791561284, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 30081 + }, + { + "epoch": 0.262353700441297, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 30082 + }, + { + "epoch": 0.2623624217264656, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 30083 + }, + { + "epoch": 0.26237114301163417, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 30084 + }, + { + "epoch": 0.26237986429680277, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 30085 + }, + { + "epoch": 0.26238858558197137, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 30086 + }, + { + "epoch": 0.26239730686713997, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 30087 + }, + { + "epoch": 0.2624060281523085, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 30088 + }, + { + "epoch": 0.2624147494374771, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 30089 + }, + { + "epoch": 0.2624234707226457, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 30090 + }, + { + "epoch": 0.26243219200781426, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 30091 + }, + { + "epoch": 0.26244091329298286, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 30092 + }, + { + "epoch": 0.26244963457815146, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 30093 + }, + { + "epoch": 0.26245835586332, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 30094 + }, + { + "epoch": 0.2624670771484886, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 30095 + }, + { + "epoch": 0.2624757984336572, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 30096 + }, + { + "epoch": 0.26248451971882575, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 30097 + }, + { + "epoch": 0.26249324100399435, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 30098 + }, + { + "epoch": 0.26250196228916295, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 30099 + }, + { + "epoch": 0.2625106835743315, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 30100 + }, + { + "epoch": 0.2625194048595001, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 30101 + }, + { + "epoch": 0.2625281261446687, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 30102 + }, + { + "epoch": 0.26253684742983724, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 30103 + }, + { + "epoch": 0.26254556871500584, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 30104 + }, + { + "epoch": 0.26255429000017444, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 30105 + }, + { + "epoch": 0.262563011285343, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 30106 + }, + { + "epoch": 0.2625717325705116, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 30107 + }, + { + "epoch": 0.2625804538556802, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 30108 + }, + { + "epoch": 0.26258917514084873, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 30109 + }, + { + "epoch": 0.26259789642601733, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 30110 + }, + { + "epoch": 0.26260661771118593, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 30111 + }, + { + "epoch": 0.26261533899635453, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 30112 + }, + { + "epoch": 0.2626240602815231, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 30113 + }, + { + "epoch": 0.2626327815666917, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 30114 + }, + { + "epoch": 0.2626415028518603, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 30115 + }, + { + "epoch": 0.2626502241370288, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 30116 + }, + { + "epoch": 0.2626589454221974, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 30117 + }, + { + "epoch": 0.262667666707366, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 30118 + }, + { + "epoch": 0.26267638799253457, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 30119 + }, + { + "epoch": 0.26268510927770317, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 30120 + }, + { + "epoch": 0.26269383056287177, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 30121 + }, + { + "epoch": 0.2627025518480403, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 30122 + }, + { + "epoch": 0.2627112731332089, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 30123 + }, + { + "epoch": 0.2627199944183775, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 30124 + }, + { + "epoch": 0.26272871570354606, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 30125 + }, + { + "epoch": 0.26273743698871466, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9843, + "step": 30126 + }, + { + "epoch": 0.26274615827388326, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 30127 + }, + { + "epoch": 0.2627548795590518, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 30128 + }, + { + "epoch": 0.2627636008442204, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 30129 + }, + { + "epoch": 0.262772322129389, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 30130 + }, + { + "epoch": 0.26278104341455755, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 30131 + }, + { + "epoch": 0.26278976469972615, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 30132 + }, + { + "epoch": 0.26279848598489475, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 30133 + }, + { + "epoch": 0.2628072072700633, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 30134 + }, + { + "epoch": 0.2628159285552319, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 30135 + }, + { + "epoch": 0.2628246498404005, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 30136 + }, + { + "epoch": 0.26283337112556904, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 30137 + }, + { + "epoch": 0.26284209241073764, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 30138 + }, + { + "epoch": 0.26285081369590624, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 30139 + }, + { + "epoch": 0.26285953498107484, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 30140 + }, + { + "epoch": 0.2628682562662434, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 30141 + }, + { + "epoch": 0.262876977551412, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 30142 + }, + { + "epoch": 0.2628856988365806, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 30143 + }, + { + "epoch": 0.26289442012174913, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 30144 + }, + { + "epoch": 0.26290314140691773, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 30145 + }, + { + "epoch": 0.26291186269208633, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 30146 + }, + { + "epoch": 0.2629205839772549, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 30147 + }, + { + "epoch": 0.2629293052624235, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 30148 + }, + { + "epoch": 0.2629380265475921, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 30149 + }, + { + "epoch": 0.2629467478327606, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 30150 + }, + { + "epoch": 0.2629554691179292, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 30151 + }, + { + "epoch": 0.2629641904030978, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 30152 + }, + { + "epoch": 0.26297291168826636, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 30153 + }, + { + "epoch": 0.26298163297343496, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 30154 + }, + { + "epoch": 0.26299035425860356, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 0.9857, + "step": 30155 + }, + { + "epoch": 0.2629990755437721, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 30156 + }, + { + "epoch": 0.2630077968289407, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 30157 + }, + { + "epoch": 0.2630165181141093, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 30158 + }, + { + "epoch": 0.26302523939927785, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9863, + "step": 30159 + }, + { + "epoch": 0.26303396068444646, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 30160 + }, + { + "epoch": 0.26304268196961506, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 30161 + }, + { + "epoch": 0.2630514032547836, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 30162 + }, + { + "epoch": 0.2630601245399522, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 30163 + }, + { + "epoch": 0.2630688458251208, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 30164 + }, + { + "epoch": 0.26307756711028935, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 30165 + }, + { + "epoch": 0.26308628839545795, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 30166 + }, + { + "epoch": 0.26309500968062655, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 30167 + }, + { + "epoch": 0.26310373096579515, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 30168 + }, + { + "epoch": 0.2631124522509637, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 30169 + }, + { + "epoch": 0.2631211735361323, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 30170 + }, + { + "epoch": 0.2631298948213009, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 30171 + }, + { + "epoch": 0.26313861610646944, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 30172 + }, + { + "epoch": 0.26314733739163804, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 30173 + }, + { + "epoch": 0.26315605867680664, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 30174 + }, + { + "epoch": 0.2631647799619752, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 30175 + }, + { + "epoch": 0.2631735012471438, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 30176 + }, + { + "epoch": 0.2631822225323124, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 30177 + }, + { + "epoch": 0.2631909438174809, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 30178 + }, + { + "epoch": 0.2631996651026495, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 30179 + }, + { + "epoch": 0.2632083863878181, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 30180 + }, + { + "epoch": 0.26321710767298667, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 30181 + }, + { + "epoch": 0.2632258289581553, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 30182 + }, + { + "epoch": 0.2632345502433239, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 30183 + }, + { + "epoch": 0.2632432715284924, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 30184 + }, + { + "epoch": 0.263251992813661, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 30185 + }, + { + "epoch": 0.2632607140988296, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 30186 + }, + { + "epoch": 0.26326943538399816, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 30187 + }, + { + "epoch": 0.26327815666916676, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 30188 + }, + { + "epoch": 0.26328687795433536, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 30189 + }, + { + "epoch": 0.2632955992395039, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 30190 + }, + { + "epoch": 0.2633043205246725, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 30191 + }, + { + "epoch": 0.2633130418098411, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 30192 + }, + { + "epoch": 0.26332176309500965, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 30193 + }, + { + "epoch": 0.26333048438017825, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 30194 + }, + { + "epoch": 0.26333920566534685, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 30195 + }, + { + "epoch": 0.26334792695051545, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 30196 + }, + { + "epoch": 0.263356648235684, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 30197 + }, + { + "epoch": 0.2633653695208526, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 30198 + }, + { + "epoch": 0.2633740908060212, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 30199 + }, + { + "epoch": 0.26338281209118974, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 30200 + }, + { + "epoch": 0.26339153337635834, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 30201 + }, + { + "epoch": 0.26340025466152694, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 30202 + }, + { + "epoch": 0.2634089759466955, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 30203 + }, + { + "epoch": 0.2634176972318641, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 30204 + }, + { + "epoch": 0.2634264185170327, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 30205 + }, + { + "epoch": 0.26343513980220123, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 30206 + }, + { + "epoch": 0.26344386108736983, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 30207 + }, + { + "epoch": 0.26345258237253844, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 30208 + }, + { + "epoch": 0.263461303657707, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 30209 + }, + { + "epoch": 0.2634700249428756, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 30210 + }, + { + "epoch": 0.2634787462280442, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 30211 + }, + { + "epoch": 0.2634874675132127, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 30212 + }, + { + "epoch": 0.2634961887983813, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 30213 + }, + { + "epoch": 0.2635049100835499, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 30214 + }, + { + "epoch": 0.26351363136871847, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 30215 + }, + { + "epoch": 0.26352235265388707, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 30216 + }, + { + "epoch": 0.26353107393905567, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 30217 + }, + { + "epoch": 0.2635397952242242, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 30218 + }, + { + "epoch": 0.2635485165093928, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 30219 + }, + { + "epoch": 0.2635572377945614, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 30220 + }, + { + "epoch": 0.26356595907973, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 30221 + }, + { + "epoch": 0.26357468036489856, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 30222 + }, + { + "epoch": 0.26358340165006716, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 30223 + }, + { + "epoch": 0.26359212293523576, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 30224 + }, + { + "epoch": 0.2636008442204043, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 30225 + }, + { + "epoch": 0.2636095655055729, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 30226 + }, + { + "epoch": 0.2636182867907415, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 30227 + }, + { + "epoch": 0.26362700807591005, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 30228 + }, + { + "epoch": 0.26363572936107865, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 30229 + }, + { + "epoch": 0.26364445064624725, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 30230 + }, + { + "epoch": 0.2636531719314158, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9789, + "step": 30231 + }, + { + "epoch": 0.2636618932165844, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 30232 + }, + { + "epoch": 0.263670614501753, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 30233 + }, + { + "epoch": 0.26367933578692154, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 30234 + }, + { + "epoch": 0.26368805707209014, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 30235 + }, + { + "epoch": 0.26369677835725874, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 30236 + }, + { + "epoch": 0.2637054996424273, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 30237 + }, + { + "epoch": 0.2637142209275959, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 30238 + }, + { + "epoch": 0.2637229422127645, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 30239 + }, + { + "epoch": 0.26373166349793303, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 30240 + }, + { + "epoch": 0.26374038478310163, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 30241 + }, + { + "epoch": 0.26374910606827023, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 30242 + }, + { + "epoch": 0.2637578273534388, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 30243 + }, + { + "epoch": 0.2637665486386074, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 30244 + }, + { + "epoch": 0.263775269923776, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 30245 + }, + { + "epoch": 0.2637839912089445, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 30246 + }, + { + "epoch": 0.2637927124941131, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 30247 + }, + { + "epoch": 0.2638014337792817, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 30248 + }, + { + "epoch": 0.2638101550644503, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 30249 + }, + { + "epoch": 0.26381887634961887, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 30250 + }, + { + "epoch": 0.26382759763478747, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 30251 + }, + { + "epoch": 0.26383631891995607, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 30252 + }, + { + "epoch": 0.2638450402051246, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 30253 + }, + { + "epoch": 0.2638537614902932, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 30254 + }, + { + "epoch": 0.2638624827754618, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 30255 + }, + { + "epoch": 0.26387120406063036, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 30256 + }, + { + "epoch": 0.26387992534579896, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 30257 + }, + { + "epoch": 0.26388864663096756, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 30258 + }, + { + "epoch": 0.2638973679161361, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 30259 + }, + { + "epoch": 0.2639060892013047, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 30260 + }, + { + "epoch": 0.2639148104864733, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 30261 + }, + { + "epoch": 0.26392353177164185, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 30262 + }, + { + "epoch": 0.26393225305681045, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 30263 + }, + { + "epoch": 0.26394097434197905, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 30264 + }, + { + "epoch": 0.2639496956271476, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 30265 + }, + { + "epoch": 0.2639584169123162, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 30266 + }, + { + "epoch": 0.2639671381974848, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 30267 + }, + { + "epoch": 0.26397585948265334, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 30268 + }, + { + "epoch": 0.26398458076782194, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 30269 + }, + { + "epoch": 0.26399330205299054, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 30270 + }, + { + "epoch": 0.2640020233381591, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 30271 + }, + { + "epoch": 0.2640107446233277, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 30272 + }, + { + "epoch": 0.2640194659084963, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 30273 + }, + { + "epoch": 0.26402818719366483, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 30274 + }, + { + "epoch": 0.26403690847883343, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 30275 + }, + { + "epoch": 0.26404562976400203, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 30276 + }, + { + "epoch": 0.26405435104917063, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 30277 + }, + { + "epoch": 0.2640630723343392, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 30278 + }, + { + "epoch": 0.2640717936195078, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 30279 + }, + { + "epoch": 0.2640805149046764, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 30280 + }, + { + "epoch": 0.2640892361898449, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 30281 + }, + { + "epoch": 0.2640979574750135, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 30282 + }, + { + "epoch": 0.2641066787601821, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 30283 + }, + { + "epoch": 0.26411540004535067, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 30284 + }, + { + "epoch": 0.26412412133051927, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 30285 + }, + { + "epoch": 0.26413284261568787, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 30286 + }, + { + "epoch": 0.2641415639008564, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 30287 + }, + { + "epoch": 0.264150285186025, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 30288 + }, + { + "epoch": 0.2641590064711936, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 30289 + }, + { + "epoch": 0.26416772775636216, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 30290 + }, + { + "epoch": 0.26417644904153076, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 30291 + }, + { + "epoch": 0.26418517032669936, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 30292 + }, + { + "epoch": 0.2641938916118679, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 30293 + }, + { + "epoch": 0.2642026128970365, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 30294 + }, + { + "epoch": 0.2642113341822051, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 30295 + }, + { + "epoch": 0.26422005546737365, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 30296 + }, + { + "epoch": 0.26422877675254225, + "grad_norm": 0.431640625, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 30297 + }, + { + "epoch": 0.26423749803771085, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 30298 + }, + { + "epoch": 0.2642462193228794, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 30299 + }, + { + "epoch": 0.264254940608048, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 30300 + }, + { + "epoch": 0.2642636618932166, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 30301 + }, + { + "epoch": 0.26427238317838514, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 30302 + }, + { + "epoch": 0.26428110446355374, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 30303 + }, + { + "epoch": 0.26428982574872234, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 30304 + }, + { + "epoch": 0.26429854703389094, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 30305 + }, + { + "epoch": 0.2643072683190595, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 30306 + }, + { + "epoch": 0.2643159896042281, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 30307 + }, + { + "epoch": 0.2643247108893967, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 30308 + }, + { + "epoch": 0.26433343217456523, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 30309 + }, + { + "epoch": 0.26434215345973383, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 30310 + }, + { + "epoch": 0.26435087474490243, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 30311 + }, + { + "epoch": 0.264359596030071, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 30312 + }, + { + "epoch": 0.2643683173152396, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 30313 + }, + { + "epoch": 0.2643770386004082, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 30314 + }, + { + "epoch": 0.2643857598855767, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 30315 + }, + { + "epoch": 0.2643944811707453, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 30316 + }, + { + "epoch": 0.2644032024559139, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 30317 + }, + { + "epoch": 0.26441192374108247, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 30318 + }, + { + "epoch": 0.26442064502625107, + "grad_norm": 0.41015625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 30319 + }, + { + "epoch": 0.26442936631141967, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 30320 + }, + { + "epoch": 0.2644380875965882, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 30321 + }, + { + "epoch": 0.2644468088817568, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 30322 + }, + { + "epoch": 0.2644555301669254, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 30323 + }, + { + "epoch": 0.26446425145209396, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 30324 + }, + { + "epoch": 0.26447297273726256, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 30325 + }, + { + "epoch": 0.26448169402243116, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 30326 + }, + { + "epoch": 0.2644904153075997, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 30327 + }, + { + "epoch": 0.2644991365927683, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 30328 + }, + { + "epoch": 0.2645078578779369, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 30329 + }, + { + "epoch": 0.26451657916310545, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 30330 + }, + { + "epoch": 0.26452530044827405, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 30331 + }, + { + "epoch": 0.26453402173344265, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 30332 + }, + { + "epoch": 0.26454274301861125, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 30333 + }, + { + "epoch": 0.2645514643037798, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 30334 + }, + { + "epoch": 0.2645601855889484, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 30335 + }, + { + "epoch": 0.264568906874117, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 30336 + }, + { + "epoch": 0.26457762815928554, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 30337 + }, + { + "epoch": 0.26458634944445414, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 30338 + }, + { + "epoch": 0.26459507072962274, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 30339 + }, + { + "epoch": 0.2646037920147913, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 30340 + }, + { + "epoch": 0.2646125132999599, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 30341 + }, + { + "epoch": 0.2646212345851285, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 30342 + }, + { + "epoch": 0.26462995587029703, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 30343 + }, + { + "epoch": 0.26463867715546563, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 30344 + }, + { + "epoch": 0.26464739844063423, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 30345 + }, + { + "epoch": 0.2646561197258028, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 30346 + }, + { + "epoch": 0.2646648410109714, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 30347 + }, + { + "epoch": 0.26467356229614, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 30348 + }, + { + "epoch": 0.2646822835813085, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 30349 + }, + { + "epoch": 0.2646910048664771, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 30350 + }, + { + "epoch": 0.2646997261516457, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 30351 + }, + { + "epoch": 0.26470844743681426, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 30352 + }, + { + "epoch": 0.26471716872198287, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 30353 + }, + { + "epoch": 0.26472589000715147, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 30354 + }, + { + "epoch": 0.26473461129232, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 30355 + }, + { + "epoch": 0.2647433325774886, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 30356 + }, + { + "epoch": 0.2647520538626572, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 30357 + }, + { + "epoch": 0.2647607751478258, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 30358 + }, + { + "epoch": 0.26476949643299436, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 30359 + }, + { + "epoch": 0.26477821771816296, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 30360 + }, + { + "epoch": 0.26478693900333156, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 30361 + }, + { + "epoch": 0.2647956602885001, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 30362 + }, + { + "epoch": 0.2648043815736687, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 30363 + }, + { + "epoch": 0.2648131028588373, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 30364 + }, + { + "epoch": 0.26482182414400585, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 30365 + }, + { + "epoch": 0.26483054542917445, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 30366 + }, + { + "epoch": 0.26483926671434305, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 30367 + }, + { + "epoch": 0.2648479879995116, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 30368 + }, + { + "epoch": 0.2648567092846802, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 30369 + }, + { + "epoch": 0.2648654305698488, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 30370 + }, + { + "epoch": 0.26487415185501734, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 30371 + }, + { + "epoch": 0.26488287314018594, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 30372 + }, + { + "epoch": 0.26489159442535454, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 30373 + }, + { + "epoch": 0.2649003157105231, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 30374 + }, + { + "epoch": 0.2649090369956917, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 30375 + }, + { + "epoch": 0.2649177582808603, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 30376 + }, + { + "epoch": 0.2649264795660288, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 30377 + }, + { + "epoch": 0.2649352008511974, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9755, + "step": 30378 + }, + { + "epoch": 0.26494392213636603, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 30379 + }, + { + "epoch": 0.2649526434215346, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 30380 + }, + { + "epoch": 0.2649613647067032, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 30381 + }, + { + "epoch": 0.2649700859918718, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 30382 + }, + { + "epoch": 0.2649788072770403, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 30383 + }, + { + "epoch": 0.2649875285622089, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 30384 + }, + { + "epoch": 0.2649962498473775, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 30385 + }, + { + "epoch": 0.2650049711325461, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 30386 + }, + { + "epoch": 0.26501369241771466, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 30387 + }, + { + "epoch": 0.26502241370288326, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 30388 + }, + { + "epoch": 0.26503113498805186, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 30389 + }, + { + "epoch": 0.2650398562732204, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 30390 + }, + { + "epoch": 0.265048577558389, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 30391 + }, + { + "epoch": 0.2650572988435576, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 30392 + }, + { + "epoch": 0.26506602012872615, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 30393 + }, + { + "epoch": 0.26507474141389475, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 30394 + }, + { + "epoch": 0.26508346269906335, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 30395 + }, + { + "epoch": 0.2650921839842319, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 30396 + }, + { + "epoch": 0.2651009052694005, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 30397 + }, + { + "epoch": 0.2651096265545691, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 30398 + }, + { + "epoch": 0.26511834783973764, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 30399 + }, + { + "epoch": 0.26512706912490625, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 30400 + }, + { + "epoch": 0.26513579041007485, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 30401 + }, + { + "epoch": 0.2651445116952434, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 30402 + }, + { + "epoch": 0.265153232980412, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 30403 + }, + { + "epoch": 0.2651619542655806, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 30404 + }, + { + "epoch": 0.26517067555074914, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 30405 + }, + { + "epoch": 0.26517939683591774, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 30406 + }, + { + "epoch": 0.26518811812108634, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 30407 + }, + { + "epoch": 0.2651968394062549, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 30408 + }, + { + "epoch": 0.2652055606914235, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 30409 + }, + { + "epoch": 0.2652142819765921, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 30410 + }, + { + "epoch": 0.2652230032617606, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 30411 + }, + { + "epoch": 0.2652317245469292, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 30412 + }, + { + "epoch": 0.2652404458320978, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 30413 + }, + { + "epoch": 0.2652491671172664, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 30414 + }, + { + "epoch": 0.26525788840243497, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 30415 + }, + { + "epoch": 0.26526660968760357, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 30416 + }, + { + "epoch": 0.26527533097277217, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 30417 + }, + { + "epoch": 0.2652840522579407, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 30418 + }, + { + "epoch": 0.2652927735431093, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 30419 + }, + { + "epoch": 0.2653014948282779, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 30420 + }, + { + "epoch": 0.26531021611344646, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 30421 + }, + { + "epoch": 0.26531893739861506, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 30422 + }, + { + "epoch": 0.26532765868378366, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 30423 + }, + { + "epoch": 0.2653363799689522, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 30424 + }, + { + "epoch": 0.2653451012541208, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 30425 + }, + { + "epoch": 0.2653538225392894, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 30426 + }, + { + "epoch": 0.26536254382445795, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 30427 + }, + { + "epoch": 0.26537126510962655, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 30428 + }, + { + "epoch": 0.26537998639479515, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 30429 + }, + { + "epoch": 0.2653887076799637, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 30430 + }, + { + "epoch": 0.2653974289651323, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 30431 + }, + { + "epoch": 0.2654061502503009, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 30432 + }, + { + "epoch": 0.26541487153546944, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 30433 + }, + { + "epoch": 0.26542359282063804, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 30434 + }, + { + "epoch": 0.26543231410580664, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 30435 + }, + { + "epoch": 0.2654410353909752, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 30436 + }, + { + "epoch": 0.2654497566761438, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 30437 + }, + { + "epoch": 0.2654584779613124, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 30438 + }, + { + "epoch": 0.26546719924648093, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 30439 + }, + { + "epoch": 0.26547592053164953, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9765, + "step": 30440 + }, + { + "epoch": 0.26548464181681813, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 30441 + }, + { + "epoch": 0.26549336310198673, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 30442 + }, + { + "epoch": 0.2655020843871553, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 30443 + }, + { + "epoch": 0.2655108056723239, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 30444 + }, + { + "epoch": 0.2655195269574925, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 30445 + }, + { + "epoch": 0.265528248242661, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 30446 + }, + { + "epoch": 0.2655369695278296, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 30447 + }, + { + "epoch": 0.2655456908129982, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 30448 + }, + { + "epoch": 0.26555441209816677, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 30449 + }, + { + "epoch": 0.26556313338333537, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 30450 + }, + { + "epoch": 0.26557185466850397, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 30451 + }, + { + "epoch": 0.2655805759536725, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 30452 + }, + { + "epoch": 0.2655892972388411, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 30453 + }, + { + "epoch": 0.2655980185240097, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 30454 + }, + { + "epoch": 0.26560673980917826, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 30455 + }, + { + "epoch": 0.26561546109434686, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 30456 + }, + { + "epoch": 0.26562418237951546, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 30457 + }, + { + "epoch": 0.265632903664684, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 30458 + }, + { + "epoch": 0.2656416249498526, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 30459 + }, + { + "epoch": 0.2656503462350212, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 30460 + }, + { + "epoch": 0.26565906752018975, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 30461 + }, + { + "epoch": 0.26566778880535835, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 30462 + }, + { + "epoch": 0.26567651009052695, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 30463 + }, + { + "epoch": 0.2656852313756955, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 30464 + }, + { + "epoch": 0.2656939526608641, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 30465 + }, + { + "epoch": 0.2657026739460327, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 30466 + }, + { + "epoch": 0.2657113952312013, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 30467 + }, + { + "epoch": 0.26572011651636984, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 30468 + }, + { + "epoch": 0.26572883780153844, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 30469 + }, + { + "epoch": 0.26573755908670704, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 30470 + }, + { + "epoch": 0.2657462803718756, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 30471 + }, + { + "epoch": 0.2657550016570442, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 30472 + }, + { + "epoch": 0.2657637229422128, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 30473 + }, + { + "epoch": 0.26577244422738133, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 30474 + }, + { + "epoch": 0.26578116551254993, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 30475 + }, + { + "epoch": 0.26578988679771853, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 30476 + }, + { + "epoch": 0.2657986080828871, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 30477 + }, + { + "epoch": 0.2658073293680557, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 30478 + }, + { + "epoch": 0.2658160506532243, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 30479 + }, + { + "epoch": 0.2658247719383928, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 30480 + }, + { + "epoch": 0.2658334932235614, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 30481 + }, + { + "epoch": 0.26584221450873, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 30482 + }, + { + "epoch": 0.26585093579389857, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 30483 + }, + { + "epoch": 0.26585965707906717, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 30484 + }, + { + "epoch": 0.26586837836423577, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 30485 + }, + { + "epoch": 0.2658770996494043, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 30486 + }, + { + "epoch": 0.2658858209345729, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 30487 + }, + { + "epoch": 0.2658945422197415, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 30488 + }, + { + "epoch": 0.26590326350491006, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 30489 + }, + { + "epoch": 0.26591198479007866, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 30490 + }, + { + "epoch": 0.26592070607524726, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 30491 + }, + { + "epoch": 0.2659294273604158, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 30492 + }, + { + "epoch": 0.2659381486455844, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 30493 + }, + { + "epoch": 0.265946869930753, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 30494 + }, + { + "epoch": 0.2659555912159216, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 30495 + }, + { + "epoch": 0.26596431250109015, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 30496 + }, + { + "epoch": 0.26597303378625875, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 30497 + }, + { + "epoch": 0.26598175507142735, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 30498 + }, + { + "epoch": 0.2659904763565959, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 30499 + }, + { + "epoch": 0.2659991976417645, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 30500 + }, + { + "epoch": 0.2660079189269331, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 30501 + }, + { + "epoch": 0.26601664021210164, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 30502 + }, + { + "epoch": 0.26602536149727024, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 30503 + }, + { + "epoch": 0.26603408278243884, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 30504 + }, + { + "epoch": 0.2660428040676074, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 30505 + }, + { + "epoch": 0.266051525352776, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 30506 + }, + { + "epoch": 0.2660602466379446, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 30507 + }, + { + "epoch": 0.26606896792311313, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 30508 + }, + { + "epoch": 0.26607768920828173, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 30509 + }, + { + "epoch": 0.26608641049345033, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 30510 + }, + { + "epoch": 0.2660951317786189, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 30511 + }, + { + "epoch": 0.2661038530637875, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 30512 + }, + { + "epoch": 0.2661125743489561, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 30513 + }, + { + "epoch": 0.2661212956341246, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 30514 + }, + { + "epoch": 0.2661300169192932, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 30515 + }, + { + "epoch": 0.2661387382044618, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 30516 + }, + { + "epoch": 0.26614745948963037, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 30517 + }, + { + "epoch": 0.26615618077479897, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 30518 + }, + { + "epoch": 0.26616490205996757, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 30519 + }, + { + "epoch": 0.2661736233451361, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 30520 + }, + { + "epoch": 0.2661823446303047, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 30521 + }, + { + "epoch": 0.2661910659154733, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 30522 + }, + { + "epoch": 0.2661997872006419, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 30523 + }, + { + "epoch": 0.26620850848581046, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 30524 + }, + { + "epoch": 0.26621722977097906, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 30525 + }, + { + "epoch": 0.26622595105614766, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 30526 + }, + { + "epoch": 0.2662346723413162, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 30527 + }, + { + "epoch": 0.2662433936264848, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 30528 + }, + { + "epoch": 0.2662521149116534, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 30529 + }, + { + "epoch": 0.26626083619682195, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 30530 + }, + { + "epoch": 0.26626955748199055, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 30531 + }, + { + "epoch": 0.26627827876715915, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 30532 + }, + { + "epoch": 0.2662870000523277, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 30533 + }, + { + "epoch": 0.2662957213374963, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 30534 + }, + { + "epoch": 0.2663044426226649, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 30535 + }, + { + "epoch": 0.26631316390783344, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 30536 + }, + { + "epoch": 0.26632188519300204, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 30537 + }, + { + "epoch": 0.26633060647817064, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 30538 + }, + { + "epoch": 0.2663393277633392, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 30539 + }, + { + "epoch": 0.2663480490485078, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 30540 + }, + { + "epoch": 0.2663567703336764, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 30541 + }, + { + "epoch": 0.26636549161884493, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 30542 + }, + { + "epoch": 0.26637421290401353, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 30543 + }, + { + "epoch": 0.26638293418918213, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 30544 + }, + { + "epoch": 0.2663916554743507, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 30545 + }, + { + "epoch": 0.2664003767595193, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 30546 + }, + { + "epoch": 0.2664090980446879, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 30547 + }, + { + "epoch": 0.2664178193298564, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 30548 + }, + { + "epoch": 0.266426540615025, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 30549 + }, + { + "epoch": 0.2664352619001936, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 30550 + }, + { + "epoch": 0.2664439831853622, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 30551 + }, + { + "epoch": 0.26645270447053077, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 30552 + }, + { + "epoch": 0.26646142575569937, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 30553 + }, + { + "epoch": 0.26647014704086797, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 30554 + }, + { + "epoch": 0.2664788683260365, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 30555 + }, + { + "epoch": 0.2664875896112051, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 30556 + }, + { + "epoch": 0.2664963108963737, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 30557 + }, + { + "epoch": 0.26650503218154226, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 30558 + }, + { + "epoch": 0.26651375346671086, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 30559 + }, + { + "epoch": 0.26652247475187946, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 30560 + }, + { + "epoch": 0.266531196037048, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 30561 + }, + { + "epoch": 0.2665399173222166, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 30562 + }, + { + "epoch": 0.2665486386073852, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 30563 + }, + { + "epoch": 0.26655735989255375, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 30564 + }, + { + "epoch": 0.26656608117772235, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 30565 + }, + { + "epoch": 0.26657480246289095, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 30566 + }, + { + "epoch": 0.2665835237480595, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 30567 + }, + { + "epoch": 0.2665922450332281, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 30568 + }, + { + "epoch": 0.2666009663183967, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 30569 + }, + { + "epoch": 0.26660968760356524, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 30570 + }, + { + "epoch": 0.26661840888873384, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 30571 + }, + { + "epoch": 0.26662713017390244, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 30572 + }, + { + "epoch": 0.266635851459071, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 30573 + }, + { + "epoch": 0.2666445727442396, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 30574 + }, + { + "epoch": 0.2666532940294082, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 30575 + }, + { + "epoch": 0.2666620153145768, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 30576 + }, + { + "epoch": 0.26667073659974533, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 30577 + }, + { + "epoch": 0.26667945788491393, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 30578 + }, + { + "epoch": 0.26668817917008253, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 30579 + }, + { + "epoch": 0.2666969004552511, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 30580 + }, + { + "epoch": 0.2667056217404197, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 30581 + }, + { + "epoch": 0.2667143430255883, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 30582 + }, + { + "epoch": 0.2667230643107568, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 30583 + }, + { + "epoch": 0.2667317855959254, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 30584 + }, + { + "epoch": 0.266740506881094, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 30585 + }, + { + "epoch": 0.26674922816626256, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 30586 + }, + { + "epoch": 0.26675794945143116, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 30587 + }, + { + "epoch": 0.26676667073659976, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 30588 + }, + { + "epoch": 0.2667753920217683, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 30589 + }, + { + "epoch": 0.2667841133069369, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 30590 + }, + { + "epoch": 0.2667928345921055, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 30591 + }, + { + "epoch": 0.26680155587727405, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 30592 + }, + { + "epoch": 0.26681027716244266, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 30593 + }, + { + "epoch": 0.26681899844761126, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 30594 + }, + { + "epoch": 0.2668277197327798, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 30595 + }, + { + "epoch": 0.2668364410179484, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 30596 + }, + { + "epoch": 0.266845162303117, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 30597 + }, + { + "epoch": 0.26685388358828555, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 30598 + }, + { + "epoch": 0.26686260487345415, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 30599 + }, + { + "epoch": 0.26687132615862275, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9799, + "step": 30600 + }, + { + "epoch": 0.2668800474437913, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 30601 + }, + { + "epoch": 0.2668887687289599, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 30602 + }, + { + "epoch": 0.2668974900141285, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 30603 + }, + { + "epoch": 0.2669062112992971, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 30604 + }, + { + "epoch": 0.26691493258446564, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 30605 + }, + { + "epoch": 0.26692365386963424, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 30606 + }, + { + "epoch": 0.26693237515480284, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 30607 + }, + { + "epoch": 0.2669410964399714, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 30608 + }, + { + "epoch": 0.26694981772514, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 30609 + }, + { + "epoch": 0.2669585390103086, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 30610 + }, + { + "epoch": 0.2669672602954771, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 30611 + }, + { + "epoch": 0.2669759815806457, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 30612 + }, + { + "epoch": 0.2669847028658143, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 30613 + }, + { + "epoch": 0.26699342415098287, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 30614 + }, + { + "epoch": 0.2670021454361515, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 30615 + }, + { + "epoch": 0.2670108667213201, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 30616 + }, + { + "epoch": 0.2670195880064886, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 30617 + }, + { + "epoch": 0.2670283092916572, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 30618 + }, + { + "epoch": 0.2670370305768258, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 30619 + }, + { + "epoch": 0.26704575186199436, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 30620 + }, + { + "epoch": 0.26705447314716296, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 30621 + }, + { + "epoch": 0.26706319443233156, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 30622 + }, + { + "epoch": 0.2670719157175001, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 30623 + }, + { + "epoch": 0.2670806370026687, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 30624 + }, + { + "epoch": 0.2670893582878373, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 30625 + }, + { + "epoch": 0.26709807957300585, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 30626 + }, + { + "epoch": 0.26710680085817445, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 30627 + }, + { + "epoch": 0.26711552214334305, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 30628 + }, + { + "epoch": 0.2671242434285116, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 30629 + }, + { + "epoch": 0.2671329647136802, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 30630 + }, + { + "epoch": 0.2671416859988488, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 30631 + }, + { + "epoch": 0.2671504072840174, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 30632 + }, + { + "epoch": 0.26715912856918594, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 30633 + }, + { + "epoch": 0.26716784985435454, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 30634 + }, + { + "epoch": 0.26717657113952314, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 30635 + }, + { + "epoch": 0.2671852924246917, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 30636 + }, + { + "epoch": 0.2671940137098603, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 30637 + }, + { + "epoch": 0.2672027349950289, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 30638 + }, + { + "epoch": 0.26721145628019743, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 30639 + }, + { + "epoch": 0.26722017756536603, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 30640 + }, + { + "epoch": 0.26722889885053464, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.982, + "step": 30641 + }, + { + "epoch": 0.2672376201357032, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 30642 + }, + { + "epoch": 0.2672463414208718, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 30643 + }, + { + "epoch": 0.2672550627060404, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 30644 + }, + { + "epoch": 0.2672637839912089, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 30645 + }, + { + "epoch": 0.2672725052763775, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 30646 + }, + { + "epoch": 0.2672812265615461, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 30647 + }, + { + "epoch": 0.26728994784671467, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 30648 + }, + { + "epoch": 0.26729866913188327, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 30649 + }, + { + "epoch": 0.26730739041705187, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 30650 + }, + { + "epoch": 0.2673161117022204, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 30651 + }, + { + "epoch": 0.267324832987389, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 30652 + }, + { + "epoch": 0.2673335542725576, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 30653 + }, + { + "epoch": 0.26734227555772616, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 30654 + }, + { + "epoch": 0.26735099684289476, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 30655 + }, + { + "epoch": 0.26735971812806336, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 30656 + }, + { + "epoch": 0.2673684394132319, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 30657 + }, + { + "epoch": 0.2673771606984005, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 30658 + }, + { + "epoch": 0.2673858819835691, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 30659 + }, + { + "epoch": 0.2673946032687377, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 30660 + }, + { + "epoch": 0.26740332455390625, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 30661 + }, + { + "epoch": 0.26741204583907485, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 30662 + }, + { + "epoch": 0.26742076712424345, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9873, + "step": 30663 + }, + { + "epoch": 0.267429488409412, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 30664 + }, + { + "epoch": 0.2674382096945806, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 30665 + }, + { + "epoch": 0.2674469309797492, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 30666 + }, + { + "epoch": 0.26745565226491774, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 30667 + }, + { + "epoch": 0.26746437355008634, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 30668 + }, + { + "epoch": 0.26747309483525494, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 30669 + }, + { + "epoch": 0.2674818161204235, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 30670 + }, + { + "epoch": 0.2674905374055921, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 30671 + }, + { + "epoch": 0.2674992586907607, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 30672 + }, + { + "epoch": 0.26750797997592923, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 30673 + }, + { + "epoch": 0.26751670126109783, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 30674 + }, + { + "epoch": 0.26752542254626643, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 30675 + }, + { + "epoch": 0.267534143831435, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 30676 + }, + { + "epoch": 0.2675428651166036, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 30677 + }, + { + "epoch": 0.2675515864017722, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 30678 + }, + { + "epoch": 0.2675603076869407, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 30679 + }, + { + "epoch": 0.2675690289721093, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 30680 + }, + { + "epoch": 0.2675777502572779, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 30681 + }, + { + "epoch": 0.26758647154244647, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 30682 + }, + { + "epoch": 0.26759519282761507, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 30683 + }, + { + "epoch": 0.26760391411278367, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 30684 + }, + { + "epoch": 0.26761263539795227, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 30685 + }, + { + "epoch": 0.2676213566831208, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 30686 + }, + { + "epoch": 0.2676300779682894, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 30687 + }, + { + "epoch": 0.267638799253458, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 30688 + }, + { + "epoch": 0.26764752053862656, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 30689 + }, + { + "epoch": 0.26765624182379516, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 30690 + }, + { + "epoch": 0.26766496310896376, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 30691 + }, + { + "epoch": 0.2676736843941323, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 30692 + }, + { + "epoch": 0.2676824056793009, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 30693 + }, + { + "epoch": 0.2676911269644695, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 30694 + }, + { + "epoch": 0.26769984824963805, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 30695 + }, + { + "epoch": 0.26770856953480665, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 30696 + }, + { + "epoch": 0.26771729081997525, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 30697 + }, + { + "epoch": 0.2677260121051438, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 30698 + }, + { + "epoch": 0.2677347333903124, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 30699 + }, + { + "epoch": 0.267743454675481, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 30700 + }, + { + "epoch": 0.26775217596064954, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 30701 + }, + { + "epoch": 0.26776089724581814, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 30702 + }, + { + "epoch": 0.26776961853098674, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 30703 + }, + { + "epoch": 0.2677783398161553, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 30704 + }, + { + "epoch": 0.2677870611013239, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 30705 + }, + { + "epoch": 0.2677957823864925, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 30706 + }, + { + "epoch": 0.26780450367166103, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 30707 + }, + { + "epoch": 0.26781322495682963, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 30708 + }, + { + "epoch": 0.26782194624199823, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 30709 + }, + { + "epoch": 0.2678306675271668, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 30710 + }, + { + "epoch": 0.2678393888123354, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 30711 + }, + { + "epoch": 0.267848110097504, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 30712 + }, + { + "epoch": 0.2678568313826726, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 30713 + }, + { + "epoch": 0.2678655526678411, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 30714 + }, + { + "epoch": 0.2678742739530097, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 30715 + }, + { + "epoch": 0.2678829952381783, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 30716 + }, + { + "epoch": 0.26789171652334687, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 30717 + }, + { + "epoch": 0.26790043780851547, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 30718 + }, + { + "epoch": 0.26790915909368407, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 30719 + }, + { + "epoch": 0.2679178803788526, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 30720 + }, + { + "epoch": 0.2679266016640212, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 30721 + }, + { + "epoch": 0.2679353229491898, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 30722 + }, + { + "epoch": 0.26794404423435836, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 30723 + }, + { + "epoch": 0.26795276551952696, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 30724 + }, + { + "epoch": 0.26796148680469556, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 30725 + }, + { + "epoch": 0.2679702080898641, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 30726 + }, + { + "epoch": 0.2679789293750327, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 30727 + }, + { + "epoch": 0.2679876506602013, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 30728 + }, + { + "epoch": 0.26799637194536985, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 30729 + }, + { + "epoch": 0.26800509323053845, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 30730 + }, + { + "epoch": 0.26801381451570705, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 30731 + }, + { + "epoch": 0.2680225358008756, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 30732 + }, + { + "epoch": 0.2680312570860442, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 30733 + }, + { + "epoch": 0.2680399783712128, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 30734 + }, + { + "epoch": 0.26804869965638134, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 30735 + }, + { + "epoch": 0.26805742094154994, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 30736 + }, + { + "epoch": 0.26806614222671854, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 30737 + }, + { + "epoch": 0.2680748635118871, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 30738 + }, + { + "epoch": 0.2680835847970557, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 30739 + }, + { + "epoch": 0.2680923060822243, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 30740 + }, + { + "epoch": 0.2681010273673929, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 30741 + }, + { + "epoch": 0.26810974865256143, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 30742 + }, + { + "epoch": 0.26811846993773003, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 30743 + }, + { + "epoch": 0.26812719122289863, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 30744 + }, + { + "epoch": 0.2681359125080672, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 30745 + }, + { + "epoch": 0.2681446337932358, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 30746 + }, + { + "epoch": 0.2681533550784044, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 30747 + }, + { + "epoch": 0.2681620763635729, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 30748 + }, + { + "epoch": 0.2681707976487415, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 30749 + }, + { + "epoch": 0.2681795189339101, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 30750 + }, + { + "epoch": 0.26818824021907867, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 30751 + }, + { + "epoch": 0.26819696150424727, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 30752 + }, + { + "epoch": 0.26820568278941587, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 30753 + }, + { + "epoch": 0.2682144040745844, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 30754 + }, + { + "epoch": 0.268223125359753, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 30755 + }, + { + "epoch": 0.2682318466449216, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 30756 + }, + { + "epoch": 0.26824056793009016, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 30757 + }, + { + "epoch": 0.26824928921525876, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 30758 + }, + { + "epoch": 0.26825801050042736, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 30759 + }, + { + "epoch": 0.2682667317855959, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 30760 + }, + { + "epoch": 0.2682754530707645, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 30761 + }, + { + "epoch": 0.2682841743559331, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 30762 + }, + { + "epoch": 0.26829289564110165, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 30763 + }, + { + "epoch": 0.26830161692627025, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 30764 + }, + { + "epoch": 0.26831033821143885, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 30765 + }, + { + "epoch": 0.2683190594966074, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 30766 + }, + { + "epoch": 0.268327780781776, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 30767 + }, + { + "epoch": 0.2683365020669446, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 30768 + }, + { + "epoch": 0.2683452233521132, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 30769 + }, + { + "epoch": 0.26835394463728174, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 30770 + }, + { + "epoch": 0.26836266592245034, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 30771 + }, + { + "epoch": 0.26837138720761894, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 30772 + }, + { + "epoch": 0.2683801084927875, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 30773 + }, + { + "epoch": 0.2683888297779561, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 30774 + }, + { + "epoch": 0.2683975510631247, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 30775 + }, + { + "epoch": 0.26840627234829323, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 30776 + }, + { + "epoch": 0.26841499363346183, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 30777 + }, + { + "epoch": 0.26842371491863043, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 30778 + }, + { + "epoch": 0.268432436203799, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 30779 + }, + { + "epoch": 0.2684411574889676, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 30780 + }, + { + "epoch": 0.2684498787741362, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 30781 + }, + { + "epoch": 0.2684586000593047, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 30782 + }, + { + "epoch": 0.2684673213444733, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 30783 + }, + { + "epoch": 0.2684760426296419, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 30784 + }, + { + "epoch": 0.26848476391481046, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 30785 + }, + { + "epoch": 0.26849348519997907, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 30786 + }, + { + "epoch": 0.26850220648514767, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 30787 + }, + { + "epoch": 0.2685109277703162, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 30788 + }, + { + "epoch": 0.2685196490554848, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 30789 + }, + { + "epoch": 0.2685283703406534, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 30790 + }, + { + "epoch": 0.26853709162582196, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 30791 + }, + { + "epoch": 0.26854581291099056, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 30792 + }, + { + "epoch": 0.26855453419615916, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 30793 + }, + { + "epoch": 0.26856325548132776, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 30794 + }, + { + "epoch": 0.2685719767664963, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 30795 + }, + { + "epoch": 0.2685806980516649, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 30796 + }, + { + "epoch": 0.2685894193368335, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 30797 + }, + { + "epoch": 0.26859814062200205, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 30798 + }, + { + "epoch": 0.26860686190717065, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 30799 + }, + { + "epoch": 0.26861558319233925, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 30800 + }, + { + "epoch": 0.2686243044775078, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 30801 + }, + { + "epoch": 0.2686330257626764, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 30802 + }, + { + "epoch": 0.268641747047845, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 30803 + }, + { + "epoch": 0.26865046833301354, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 30804 + }, + { + "epoch": 0.26865918961818214, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 30805 + }, + { + "epoch": 0.26866791090335074, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 30806 + }, + { + "epoch": 0.2686766321885193, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 30807 + }, + { + "epoch": 0.2686853534736879, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 30808 + }, + { + "epoch": 0.2686940747588565, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 30809 + }, + { + "epoch": 0.268702796044025, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 30810 + }, + { + "epoch": 0.2687115173291936, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 30811 + }, + { + "epoch": 0.26872023861436223, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 30812 + }, + { + "epoch": 0.2687289598995308, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 30813 + }, + { + "epoch": 0.2687376811846994, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 30814 + }, + { + "epoch": 0.268746402469868, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 30815 + }, + { + "epoch": 0.2687551237550365, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 30816 + }, + { + "epoch": 0.2687638450402051, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 30817 + }, + { + "epoch": 0.2687725663253737, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 30818 + }, + { + "epoch": 0.26878128761054226, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 30819 + }, + { + "epoch": 0.26879000889571086, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 30820 + }, + { + "epoch": 0.26879873018087946, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 30821 + }, + { + "epoch": 0.26880745146604806, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 30822 + }, + { + "epoch": 0.2688161727512166, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 30823 + }, + { + "epoch": 0.2688248940363852, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 30824 + }, + { + "epoch": 0.2688336153215538, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 30825 + }, + { + "epoch": 0.26884233660672235, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 30826 + }, + { + "epoch": 0.26885105789189095, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 30827 + }, + { + "epoch": 0.26885977917705955, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 30828 + }, + { + "epoch": 0.2688685004622281, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 30829 + }, + { + "epoch": 0.2688772217473967, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 30830 + }, + { + "epoch": 0.2688859430325653, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 30831 + }, + { + "epoch": 0.26889466431773384, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 30832 + }, + { + "epoch": 0.26890338560290244, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 30833 + }, + { + "epoch": 0.26891210688807105, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 30834 + }, + { + "epoch": 0.2689208281732396, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 30835 + }, + { + "epoch": 0.2689295494584082, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 30836 + }, + { + "epoch": 0.2689382707435768, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 30837 + }, + { + "epoch": 0.26894699202874534, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9829, + "step": 30838 + }, + { + "epoch": 0.26895571331391394, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 30839 + }, + { + "epoch": 0.26896443459908254, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 30840 + }, + { + "epoch": 0.2689731558842511, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 30841 + }, + { + "epoch": 0.2689818771694197, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 30842 + }, + { + "epoch": 0.2689905984545883, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 30843 + }, + { + "epoch": 0.2689993197397568, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 30844 + }, + { + "epoch": 0.2690080410249254, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 30845 + }, + { + "epoch": 0.269016762310094, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 30846 + }, + { + "epoch": 0.26902548359526257, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 30847 + }, + { + "epoch": 0.26903420488043117, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 30848 + }, + { + "epoch": 0.26904292616559977, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 30849 + }, + { + "epoch": 0.26905164745076837, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 30850 + }, + { + "epoch": 0.2690603687359369, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 30851 + }, + { + "epoch": 0.2690690900211055, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 30852 + }, + { + "epoch": 0.2690778113062741, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 30853 + }, + { + "epoch": 0.26908653259144266, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 30854 + }, + { + "epoch": 0.26909525387661126, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 30855 + }, + { + "epoch": 0.26910397516177986, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 30856 + }, + { + "epoch": 0.2691126964469484, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 30857 + }, + { + "epoch": 0.269121417732117, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 30858 + }, + { + "epoch": 0.2691301390172856, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 30859 + }, + { + "epoch": 0.26913886030245415, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 30860 + }, + { + "epoch": 0.26914758158762275, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 30861 + }, + { + "epoch": 0.26915630287279135, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 30862 + }, + { + "epoch": 0.2691650241579599, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 30863 + }, + { + "epoch": 0.2691737454431285, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 30864 + }, + { + "epoch": 0.2691824667282971, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 30865 + }, + { + "epoch": 0.26919118801346564, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 30866 + }, + { + "epoch": 0.26919990929863424, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 30867 + }, + { + "epoch": 0.26920863058380284, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 30868 + }, + { + "epoch": 0.2692173518689714, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 30869 + }, + { + "epoch": 0.26922607315414, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 30870 + }, + { + "epoch": 0.2692347944393086, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 30871 + }, + { + "epoch": 0.26924351572447713, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 30872 + }, + { + "epoch": 0.26925223700964573, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 30873 + }, + { + "epoch": 0.26926095829481433, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 30874 + }, + { + "epoch": 0.2692696795799829, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 30875 + }, + { + "epoch": 0.2692784008651515, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 30876 + }, + { + "epoch": 0.2692871221503201, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 30877 + }, + { + "epoch": 0.2692958434354887, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 30878 + }, + { + "epoch": 0.2693045647206572, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 30879 + }, + { + "epoch": 0.2693132860058258, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 30880 + }, + { + "epoch": 0.2693220072909944, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 30881 + }, + { + "epoch": 0.26933072857616297, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 30882 + }, + { + "epoch": 0.26933944986133157, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 30883 + }, + { + "epoch": 0.26934817114650017, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 30884 + }, + { + "epoch": 0.2693568924316687, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 30885 + }, + { + "epoch": 0.2693656137168373, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 30886 + }, + { + "epoch": 0.2693743350020059, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 30887 + }, + { + "epoch": 0.26938305628717446, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 30888 + }, + { + "epoch": 0.26939177757234306, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 30889 + }, + { + "epoch": 0.26940049885751166, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 30890 + }, + { + "epoch": 0.2694092201426802, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 30891 + }, + { + "epoch": 0.2694179414278488, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 30892 + }, + { + "epoch": 0.2694266627130174, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 30893 + }, + { + "epoch": 0.26943538399818595, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 30894 + }, + { + "epoch": 0.26944410528335455, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 30895 + }, + { + "epoch": 0.26945282656852315, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 30896 + }, + { + "epoch": 0.2694615478536917, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 30897 + }, + { + "epoch": 0.2694702691388603, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 30898 + }, + { + "epoch": 0.2694789904240289, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 30899 + }, + { + "epoch": 0.26948771170919744, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 30900 + }, + { + "epoch": 0.26949643299436604, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 30901 + }, + { + "epoch": 0.26950515427953464, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 30902 + }, + { + "epoch": 0.2695138755647032, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 30903 + }, + { + "epoch": 0.2695225968498718, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 30904 + }, + { + "epoch": 0.2695313181350404, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 30905 + }, + { + "epoch": 0.269540039420209, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 30906 + }, + { + "epoch": 0.26954876070537753, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 30907 + }, + { + "epoch": 0.26955748199054613, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 30908 + }, + { + "epoch": 0.26956620327571473, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 30909 + }, + { + "epoch": 0.2695749245608833, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 30910 + }, + { + "epoch": 0.2695836458460519, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 30911 + }, + { + "epoch": 0.2695923671312205, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 30912 + }, + { + "epoch": 0.269601088416389, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 30913 + }, + { + "epoch": 0.2696098097015576, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 30914 + }, + { + "epoch": 0.2696185309867262, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 30915 + }, + { + "epoch": 0.26962725227189477, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 30916 + }, + { + "epoch": 0.26963597355706337, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 30917 + }, + { + "epoch": 0.26964469484223197, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.983, + "step": 30918 + }, + { + "epoch": 0.2696534161274005, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 30919 + }, + { + "epoch": 0.2696621374125691, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 30920 + }, + { + "epoch": 0.2696708586977377, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 30921 + }, + { + "epoch": 0.26967957998290626, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 30922 + }, + { + "epoch": 0.26968830126807486, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 30923 + }, + { + "epoch": 0.26969702255324346, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 30924 + }, + { + "epoch": 0.269705743838412, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 30925 + }, + { + "epoch": 0.2697144651235806, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 30926 + }, + { + "epoch": 0.2697231864087492, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 30927 + }, + { + "epoch": 0.26973190769391775, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 30928 + }, + { + "epoch": 0.26974062897908635, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 30929 + }, + { + "epoch": 0.26974935026425495, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 30930 + }, + { + "epoch": 0.26975807154942355, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 30931 + }, + { + "epoch": 0.2697667928345921, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 30932 + }, + { + "epoch": 0.2697755141197607, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 30933 + }, + { + "epoch": 0.2697842354049293, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 30934 + }, + { + "epoch": 0.26979295669009784, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 30935 + }, + { + "epoch": 0.26980167797526644, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 30936 + }, + { + "epoch": 0.26981039926043504, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 30937 + }, + { + "epoch": 0.2698191205456036, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 30938 + }, + { + "epoch": 0.2698278418307722, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 30939 + }, + { + "epoch": 0.2698365631159408, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 30940 + }, + { + "epoch": 0.26984528440110933, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 30941 + }, + { + "epoch": 0.26985400568627793, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 30942 + }, + { + "epoch": 0.26986272697144653, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 30943 + }, + { + "epoch": 0.2698714482566151, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 30944 + }, + { + "epoch": 0.2698801695417837, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 30945 + }, + { + "epoch": 0.2698888908269523, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 30946 + }, + { + "epoch": 0.2698976121121208, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 30947 + }, + { + "epoch": 0.2699063333972894, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 30948 + }, + { + "epoch": 0.269915054682458, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 30949 + }, + { + "epoch": 0.26992377596762657, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 30950 + }, + { + "epoch": 0.26993249725279517, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 30951 + }, + { + "epoch": 0.26994121853796377, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 30952 + }, + { + "epoch": 0.2699499398231323, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 30953 + }, + { + "epoch": 0.2699586611083009, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 30954 + }, + { + "epoch": 0.2699673823934695, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 30955 + }, + { + "epoch": 0.26997610367863806, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 30956 + }, + { + "epoch": 0.26998482496380666, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 30957 + }, + { + "epoch": 0.26999354624897526, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 30958 + }, + { + "epoch": 0.27000226753414386, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 30959 + }, + { + "epoch": 0.2700109888193124, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 30960 + }, + { + "epoch": 0.270019710104481, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 30961 + }, + { + "epoch": 0.2700284313896496, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 30962 + }, + { + "epoch": 0.27003715267481815, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 30963 + }, + { + "epoch": 0.27004587395998675, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 30964 + }, + { + "epoch": 0.27005459524515535, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 30965 + }, + { + "epoch": 0.2700633165303239, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 30966 + }, + { + "epoch": 0.2700720378154925, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 30967 + }, + { + "epoch": 0.2700807591006611, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 30968 + }, + { + "epoch": 0.27008948038582964, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 30969 + }, + { + "epoch": 0.27009820167099824, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 30970 + }, + { + "epoch": 0.27010692295616684, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9792, + "step": 30971 + }, + { + "epoch": 0.2701156442413354, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 30972 + }, + { + "epoch": 0.270124365526504, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 30973 + }, + { + "epoch": 0.2701330868116726, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 30974 + }, + { + "epoch": 0.27014180809684113, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 30975 + }, + { + "epoch": 0.27015052938200973, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 30976 + }, + { + "epoch": 0.27015925066717833, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 30977 + }, + { + "epoch": 0.2701679719523469, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 30978 + }, + { + "epoch": 0.2701766932375155, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 30979 + }, + { + "epoch": 0.2701854145226841, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 30980 + }, + { + "epoch": 0.2701941358078526, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 30981 + }, + { + "epoch": 0.2702028570930212, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 30982 + }, + { + "epoch": 0.2702115783781898, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 30983 + }, + { + "epoch": 0.27022029966335837, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 30984 + }, + { + "epoch": 0.27022902094852697, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 30985 + }, + { + "epoch": 0.27023774223369557, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 30986 + }, + { + "epoch": 0.27024646351886417, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 30987 + }, + { + "epoch": 0.2702551848040327, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 30988 + }, + { + "epoch": 0.2702639060892013, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 30989 + }, + { + "epoch": 0.2702726273743699, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 30990 + }, + { + "epoch": 0.27028134865953846, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 30991 + }, + { + "epoch": 0.27029006994470706, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 30992 + }, + { + "epoch": 0.27029879122987566, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 30993 + }, + { + "epoch": 0.2703075125150442, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 30994 + }, + { + "epoch": 0.2703162338002128, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 30995 + }, + { + "epoch": 0.2703249550853814, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 30996 + }, + { + "epoch": 0.27033367637054995, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 30997 + }, + { + "epoch": 0.27034239765571855, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 30998 + }, + { + "epoch": 0.27035111894088715, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 30999 + }, + { + "epoch": 0.2703598402260557, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 31000 + }, + { + "epoch": 0.2703685615112243, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 31001 + }, + { + "epoch": 0.2703772827963929, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 31002 + }, + { + "epoch": 0.27038600408156144, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 31003 + }, + { + "epoch": 0.27039472536673004, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 31004 + }, + { + "epoch": 0.27040344665189864, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 31005 + }, + { + "epoch": 0.2704121679370672, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 31006 + }, + { + "epoch": 0.2704208892222358, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 31007 + }, + { + "epoch": 0.2704296105074044, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 31008 + }, + { + "epoch": 0.27043833179257293, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 31009 + }, + { + "epoch": 0.27044705307774153, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 31010 + }, + { + "epoch": 0.27045577436291013, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 31011 + }, + { + "epoch": 0.2704644956480787, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 31012 + }, + { + "epoch": 0.2704732169332473, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 31013 + }, + { + "epoch": 0.2704819382184159, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 31014 + }, + { + "epoch": 0.2704906595035845, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 31015 + }, + { + "epoch": 0.270499380788753, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 31016 + }, + { + "epoch": 0.2705081020739216, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 31017 + }, + { + "epoch": 0.2705168233590902, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 31018 + }, + { + "epoch": 0.27052554464425876, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 31019 + }, + { + "epoch": 0.27053426592942736, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 31020 + }, + { + "epoch": 0.27054298721459596, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 31021 + }, + { + "epoch": 0.2705517084997645, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 31022 + }, + { + "epoch": 0.2705604297849331, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 31023 + }, + { + "epoch": 0.2705691510701017, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 31024 + }, + { + "epoch": 0.27057787235527025, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 31025 + }, + { + "epoch": 0.27058659364043885, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 31026 + }, + { + "epoch": 0.27059531492560746, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 31027 + }, + { + "epoch": 0.270604036210776, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 31028 + }, + { + "epoch": 0.2706127574959446, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 31029 + }, + { + "epoch": 0.2706214787811132, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 31030 + }, + { + "epoch": 0.27063020006628175, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 31031 + }, + { + "epoch": 0.27063892135145035, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 31032 + }, + { + "epoch": 0.27064764263661895, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 31033 + }, + { + "epoch": 0.2706563639217875, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 31034 + }, + { + "epoch": 0.2706650852069561, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 31035 + }, + { + "epoch": 0.2706738064921247, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 31036 + }, + { + "epoch": 0.27068252777729324, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 31037 + }, + { + "epoch": 0.27069124906246184, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 31038 + }, + { + "epoch": 0.27069997034763044, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 31039 + }, + { + "epoch": 0.27070869163279904, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 31040 + }, + { + "epoch": 0.2707174129179676, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 31041 + }, + { + "epoch": 0.2707261342031362, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 31042 + }, + { + "epoch": 0.2707348554883048, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 31043 + }, + { + "epoch": 0.2707435767734733, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 31044 + }, + { + "epoch": 0.2707522980586419, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 31045 + }, + { + "epoch": 0.2707610193438105, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 31046 + }, + { + "epoch": 0.27076974062897907, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 31047 + }, + { + "epoch": 0.2707784619141477, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 31048 + }, + { + "epoch": 0.2707871831993163, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 31049 + }, + { + "epoch": 0.2707959044844848, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 31050 + }, + { + "epoch": 0.2708046257696534, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 31051 + }, + { + "epoch": 0.270813347054822, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 31052 + }, + { + "epoch": 0.27082206833999056, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 31053 + }, + { + "epoch": 0.27083078962515916, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 31054 + }, + { + "epoch": 0.27083951091032776, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 31055 + }, + { + "epoch": 0.2708482321954963, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 31056 + }, + { + "epoch": 0.2708569534806649, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 31057 + }, + { + "epoch": 0.2708656747658335, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 31058 + }, + { + "epoch": 0.27087439605100205, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 31059 + }, + { + "epoch": 0.27088311733617065, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 31060 + }, + { + "epoch": 0.27089183862133925, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 31061 + }, + { + "epoch": 0.2709005599065078, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 31062 + }, + { + "epoch": 0.2709092811916764, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 31063 + }, + { + "epoch": 0.270918002476845, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 31064 + }, + { + "epoch": 0.27092672376201354, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 31065 + }, + { + "epoch": 0.27093544504718214, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 31066 + }, + { + "epoch": 0.27094416633235074, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 31067 + }, + { + "epoch": 0.27095288761751934, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 31068 + }, + { + "epoch": 0.2709616089026879, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 31069 + }, + { + "epoch": 0.2709703301878565, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 31070 + }, + { + "epoch": 0.2709790514730251, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 31071 + }, + { + "epoch": 0.27098777275819363, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 31072 + }, + { + "epoch": 0.27099649404336223, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 31073 + }, + { + "epoch": 0.27100521532853084, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 31074 + }, + { + "epoch": 0.2710139366136994, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 31075 + }, + { + "epoch": 0.271022657898868, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 31076 + }, + { + "epoch": 0.2710313791840366, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 31077 + }, + { + "epoch": 0.2710401004692051, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 31078 + }, + { + "epoch": 0.2710488217543737, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 31079 + }, + { + "epoch": 0.2710575430395423, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 31080 + }, + { + "epoch": 0.27106626432471087, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 31081 + }, + { + "epoch": 0.27107498560987947, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 31082 + }, + { + "epoch": 0.27108370689504807, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 31083 + }, + { + "epoch": 0.2710924281802166, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 31084 + }, + { + "epoch": 0.2711011494653852, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 31085 + }, + { + "epoch": 0.2711098707505538, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 31086 + }, + { + "epoch": 0.27111859203572236, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 31087 + }, + { + "epoch": 0.27112731332089096, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 31088 + }, + { + "epoch": 0.27113603460605956, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 31089 + }, + { + "epoch": 0.2711447558912281, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 31090 + }, + { + "epoch": 0.2711534771763967, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 31091 + }, + { + "epoch": 0.2711621984615653, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 31092 + }, + { + "epoch": 0.27117091974673385, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 31093 + }, + { + "epoch": 0.27117964103190245, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 31094 + }, + { + "epoch": 0.27118836231707105, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 31095 + }, + { + "epoch": 0.27119708360223965, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 31096 + }, + { + "epoch": 0.2712058048874082, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 31097 + }, + { + "epoch": 0.2712145261725768, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 31098 + }, + { + "epoch": 0.2712232474577454, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 31099 + }, + { + "epoch": 0.27123196874291394, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 31100 + }, + { + "epoch": 0.27124069002808254, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 31101 + }, + { + "epoch": 0.27124941131325114, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 31102 + }, + { + "epoch": 0.2712581325984197, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 31103 + }, + { + "epoch": 0.2712668538835883, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 31104 + }, + { + "epoch": 0.2712755751687569, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 31105 + }, + { + "epoch": 0.27128429645392543, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 31106 + }, + { + "epoch": 0.27129301773909403, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 31107 + }, + { + "epoch": 0.27130173902426263, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 31108 + }, + { + "epoch": 0.2713104603094312, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 31109 + }, + { + "epoch": 0.2713191815945998, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 31110 + }, + { + "epoch": 0.2713279028797684, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 31111 + }, + { + "epoch": 0.2713366241649369, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 31112 + }, + { + "epoch": 0.2713453454501055, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 31113 + }, + { + "epoch": 0.2713540667352741, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 31114 + }, + { + "epoch": 0.27136278802044267, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 31115 + }, + { + "epoch": 0.27137150930561127, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 31116 + }, + { + "epoch": 0.27138023059077987, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 31117 + }, + { + "epoch": 0.2713889518759484, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 31118 + }, + { + "epoch": 0.271397673161117, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 31119 + }, + { + "epoch": 0.2714063944462856, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 31120 + }, + { + "epoch": 0.27141511573145416, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 31121 + }, + { + "epoch": 0.27142383701662276, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 31122 + }, + { + "epoch": 0.27143255830179136, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 31123 + }, + { + "epoch": 0.27144127958695996, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 31124 + }, + { + "epoch": 0.2714500008721285, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 31125 + }, + { + "epoch": 0.2714587221572971, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 31126 + }, + { + "epoch": 0.2714674434424657, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 31127 + }, + { + "epoch": 0.27147616472763425, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 31128 + }, + { + "epoch": 0.27148488601280285, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 31129 + }, + { + "epoch": 0.27149360729797145, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 31130 + }, + { + "epoch": 0.27150232858314, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 31131 + }, + { + "epoch": 0.2715110498683086, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 31132 + }, + { + "epoch": 0.2715197711534772, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 31133 + }, + { + "epoch": 0.27152849243864574, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 31134 + }, + { + "epoch": 0.27153721372381434, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 31135 + }, + { + "epoch": 0.27154593500898294, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 31136 + }, + { + "epoch": 0.2715546562941515, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 31137 + }, + { + "epoch": 0.2715633775793201, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 31138 + }, + { + "epoch": 0.2715720988644887, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 31139 + }, + { + "epoch": 0.27158082014965723, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 31140 + }, + { + "epoch": 0.27158954143482583, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 31141 + }, + { + "epoch": 0.27159826271999443, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 31142 + }, + { + "epoch": 0.271606984005163, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 31143 + }, + { + "epoch": 0.2716157052903316, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 31144 + }, + { + "epoch": 0.2716244265755002, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 31145 + }, + { + "epoch": 0.2716331478606687, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 31146 + }, + { + "epoch": 0.2716418691458373, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 31147 + }, + { + "epoch": 0.2716505904310059, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 31148 + }, + { + "epoch": 0.2716593117161745, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 31149 + }, + { + "epoch": 0.27166803300134307, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 31150 + }, + { + "epoch": 0.27167675428651167, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 31151 + }, + { + "epoch": 0.27168547557168027, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 31152 + }, + { + "epoch": 0.2716941968568488, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 31153 + }, + { + "epoch": 0.2717029181420174, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 31154 + }, + { + "epoch": 0.271711639427186, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 31155 + }, + { + "epoch": 0.27172036071235456, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 31156 + }, + { + "epoch": 0.27172908199752316, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 31157 + }, + { + "epoch": 0.27173780328269176, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 31158 + }, + { + "epoch": 0.2717465245678603, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 31159 + }, + { + "epoch": 0.2717552458530289, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 31160 + }, + { + "epoch": 0.2717639671381975, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 31161 + }, + { + "epoch": 0.27177268842336605, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 31162 + }, + { + "epoch": 0.27178140970853465, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 31163 + }, + { + "epoch": 0.27179013099370325, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 31164 + }, + { + "epoch": 0.2717988522788718, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 31165 + }, + { + "epoch": 0.2718075735640404, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 31166 + }, + { + "epoch": 0.271816294849209, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 31167 + }, + { + "epoch": 0.27182501613437754, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 31168 + }, + { + "epoch": 0.27183373741954614, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 31169 + }, + { + "epoch": 0.27184245870471474, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 31170 + }, + { + "epoch": 0.2718511799898833, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 31171 + }, + { + "epoch": 0.2718599012750519, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 31172 + }, + { + "epoch": 0.2718686225602205, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 31173 + }, + { + "epoch": 0.27187734384538903, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 31174 + }, + { + "epoch": 0.27188606513055763, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 31175 + }, + { + "epoch": 0.27189478641572623, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 31176 + }, + { + "epoch": 0.27190350770089483, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 31177 + }, + { + "epoch": 0.2719122289860634, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 31178 + }, + { + "epoch": 0.271920950271232, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 31179 + }, + { + "epoch": 0.2719296715564006, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 31180 + }, + { + "epoch": 0.2719383928415691, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 31181 + }, + { + "epoch": 0.2719471141267377, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 31182 + }, + { + "epoch": 0.2719558354119063, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 31183 + }, + { + "epoch": 0.27196455669707487, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 31184 + }, + { + "epoch": 0.27197327798224347, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 31185 + }, + { + "epoch": 0.27198199926741207, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 31186 + }, + { + "epoch": 0.2719907205525806, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 31187 + }, + { + "epoch": 0.2719994418377492, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 31188 + }, + { + "epoch": 0.2720081631229178, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 31189 + }, + { + "epoch": 0.27201688440808636, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 31190 + }, + { + "epoch": 0.27202560569325496, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 31191 + }, + { + "epoch": 0.27203432697842356, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 31192 + }, + { + "epoch": 0.2720430482635921, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 31193 + }, + { + "epoch": 0.2720517695487607, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 31194 + }, + { + "epoch": 0.2720604908339293, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 31195 + }, + { + "epoch": 0.27206921211909785, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 31196 + }, + { + "epoch": 0.27207793340426645, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 31197 + }, + { + "epoch": 0.27208665468943505, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 31198 + }, + { + "epoch": 0.2720953759746036, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 31199 + }, + { + "epoch": 0.2721040972597722, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 31200 + }, + { + "epoch": 0.2721128185449408, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 31201 + }, + { + "epoch": 0.27212153983010934, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 31202 + }, + { + "epoch": 0.27213026111527794, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 31203 + }, + { + "epoch": 0.27213898240044654, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 31204 + }, + { + "epoch": 0.27214770368561514, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 31205 + }, + { + "epoch": 0.2721564249707837, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 31206 + }, + { + "epoch": 0.2721651462559523, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 31207 + }, + { + "epoch": 0.2721738675411209, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 31208 + }, + { + "epoch": 0.27218258882628943, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 31209 + }, + { + "epoch": 0.27219131011145803, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 31210 + }, + { + "epoch": 0.27220003139662663, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 31211 + }, + { + "epoch": 0.2722087526817952, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 31212 + }, + { + "epoch": 0.2722174739669638, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 31213 + }, + { + "epoch": 0.2722261952521324, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 31214 + }, + { + "epoch": 0.2722349165373009, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 31215 + }, + { + "epoch": 0.2722436378224695, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 31216 + }, + { + "epoch": 0.2722523591076381, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 31217 + }, + { + "epoch": 0.27226108039280666, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 31218 + }, + { + "epoch": 0.27226980167797526, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 31219 + }, + { + "epoch": 0.27227852296314387, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 31220 + }, + { + "epoch": 0.2722872442483124, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 31221 + }, + { + "epoch": 0.272295965533481, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 31222 + }, + { + "epoch": 0.2723046868186496, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 31223 + }, + { + "epoch": 0.27231340810381816, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 31224 + }, + { + "epoch": 0.27232212938898676, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 31225 + }, + { + "epoch": 0.27233085067415536, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 31226 + }, + { + "epoch": 0.2723395719593239, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 31227 + }, + { + "epoch": 0.2723482932444925, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 31228 + }, + { + "epoch": 0.2723570145296611, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 31229 + }, + { + "epoch": 0.27236573581482965, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 31230 + }, + { + "epoch": 0.27237445709999825, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 31231 + }, + { + "epoch": 0.27238317838516685, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 31232 + }, + { + "epoch": 0.27239189967033545, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 31233 + }, + { + "epoch": 0.272400620955504, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 31234 + }, + { + "epoch": 0.2724093422406726, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 31235 + }, + { + "epoch": 0.2724180635258412, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 31236 + }, + { + "epoch": 0.27242678481100974, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 31237 + }, + { + "epoch": 0.27243550609617834, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 31238 + }, + { + "epoch": 0.27244422738134694, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 31239 + }, + { + "epoch": 0.2724529486665155, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 31240 + }, + { + "epoch": 0.2724616699516841, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 31241 + }, + { + "epoch": 0.2724703912368527, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 31242 + }, + { + "epoch": 0.2724791125220212, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 31243 + }, + { + "epoch": 0.2724878338071898, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 31244 + }, + { + "epoch": 0.27249655509235843, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 31245 + }, + { + "epoch": 0.272505276377527, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 31246 + }, + { + "epoch": 0.2725139976626956, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 31247 + }, + { + "epoch": 0.2725227189478642, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 31248 + }, + { + "epoch": 0.2725314402330327, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 31249 + }, + { + "epoch": 0.2725401615182013, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 31250 + }, + { + "epoch": 0.2725488828033699, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 31251 + }, + { + "epoch": 0.27255760408853846, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 31252 + }, + { + "epoch": 0.27256632537370706, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 31253 + }, + { + "epoch": 0.27257504665887566, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 31254 + }, + { + "epoch": 0.2725837679440442, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 31255 + }, + { + "epoch": 0.2725924892292128, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 31256 + }, + { + "epoch": 0.2726012105143814, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 31257 + }, + { + "epoch": 0.27260993179955, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 31258 + }, + { + "epoch": 0.27261865308471855, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 31259 + }, + { + "epoch": 0.27262737436988715, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 31260 + }, + { + "epoch": 0.27263609565505575, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 31261 + }, + { + "epoch": 0.2726448169402243, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 31262 + }, + { + "epoch": 0.2726535382253929, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 31263 + }, + { + "epoch": 0.2726622595105615, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 31264 + }, + { + "epoch": 0.27267098079573004, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 31265 + }, + { + "epoch": 0.27267970208089864, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 31266 + }, + { + "epoch": 0.27268842336606725, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 31267 + }, + { + "epoch": 0.2726971446512358, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 31268 + }, + { + "epoch": 0.2727058659364044, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 31269 + }, + { + "epoch": 0.272714587221573, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 31270 + }, + { + "epoch": 0.27272330850674154, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 31271 + }, + { + "epoch": 0.27273202979191014, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 31272 + }, + { + "epoch": 0.27274075107707874, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 31273 + }, + { + "epoch": 0.2727494723622473, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 31274 + }, + { + "epoch": 0.2727581936474159, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 31275 + }, + { + "epoch": 0.2727669149325845, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 31276 + }, + { + "epoch": 0.272775636217753, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 31277 + }, + { + "epoch": 0.2727843575029216, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 31278 + }, + { + "epoch": 0.2727930787880902, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 31279 + }, + { + "epoch": 0.27280180007325877, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 31280 + }, + { + "epoch": 0.27281052135842737, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 31281 + }, + { + "epoch": 0.27281924264359597, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 31282 + }, + { + "epoch": 0.2728279639287645, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9824, + "step": 31283 + }, + { + "epoch": 0.2728366852139331, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 31284 + }, + { + "epoch": 0.2728454064991017, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 31285 + }, + { + "epoch": 0.2728541277842703, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 31286 + }, + { + "epoch": 0.27286284906943886, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 31287 + }, + { + "epoch": 0.27287157035460746, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 31288 + }, + { + "epoch": 0.27288029163977606, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 31289 + }, + { + "epoch": 0.2728890129249446, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 31290 + }, + { + "epoch": 0.2728977342101132, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 31291 + }, + { + "epoch": 0.2729064554952818, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 31292 + }, + { + "epoch": 0.27291517678045035, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 31293 + }, + { + "epoch": 0.27292389806561895, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 31294 + }, + { + "epoch": 0.27293261935078755, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 31295 + }, + { + "epoch": 0.2729413406359561, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 31296 + }, + { + "epoch": 0.2729500619211247, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 31297 + }, + { + "epoch": 0.2729587832062933, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 31298 + }, + { + "epoch": 0.27296750449146184, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 31299 + }, + { + "epoch": 0.27297622577663044, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 31300 + }, + { + "epoch": 0.27298494706179904, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 31301 + }, + { + "epoch": 0.2729936683469676, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 31302 + }, + { + "epoch": 0.2730023896321362, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 31303 + }, + { + "epoch": 0.2730111109173048, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 31304 + }, + { + "epoch": 0.27301983220247333, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 31305 + }, + { + "epoch": 0.27302855348764193, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 31306 + }, + { + "epoch": 0.27303727477281053, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 31307 + }, + { + "epoch": 0.2730459960579791, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 31308 + }, + { + "epoch": 0.2730547173431477, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 31309 + }, + { + "epoch": 0.2730634386283163, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 31310 + }, + { + "epoch": 0.2730721599134848, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 31311 + }, + { + "epoch": 0.2730808811986534, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 31312 + }, + { + "epoch": 0.273089602483822, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 31313 + }, + { + "epoch": 0.2730983237689906, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 31314 + }, + { + "epoch": 0.27310704505415917, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 31315 + }, + { + "epoch": 0.27311576633932777, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 31316 + }, + { + "epoch": 0.27312448762449637, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 31317 + }, + { + "epoch": 0.2731332089096649, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 31318 + }, + { + "epoch": 0.2731419301948335, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 31319 + }, + { + "epoch": 0.2731506514800021, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 31320 + }, + { + "epoch": 0.27315937276517066, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 31321 + }, + { + "epoch": 0.27316809405033926, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 31322 + }, + { + "epoch": 0.27317681533550786, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 31323 + }, + { + "epoch": 0.2731855366206764, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 31324 + }, + { + "epoch": 0.273194257905845, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 31325 + }, + { + "epoch": 0.2732029791910136, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 31326 + }, + { + "epoch": 0.27321170047618215, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 31327 + }, + { + "epoch": 0.27322042176135075, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 31328 + }, + { + "epoch": 0.27322914304651935, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 31329 + }, + { + "epoch": 0.2732378643316879, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 31330 + }, + { + "epoch": 0.2732465856168565, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 31331 + }, + { + "epoch": 0.2732553069020251, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 31332 + }, + { + "epoch": 0.27326402818719364, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 31333 + }, + { + "epoch": 0.27327274947236224, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 31334 + }, + { + "epoch": 0.27328147075753084, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 31335 + }, + { + "epoch": 0.2732901920426994, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 31336 + }, + { + "epoch": 0.273298913327868, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 31337 + }, + { + "epoch": 0.2733076346130366, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9842, + "step": 31338 + }, + { + "epoch": 0.27331635589820513, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 31339 + }, + { + "epoch": 0.27332507718337373, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 31340 + }, + { + "epoch": 0.27333379846854233, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 31341 + }, + { + "epoch": 0.27334251975371093, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 31342 + }, + { + "epoch": 0.2733512410388795, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 31343 + }, + { + "epoch": 0.2733599623240481, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 31344 + }, + { + "epoch": 0.2733686836092167, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 31345 + }, + { + "epoch": 0.2733774048943852, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 31346 + }, + { + "epoch": 0.2733861261795538, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 31347 + }, + { + "epoch": 0.2733948474647224, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 31348 + }, + { + "epoch": 0.27340356874989097, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 31349 + }, + { + "epoch": 0.27341229003505957, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 31350 + }, + { + "epoch": 0.27342101132022817, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 31351 + }, + { + "epoch": 0.2734297326053967, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 31352 + }, + { + "epoch": 0.2734384538905653, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 31353 + }, + { + "epoch": 0.2734471751757339, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 31354 + }, + { + "epoch": 0.27345589646090246, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 31355 + }, + { + "epoch": 0.27346461774607106, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 31356 + }, + { + "epoch": 0.27347333903123966, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 31357 + }, + { + "epoch": 0.2734820603164082, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 31358 + }, + { + "epoch": 0.2734907816015768, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 31359 + }, + { + "epoch": 0.2734995028867454, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 31360 + }, + { + "epoch": 0.27350822417191395, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 31361 + }, + { + "epoch": 0.27351694545708255, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9782, + "step": 31362 + }, + { + "epoch": 0.27352566674225115, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 31363 + }, + { + "epoch": 0.2735343880274197, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 31364 + }, + { + "epoch": 0.2735431093125883, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 31365 + }, + { + "epoch": 0.2735518305977569, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 31366 + }, + { + "epoch": 0.2735605518829255, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 31367 + }, + { + "epoch": 0.27356927316809404, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 31368 + }, + { + "epoch": 0.27357799445326264, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 31369 + }, + { + "epoch": 0.27358671573843124, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 31370 + }, + { + "epoch": 0.2735954370235998, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 31371 + }, + { + "epoch": 0.2736041583087684, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 31372 + }, + { + "epoch": 0.273612879593937, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 31373 + }, + { + "epoch": 0.27362160087910553, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 31374 + }, + { + "epoch": 0.27363032216427413, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 31375 + }, + { + "epoch": 0.27363904344944273, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 31376 + }, + { + "epoch": 0.2736477647346113, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 31377 + }, + { + "epoch": 0.2736564860197799, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 31378 + }, + { + "epoch": 0.2736652073049485, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 31379 + }, + { + "epoch": 0.273673928590117, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 31380 + }, + { + "epoch": 0.2736826498752856, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 31381 + }, + { + "epoch": 0.2736913711604542, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 31382 + }, + { + "epoch": 0.27370009244562277, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 31383 + }, + { + "epoch": 0.27370881373079137, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 31384 + }, + { + "epoch": 0.27371753501595997, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 31385 + }, + { + "epoch": 0.2737262563011285, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 31386 + }, + { + "epoch": 0.2737349775862971, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 31387 + }, + { + "epoch": 0.2737436988714657, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 31388 + }, + { + "epoch": 0.27375242015663426, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 31389 + }, + { + "epoch": 0.27376114144180286, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 31390 + }, + { + "epoch": 0.27376986272697146, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 31391 + }, + { + "epoch": 0.27377858401214, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 31392 + }, + { + "epoch": 0.2737873052973086, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 31393 + }, + { + "epoch": 0.2737960265824772, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 31394 + }, + { + "epoch": 0.2738047478676458, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 31395 + }, + { + "epoch": 0.27381346915281435, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 31396 + }, + { + "epoch": 0.27382219043798295, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 31397 + }, + { + "epoch": 0.27383091172315155, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 31398 + }, + { + "epoch": 0.2738396330083201, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 31399 + }, + { + "epoch": 0.2738483542934887, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 31400 + }, + { + "epoch": 0.2738570755786573, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 31401 + }, + { + "epoch": 0.27386579686382584, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 31402 + }, + { + "epoch": 0.27387451814899444, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 31403 + }, + { + "epoch": 0.27388323943416304, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 31404 + }, + { + "epoch": 0.2738919607193316, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 31405 + }, + { + "epoch": 0.2739006820045002, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 31406 + }, + { + "epoch": 0.2739094032896688, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 31407 + }, + { + "epoch": 0.27391812457483733, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 31408 + }, + { + "epoch": 0.27392684586000593, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 31409 + }, + { + "epoch": 0.27393556714517453, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 31410 + }, + { + "epoch": 0.2739442884303431, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 31411 + }, + { + "epoch": 0.2739530097155117, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 31412 + }, + { + "epoch": 0.2739617310006803, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 31413 + }, + { + "epoch": 0.2739704522858488, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 31414 + }, + { + "epoch": 0.2739791735710174, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 31415 + }, + { + "epoch": 0.273987894856186, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 31416 + }, + { + "epoch": 0.27399661614135457, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 31417 + }, + { + "epoch": 0.27400533742652317, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 31418 + }, + { + "epoch": 0.27401405871169177, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 31419 + }, + { + "epoch": 0.2740227799968603, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 31420 + }, + { + "epoch": 0.2740315012820289, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 31421 + }, + { + "epoch": 0.2740402225671975, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 31422 + }, + { + "epoch": 0.2740489438523661, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 31423 + }, + { + "epoch": 0.27405766513753466, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 31424 + }, + { + "epoch": 0.27406638642270326, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 31425 + }, + { + "epoch": 0.27407510770787186, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 31426 + }, + { + "epoch": 0.2740838289930404, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9677, + "step": 31427 + }, + { + "epoch": 0.274092550278209, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 31428 + }, + { + "epoch": 0.2741012715633776, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 31429 + }, + { + "epoch": 0.27410999284854615, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 31430 + }, + { + "epoch": 0.27411871413371475, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 31431 + }, + { + "epoch": 0.27412743541888335, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 31432 + }, + { + "epoch": 0.2741361567040519, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 31433 + }, + { + "epoch": 0.2741448779892205, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 31434 + }, + { + "epoch": 0.2741535992743891, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 31435 + }, + { + "epoch": 0.27416232055955764, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 31436 + }, + { + "epoch": 0.27417104184472624, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 31437 + }, + { + "epoch": 0.27417976312989484, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 31438 + }, + { + "epoch": 0.2741884844150634, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 31439 + }, + { + "epoch": 0.274197205700232, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 31440 + }, + { + "epoch": 0.2742059269854006, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 31441 + }, + { + "epoch": 0.27421464827056913, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 31442 + }, + { + "epoch": 0.27422336955573773, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 31443 + }, + { + "epoch": 0.27423209084090633, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 31444 + }, + { + "epoch": 0.2742408121260749, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 31445 + }, + { + "epoch": 0.2742495334112435, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 31446 + }, + { + "epoch": 0.2742582546964121, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 31447 + }, + { + "epoch": 0.2742669759815806, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 31448 + }, + { + "epoch": 0.2742756972667492, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 31449 + }, + { + "epoch": 0.2742844185519178, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 31450 + }, + { + "epoch": 0.2742931398370864, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 31451 + }, + { + "epoch": 0.27430186112225496, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 31452 + }, + { + "epoch": 0.27431058240742356, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 31453 + }, + { + "epoch": 0.27431930369259216, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 31454 + }, + { + "epoch": 0.2743280249777607, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 31455 + }, + { + "epoch": 0.2743367462629293, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 31456 + }, + { + "epoch": 0.2743454675480979, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 31457 + }, + { + "epoch": 0.27435418883326645, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 31458 + }, + { + "epoch": 0.27436291011843505, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 31459 + }, + { + "epoch": 0.27437163140360366, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 31460 + }, + { + "epoch": 0.2743803526887722, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 31461 + }, + { + "epoch": 0.2743890739739408, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 31462 + }, + { + "epoch": 0.2743977952591094, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 31463 + }, + { + "epoch": 0.27440651654427795, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 31464 + }, + { + "epoch": 0.27441523782944655, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 31465 + }, + { + "epoch": 0.27442395911461515, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 31466 + }, + { + "epoch": 0.2744326803997837, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 31467 + }, + { + "epoch": 0.2744414016849523, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 31468 + }, + { + "epoch": 0.2744501229701209, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 31469 + }, + { + "epoch": 0.27445884425528944, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 31470 + }, + { + "epoch": 0.27446756554045804, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 31471 + }, + { + "epoch": 0.27447628682562664, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 31472 + }, + { + "epoch": 0.2744850081107952, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 31473 + }, + { + "epoch": 0.2744937293959638, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 31474 + }, + { + "epoch": 0.2745024506811324, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 31475 + }, + { + "epoch": 0.2745111719663009, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 31476 + }, + { + "epoch": 0.2745198932514695, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 31477 + }, + { + "epoch": 0.2745286145366381, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 31478 + }, + { + "epoch": 0.2745373358218067, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 31479 + }, + { + "epoch": 0.27454605710697527, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 31480 + }, + { + "epoch": 0.2745547783921439, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 31481 + }, + { + "epoch": 0.2745634996773125, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 31482 + }, + { + "epoch": 0.274572220962481, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 31483 + }, + { + "epoch": 0.2745809422476496, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 31484 + }, + { + "epoch": 0.2745896635328182, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 31485 + }, + { + "epoch": 0.27459838481798676, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 31486 + }, + { + "epoch": 0.27460710610315536, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 31487 + }, + { + "epoch": 0.27461582738832396, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 31488 + }, + { + "epoch": 0.2746245486734925, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 31489 + }, + { + "epoch": 0.2746332699586611, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 31490 + }, + { + "epoch": 0.2746419912438297, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 31491 + }, + { + "epoch": 0.27465071252899825, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 31492 + }, + { + "epoch": 0.27465943381416685, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 31493 + }, + { + "epoch": 0.27466815509933545, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 31494 + }, + { + "epoch": 0.274676876384504, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 31495 + }, + { + "epoch": 0.2746855976696726, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 31496 + }, + { + "epoch": 0.2746943189548412, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 31497 + }, + { + "epoch": 0.27470304024000974, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 31498 + }, + { + "epoch": 0.27471176152517834, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 31499 + }, + { + "epoch": 0.27472048281034694, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 31500 + }, + { + "epoch": 0.2747292040955155, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 31501 + }, + { + "epoch": 0.2747379253806841, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 31502 + }, + { + "epoch": 0.2747466466658527, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 31503 + }, + { + "epoch": 0.2747553679510213, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 31504 + }, + { + "epoch": 0.27476408923618983, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 31505 + }, + { + "epoch": 0.27477281052135843, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 31506 + }, + { + "epoch": 0.27478153180652704, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 31507 + }, + { + "epoch": 0.2747902530916956, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 31508 + }, + { + "epoch": 0.2747989743768642, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 31509 + }, + { + "epoch": 0.2748076956620328, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 31510 + }, + { + "epoch": 0.2748164169472013, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 31511 + }, + { + "epoch": 0.2748251382323699, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 31512 + }, + { + "epoch": 0.2748338595175385, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 31513 + }, + { + "epoch": 0.27484258080270707, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 31514 + }, + { + "epoch": 0.27485130208787567, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 31515 + }, + { + "epoch": 0.27486002337304427, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 31516 + }, + { + "epoch": 0.2748687446582128, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 31517 + }, + { + "epoch": 0.2748774659433814, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 31518 + }, + { + "epoch": 0.27488618722855, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 31519 + }, + { + "epoch": 0.27489490851371856, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 31520 + }, + { + "epoch": 0.27490362979888716, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 31521 + }, + { + "epoch": 0.27491235108405576, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 31522 + }, + { + "epoch": 0.2749210723692243, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 31523 + }, + { + "epoch": 0.2749297936543929, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 31524 + }, + { + "epoch": 0.2749385149395615, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 31525 + }, + { + "epoch": 0.27494723622473005, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 31526 + }, + { + "epoch": 0.27495595750989865, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 31527 + }, + { + "epoch": 0.27496467879506725, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 31528 + }, + { + "epoch": 0.2749734000802358, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 31529 + }, + { + "epoch": 0.2749821213654044, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 31530 + }, + { + "epoch": 0.274990842650573, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 31531 + }, + { + "epoch": 0.2749995639357416, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 31532 + }, + { + "epoch": 0.27500828522091014, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 31533 + }, + { + "epoch": 0.27501700650607874, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 31534 + }, + { + "epoch": 0.27502572779124734, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 31535 + }, + { + "epoch": 0.2750344490764159, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 31536 + }, + { + "epoch": 0.2750431703615845, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 31537 + }, + { + "epoch": 0.2750518916467531, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 31538 + }, + { + "epoch": 0.27506061293192163, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 31539 + }, + { + "epoch": 0.27506933421709023, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 31540 + }, + { + "epoch": 0.27507805550225883, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 31541 + }, + { + "epoch": 0.2750867767874274, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 31542 + }, + { + "epoch": 0.275095498072596, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 31543 + }, + { + "epoch": 0.2751042193577646, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 31544 + }, + { + "epoch": 0.2751129406429331, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 31545 + }, + { + "epoch": 0.2751216619281017, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 31546 + }, + { + "epoch": 0.2751303832132703, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 31547 + }, + { + "epoch": 0.27513910449843887, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 31548 + }, + { + "epoch": 0.27514782578360747, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 31549 + }, + { + "epoch": 0.27515654706877607, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 31550 + }, + { + "epoch": 0.2751652683539446, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 31551 + }, + { + "epoch": 0.2751739896391132, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 31552 + }, + { + "epoch": 0.2751827109242818, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 31553 + }, + { + "epoch": 0.27519143220945036, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 31554 + }, + { + "epoch": 0.27520015349461896, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 31555 + }, + { + "epoch": 0.27520887477978756, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 31556 + }, + { + "epoch": 0.2752175960649561, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 31557 + }, + { + "epoch": 0.2752263173501247, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 31558 + }, + { + "epoch": 0.2752350386352933, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 31559 + }, + { + "epoch": 0.2752437599204619, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 31560 + }, + { + "epoch": 0.27525248120563045, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 31561 + }, + { + "epoch": 0.27526120249079905, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 31562 + }, + { + "epoch": 0.27526992377596765, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 31563 + }, + { + "epoch": 0.2752786450611362, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 31564 + }, + { + "epoch": 0.2752873663463048, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 31565 + }, + { + "epoch": 0.2752960876314734, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 31566 + }, + { + "epoch": 0.27530480891664194, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 31567 + }, + { + "epoch": 0.27531353020181054, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 31568 + }, + { + "epoch": 0.27532225148697914, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 31569 + }, + { + "epoch": 0.2753309727721477, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 31570 + }, + { + "epoch": 0.2753396940573163, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 31571 + }, + { + "epoch": 0.2753484153424849, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 31572 + }, + { + "epoch": 0.27535713662765343, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 31573 + }, + { + "epoch": 0.27536585791282203, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 31574 + }, + { + "epoch": 0.27537457919799063, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 31575 + }, + { + "epoch": 0.2753833004831592, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 31576 + }, + { + "epoch": 0.2753920217683278, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 31577 + }, + { + "epoch": 0.2754007430534964, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 31578 + }, + { + "epoch": 0.2754094643386649, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 31579 + }, + { + "epoch": 0.2754181856238335, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 31580 + }, + { + "epoch": 0.2754269069090021, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 31581 + }, + { + "epoch": 0.27543562819417067, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 31582 + }, + { + "epoch": 0.27544434947933927, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 31583 + }, + { + "epoch": 0.27545307076450787, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 31584 + }, + { + "epoch": 0.2754617920496764, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 31585 + }, + { + "epoch": 0.275470513334845, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 31586 + }, + { + "epoch": 0.2754792346200136, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 31587 + }, + { + "epoch": 0.2754879559051822, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 31588 + }, + { + "epoch": 0.27549667719035076, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 31589 + }, + { + "epoch": 0.27550539847551936, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 31590 + }, + { + "epoch": 0.27551411976068796, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 31591 + }, + { + "epoch": 0.2755228410458565, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 31592 + }, + { + "epoch": 0.2755315623310251, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 31593 + }, + { + "epoch": 0.2755402836161937, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 31594 + }, + { + "epoch": 0.27554900490136225, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 31595 + }, + { + "epoch": 0.27555772618653085, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 31596 + }, + { + "epoch": 0.27556644747169945, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 31597 + }, + { + "epoch": 0.275575168756868, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 31598 + }, + { + "epoch": 0.2755838900420366, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 31599 + }, + { + "epoch": 0.2755926113272052, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 31600 + }, + { + "epoch": 0.27560133261237374, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 31601 + }, + { + "epoch": 0.27561005389754234, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 31602 + }, + { + "epoch": 0.27561877518271094, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 31603 + }, + { + "epoch": 0.2756274964678795, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 31604 + }, + { + "epoch": 0.2756362177530481, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 31605 + }, + { + "epoch": 0.2756449390382167, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 31606 + }, + { + "epoch": 0.27565366032338523, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 31607 + }, + { + "epoch": 0.27566238160855383, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 31608 + }, + { + "epoch": 0.27567110289372243, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 31609 + }, + { + "epoch": 0.275679824178891, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 31610 + }, + { + "epoch": 0.2756885454640596, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 31611 + }, + { + "epoch": 0.2756972667492282, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 31612 + }, + { + "epoch": 0.2757059880343968, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 31613 + }, + { + "epoch": 0.2757147093195653, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9853, + "step": 31614 + }, + { + "epoch": 0.2757234306047339, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 31615 + }, + { + "epoch": 0.2757321518899025, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 31616 + }, + { + "epoch": 0.27574087317507107, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 31617 + }, + { + "epoch": 0.27574959446023967, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 31618 + }, + { + "epoch": 0.27575831574540827, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 31619 + }, + { + "epoch": 0.2757670370305768, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 31620 + }, + { + "epoch": 0.2757757583157454, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 31621 + }, + { + "epoch": 0.275784479600914, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 31622 + }, + { + "epoch": 0.27579320088608256, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 31623 + }, + { + "epoch": 0.27580192217125116, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 31624 + }, + { + "epoch": 0.27581064345641976, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 31625 + }, + { + "epoch": 0.2758193647415883, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 31626 + }, + { + "epoch": 0.2758280860267569, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 31627 + }, + { + "epoch": 0.2758368073119255, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 31628 + }, + { + "epoch": 0.27584552859709405, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 31629 + }, + { + "epoch": 0.27585424988226265, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 31630 + }, + { + "epoch": 0.27586297116743125, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 31631 + }, + { + "epoch": 0.2758716924525998, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 31632 + }, + { + "epoch": 0.2758804137377684, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 31633 + }, + { + "epoch": 0.275889135022937, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 31634 + }, + { + "epoch": 0.27589785630810554, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 31635 + }, + { + "epoch": 0.27590657759327414, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 31636 + }, + { + "epoch": 0.27591529887844274, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 31637 + }, + { + "epoch": 0.2759240201636113, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 31638 + }, + { + "epoch": 0.2759327414487799, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 31639 + }, + { + "epoch": 0.2759414627339485, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 31640 + }, + { + "epoch": 0.2759501840191171, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 31641 + }, + { + "epoch": 0.27595890530428563, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 31642 + }, + { + "epoch": 0.27596762658945423, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 31643 + }, + { + "epoch": 0.27597634787462283, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 31644 + }, + { + "epoch": 0.2759850691597914, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 31645 + }, + { + "epoch": 0.27599379044496, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 31646 + }, + { + "epoch": 0.2760025117301286, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 31647 + }, + { + "epoch": 0.2760112330152971, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 31648 + }, + { + "epoch": 0.2760199543004657, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 31649 + }, + { + "epoch": 0.2760286755856343, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 31650 + }, + { + "epoch": 0.27603739687080286, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 31651 + }, + { + "epoch": 0.27604611815597146, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 31652 + }, + { + "epoch": 0.27605483944114007, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 31653 + }, + { + "epoch": 0.2760635607263086, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 31654 + }, + { + "epoch": 0.2760722820114772, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 31655 + }, + { + "epoch": 0.2760810032966458, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 31656 + }, + { + "epoch": 0.27608972458181436, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 31657 + }, + { + "epoch": 0.27609844586698296, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 31658 + }, + { + "epoch": 0.27610716715215156, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 31659 + }, + { + "epoch": 0.2761158884373201, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 31660 + }, + { + "epoch": 0.2761246097224887, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 31661 + }, + { + "epoch": 0.2761333310076573, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 31662 + }, + { + "epoch": 0.27614205229282585, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 31663 + }, + { + "epoch": 0.27615077357799445, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 31664 + }, + { + "epoch": 0.27615949486316305, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 31665 + }, + { + "epoch": 0.2761682161483316, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 31666 + }, + { + "epoch": 0.2761769374335002, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 31667 + }, + { + "epoch": 0.2761856587186688, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 31668 + }, + { + "epoch": 0.2761943800038374, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 31669 + }, + { + "epoch": 0.27620310128900594, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 31670 + }, + { + "epoch": 0.27621182257417454, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 31671 + }, + { + "epoch": 0.27622054385934314, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 31672 + }, + { + "epoch": 0.2762292651445117, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 31673 + }, + { + "epoch": 0.2762379864296803, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 31674 + }, + { + "epoch": 0.2762467077148489, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 31675 + }, + { + "epoch": 0.2762554290000174, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 31676 + }, + { + "epoch": 0.276264150285186, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 31677 + }, + { + "epoch": 0.27627287157035463, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 31678 + }, + { + "epoch": 0.2762815928555232, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 31679 + }, + { + "epoch": 0.2762903141406918, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 31680 + }, + { + "epoch": 0.2762990354258604, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 31681 + }, + { + "epoch": 0.2763077567110289, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 31682 + }, + { + "epoch": 0.2763164779961975, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 31683 + }, + { + "epoch": 0.2763251992813661, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 31684 + }, + { + "epoch": 0.27633392056653466, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 31685 + }, + { + "epoch": 0.27634264185170326, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 31686 + }, + { + "epoch": 0.27635136313687186, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 31687 + }, + { + "epoch": 0.2763600844220404, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 31688 + }, + { + "epoch": 0.276368805707209, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 31689 + }, + { + "epoch": 0.2763775269923776, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 31690 + }, + { + "epoch": 0.27638624827754615, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 31691 + }, + { + "epoch": 0.27639496956271475, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 31692 + }, + { + "epoch": 0.27640369084788335, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 31693 + }, + { + "epoch": 0.2764124121330519, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 31694 + }, + { + "epoch": 0.2764211334182205, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 31695 + }, + { + "epoch": 0.2764298547033891, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 31696 + }, + { + "epoch": 0.2764385759885577, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 31697 + }, + { + "epoch": 0.27644729727372624, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 31698 + }, + { + "epoch": 0.27645601855889484, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 31699 + }, + { + "epoch": 0.27646473984406345, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 31700 + }, + { + "epoch": 0.276473461129232, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 31701 + }, + { + "epoch": 0.2764821824144006, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 31702 + }, + { + "epoch": 0.2764909036995692, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 31703 + }, + { + "epoch": 0.27649962498473774, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 31704 + }, + { + "epoch": 0.27650834626990634, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 31705 + }, + { + "epoch": 0.27651706755507494, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 31706 + }, + { + "epoch": 0.2765257888402435, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 31707 + }, + { + "epoch": 0.2765345101254121, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 31708 + }, + { + "epoch": 0.2765432314105807, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 31709 + }, + { + "epoch": 0.2765519526957492, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 31710 + }, + { + "epoch": 0.2765606739809178, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 31711 + }, + { + "epoch": 0.2765693952660864, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 31712 + }, + { + "epoch": 0.27657811655125497, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 31713 + }, + { + "epoch": 0.27658683783642357, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 31714 + }, + { + "epoch": 0.27659555912159217, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 31715 + }, + { + "epoch": 0.2766042804067607, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 31716 + }, + { + "epoch": 0.2766130016919293, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 31717 + }, + { + "epoch": 0.2766217229770979, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 31718 + }, + { + "epoch": 0.27663044426226646, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 31719 + }, + { + "epoch": 0.27663916554743506, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 31720 + }, + { + "epoch": 0.27664788683260366, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 31721 + }, + { + "epoch": 0.27665660811777226, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 31722 + }, + { + "epoch": 0.2766653294029408, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 31723 + }, + { + "epoch": 0.2766740506881094, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 31724 + }, + { + "epoch": 0.276682771973278, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 31725 + }, + { + "epoch": 0.27669149325844655, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 31726 + }, + { + "epoch": 0.27670021454361515, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 31727 + }, + { + "epoch": 0.27670893582878375, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 31728 + }, + { + "epoch": 0.2767176571139523, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 31729 + }, + { + "epoch": 0.2767263783991209, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 31730 + }, + { + "epoch": 0.2767350996842895, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 31731 + }, + { + "epoch": 0.27674382096945804, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 31732 + }, + { + "epoch": 0.27675254225462664, + "grad_norm": 0.322265625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 31733 + }, + { + "epoch": 0.27676126353979524, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 31734 + }, + { + "epoch": 0.2767699848249638, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 31735 + }, + { + "epoch": 0.2767787061101324, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 31736 + }, + { + "epoch": 0.276787427395301, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 31737 + }, + { + "epoch": 0.27679614868046953, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 31738 + }, + { + "epoch": 0.27680486996563813, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 31739 + }, + { + "epoch": 0.27681359125080673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 31740 + }, + { + "epoch": 0.2768223125359753, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 31741 + }, + { + "epoch": 0.2768310338211439, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 31742 + }, + { + "epoch": 0.2768397551063125, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 31743 + }, + { + "epoch": 0.276848476391481, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 31744 + }, + { + "epoch": 0.2768571976766496, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 31745 + }, + { + "epoch": 0.2768659189618182, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 31746 + }, + { + "epoch": 0.27687464024698677, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 31747 + }, + { + "epoch": 0.27688336153215537, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 31748 + }, + { + "epoch": 0.27689208281732397, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 31749 + }, + { + "epoch": 0.27690080410249257, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 31750 + }, + { + "epoch": 0.2769095253876611, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 31751 + }, + { + "epoch": 0.2769182466728297, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 31752 + }, + { + "epoch": 0.2769269679579983, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 31753 + }, + { + "epoch": 0.27693568924316686, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 31754 + }, + { + "epoch": 0.27694441052833546, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 31755 + }, + { + "epoch": 0.27695313181350406, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 31756 + }, + { + "epoch": 0.2769618530986726, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 31757 + }, + { + "epoch": 0.2769705743838412, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 31758 + }, + { + "epoch": 0.2769792956690098, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 31759 + }, + { + "epoch": 0.27698801695417835, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 31760 + }, + { + "epoch": 0.27699673823934695, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 31761 + }, + { + "epoch": 0.27700545952451555, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 31762 + }, + { + "epoch": 0.2770141808096841, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 31763 + }, + { + "epoch": 0.2770229020948527, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 31764 + }, + { + "epoch": 0.2770316233800213, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 31765 + }, + { + "epoch": 0.27704034466518984, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 31766 + }, + { + "epoch": 0.27704906595035844, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 31767 + }, + { + "epoch": 0.27705778723552704, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 31768 + }, + { + "epoch": 0.2770665085206956, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 31769 + }, + { + "epoch": 0.2770752298058642, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 31770 + }, + { + "epoch": 0.2770839510910328, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 31771 + }, + { + "epoch": 0.27709267237620133, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 31772 + }, + { + "epoch": 0.27710139366136993, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 31773 + }, + { + "epoch": 0.27711011494653853, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 31774 + }, + { + "epoch": 0.2771188362317071, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 31775 + }, + { + "epoch": 0.2771275575168757, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 31776 + }, + { + "epoch": 0.2771362788020443, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 31777 + }, + { + "epoch": 0.2771450000872129, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 31778 + }, + { + "epoch": 0.2771537213723814, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 31779 + }, + { + "epoch": 0.27716244265755, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 31780 + }, + { + "epoch": 0.2771711639427186, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 31781 + }, + { + "epoch": 0.27717988522788717, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 31782 + }, + { + "epoch": 0.27718860651305577, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 31783 + }, + { + "epoch": 0.27719732779822437, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 31784 + }, + { + "epoch": 0.2772060490833929, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 31785 + }, + { + "epoch": 0.2772147703685615, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 31786 + }, + { + "epoch": 0.2772234916537301, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 31787 + }, + { + "epoch": 0.27723221293889866, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 31788 + }, + { + "epoch": 0.27724093422406726, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 31789 + }, + { + "epoch": 0.27724965550923586, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 31790 + }, + { + "epoch": 0.2772583767944044, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 31791 + }, + { + "epoch": 0.277267098079573, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 31792 + }, + { + "epoch": 0.2772758193647416, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 31793 + }, + { + "epoch": 0.27728454064991015, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 31794 + }, + { + "epoch": 0.27729326193507875, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 31795 + }, + { + "epoch": 0.27730198322024735, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 31796 + }, + { + "epoch": 0.2773107045054159, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 31797 + }, + { + "epoch": 0.2773194257905845, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 31798 + }, + { + "epoch": 0.2773281470757531, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 31799 + }, + { + "epoch": 0.27733686836092164, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 31800 + }, + { + "epoch": 0.27734558964609024, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 31801 + }, + { + "epoch": 0.27735431093125884, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 31802 + }, + { + "epoch": 0.2773630322164274, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 31803 + }, + { + "epoch": 0.277371753501596, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 31804 + }, + { + "epoch": 0.2773804747867646, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 31805 + }, + { + "epoch": 0.2773891960719332, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 31806 + }, + { + "epoch": 0.27739791735710173, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 31807 + }, + { + "epoch": 0.27740663864227033, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 31808 + }, + { + "epoch": 0.27741535992743893, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 31809 + }, + { + "epoch": 0.2774240812126075, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 31810 + }, + { + "epoch": 0.2774328024977761, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 31811 + }, + { + "epoch": 0.2774415237829447, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 31812 + }, + { + "epoch": 0.2774502450681132, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 31813 + }, + { + "epoch": 0.2774589663532818, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 31814 + }, + { + "epoch": 0.2774676876384504, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 31815 + }, + { + "epoch": 0.27747640892361897, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 31816 + }, + { + "epoch": 0.27748513020878757, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 31817 + }, + { + "epoch": 0.27749385149395617, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 31818 + }, + { + "epoch": 0.2775025727791247, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 31819 + }, + { + "epoch": 0.2775112940642933, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 31820 + }, + { + "epoch": 0.2775200153494619, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 31821 + }, + { + "epoch": 0.27752873663463046, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 31822 + }, + { + "epoch": 0.27753745791979906, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 31823 + }, + { + "epoch": 0.27754617920496766, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 31824 + }, + { + "epoch": 0.2775549004901362, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 31825 + }, + { + "epoch": 0.2775636217753048, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 31826 + }, + { + "epoch": 0.2775723430604734, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 31827 + }, + { + "epoch": 0.27758106434564195, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 31828 + }, + { + "epoch": 0.27758978563081055, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 31829 + }, + { + "epoch": 0.27759850691597915, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 31830 + }, + { + "epoch": 0.27760722820114775, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 31831 + }, + { + "epoch": 0.2776159494863163, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 31832 + }, + { + "epoch": 0.2776246707714849, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 31833 + }, + { + "epoch": 0.2776333920566535, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 31834 + }, + { + "epoch": 0.27764211334182204, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 31835 + }, + { + "epoch": 0.27765083462699064, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 31836 + }, + { + "epoch": 0.27765955591215924, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 31837 + }, + { + "epoch": 0.2776682771973278, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 31838 + }, + { + "epoch": 0.2776769984824964, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 31839 + }, + { + "epoch": 0.277685719767665, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 31840 + }, + { + "epoch": 0.27769444105283353, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 31841 + }, + { + "epoch": 0.27770316233800213, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 31842 + }, + { + "epoch": 0.27771188362317073, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 31843 + }, + { + "epoch": 0.2777206049083393, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 31844 + }, + { + "epoch": 0.2777293261935079, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 31845 + }, + { + "epoch": 0.2777380474786765, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 31846 + }, + { + "epoch": 0.277746768763845, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 31847 + }, + { + "epoch": 0.2777554900490136, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 31848 + }, + { + "epoch": 0.2777642113341822, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 31849 + }, + { + "epoch": 0.27777293261935077, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 31850 + }, + { + "epoch": 0.27778165390451937, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 31851 + }, + { + "epoch": 0.27779037518968797, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 31852 + }, + { + "epoch": 0.2777990964748565, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 31853 + }, + { + "epoch": 0.2778078177600251, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 31854 + }, + { + "epoch": 0.2778165390451937, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 31855 + }, + { + "epoch": 0.27782526033036226, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 31856 + }, + { + "epoch": 0.27783398161553086, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 31857 + }, + { + "epoch": 0.27784270290069946, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 31858 + }, + { + "epoch": 0.27785142418586806, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 31859 + }, + { + "epoch": 0.2778601454710366, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 31860 + }, + { + "epoch": 0.2778688667562052, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 31861 + }, + { + "epoch": 0.2778775880413738, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 31862 + }, + { + "epoch": 0.27788630932654235, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 31863 + }, + { + "epoch": 0.27789503061171095, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 31864 + }, + { + "epoch": 0.27790375189687955, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 31865 + }, + { + "epoch": 0.2779124731820481, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 31866 + }, + { + "epoch": 0.2779211944672167, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 31867 + }, + { + "epoch": 0.2779299157523853, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 31868 + }, + { + "epoch": 0.27793863703755384, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 31869 + }, + { + "epoch": 0.27794735832272244, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 31870 + }, + { + "epoch": 0.27795607960789104, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 31871 + }, + { + "epoch": 0.2779648008930596, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 31872 + }, + { + "epoch": 0.2779735221782282, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 31873 + }, + { + "epoch": 0.2779822434633968, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 31874 + }, + { + "epoch": 0.27799096474856533, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 31875 + }, + { + "epoch": 0.27799968603373393, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 31876 + }, + { + "epoch": 0.27800840731890253, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 31877 + }, + { + "epoch": 0.2780171286040711, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 31878 + }, + { + "epoch": 0.2780258498892397, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 31879 + }, + { + "epoch": 0.2780345711744083, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 31880 + }, + { + "epoch": 0.2780432924595768, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 0.981, + "step": 31881 + }, + { + "epoch": 0.2780520137447454, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 31882 + }, + { + "epoch": 0.278060735029914, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 31883 + }, + { + "epoch": 0.27806945631508256, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 31884 + }, + { + "epoch": 0.27807817760025116, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 31885 + }, + { + "epoch": 0.27808689888541976, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 31886 + }, + { + "epoch": 0.27809562017058836, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 31887 + }, + { + "epoch": 0.2781043414557569, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 31888 + }, + { + "epoch": 0.2781130627409255, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 31889 + }, + { + "epoch": 0.2781217840260941, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 31890 + }, + { + "epoch": 0.27813050531126265, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 31891 + }, + { + "epoch": 0.27813922659643125, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 31892 + }, + { + "epoch": 0.27814794788159986, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 31893 + }, + { + "epoch": 0.2781566691667684, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 31894 + }, + { + "epoch": 0.278165390451937, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 31895 + }, + { + "epoch": 0.2781741117371056, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 31896 + }, + { + "epoch": 0.27818283302227415, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 31897 + }, + { + "epoch": 0.27819155430744275, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 31898 + }, + { + "epoch": 0.27820027559261135, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 31899 + }, + { + "epoch": 0.2782089968777799, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 31900 + }, + { + "epoch": 0.2782177181629485, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 31901 + }, + { + "epoch": 0.2782264394481171, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 31902 + }, + { + "epoch": 0.27823516073328564, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 31903 + }, + { + "epoch": 0.27824388201845424, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 31904 + }, + { + "epoch": 0.27825260330362284, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 31905 + }, + { + "epoch": 0.2782613245887914, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 31906 + }, + { + "epoch": 0.27827004587396, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 31907 + }, + { + "epoch": 0.2782787671591286, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 31908 + }, + { + "epoch": 0.2782874884442971, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 31909 + }, + { + "epoch": 0.2782962097294657, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 0.9823, + "step": 31910 + }, + { + "epoch": 0.2783049310146343, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 31911 + }, + { + "epoch": 0.27831365229980287, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 31912 + }, + { + "epoch": 0.27832237358497147, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 31913 + }, + { + "epoch": 0.27833109487014007, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 31914 + }, + { + "epoch": 0.2783398161553087, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 31915 + }, + { + "epoch": 0.2783485374404772, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 31916 + }, + { + "epoch": 0.2783572587256458, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 31917 + }, + { + "epoch": 0.2783659800108144, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 31918 + }, + { + "epoch": 0.27837470129598296, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 31919 + }, + { + "epoch": 0.27838342258115156, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 31920 + }, + { + "epoch": 0.27839214386632016, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 31921 + }, + { + "epoch": 0.2784008651514887, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 31922 + }, + { + "epoch": 0.2784095864366573, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 31923 + }, + { + "epoch": 0.2784183077218259, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 31924 + }, + { + "epoch": 0.27842702900699445, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 31925 + }, + { + "epoch": 0.27843575029216305, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 31926 + }, + { + "epoch": 0.27844447157733165, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 31927 + }, + { + "epoch": 0.2784531928625002, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 31928 + }, + { + "epoch": 0.2784619141476688, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 31929 + }, + { + "epoch": 0.2784706354328374, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 31930 + }, + { + "epoch": 0.27847935671800594, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 31931 + }, + { + "epoch": 0.27848807800317454, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 31932 + }, + { + "epoch": 0.27849679928834314, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 31933 + }, + { + "epoch": 0.2785055205735117, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 31934 + }, + { + "epoch": 0.2785142418586803, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 31935 + }, + { + "epoch": 0.2785229631438489, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 31936 + }, + { + "epoch": 0.27853168442901743, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 31937 + }, + { + "epoch": 0.27854040571418603, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 31938 + }, + { + "epoch": 0.27854912699935463, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 31939 + }, + { + "epoch": 0.27855784828452324, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 31940 + }, + { + "epoch": 0.2785665695696918, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 31941 + }, + { + "epoch": 0.2785752908548604, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 31942 + }, + { + "epoch": 0.278584012140029, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 31943 + }, + { + "epoch": 0.2785927334251975, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 31944 + }, + { + "epoch": 0.2786014547103661, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 31945 + }, + { + "epoch": 0.2786101759955347, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 31946 + }, + { + "epoch": 0.27861889728070327, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 31947 + }, + { + "epoch": 0.27862761856587187, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 31948 + }, + { + "epoch": 0.27863633985104047, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 31949 + }, + { + "epoch": 0.278645061136209, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 31950 + }, + { + "epoch": 0.2786537824213776, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 31951 + }, + { + "epoch": 0.2786625037065462, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 31952 + }, + { + "epoch": 0.27867122499171476, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 31953 + }, + { + "epoch": 0.27867994627688336, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 31954 + }, + { + "epoch": 0.27868866756205196, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 31955 + }, + { + "epoch": 0.2786973888472205, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 31956 + }, + { + "epoch": 0.2787061101323891, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 31957 + }, + { + "epoch": 0.2787148314175577, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 31958 + }, + { + "epoch": 0.27872355270272625, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 31959 + }, + { + "epoch": 0.27873227398789485, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 31960 + }, + { + "epoch": 0.27874099527306345, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 31961 + }, + { + "epoch": 0.278749716558232, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 31962 + }, + { + "epoch": 0.2787584378434006, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 31963 + }, + { + "epoch": 0.2787671591285692, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 31964 + }, + { + "epoch": 0.27877588041373774, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 31965 + }, + { + "epoch": 0.27878460169890634, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 31966 + }, + { + "epoch": 0.27879332298407494, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 31967 + }, + { + "epoch": 0.27880204426924354, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 31968 + }, + { + "epoch": 0.2788107655544121, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 31969 + }, + { + "epoch": 0.2788194868395807, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 31970 + }, + { + "epoch": 0.2788282081247493, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 31971 + }, + { + "epoch": 0.27883692940991783, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 31972 + }, + { + "epoch": 0.27884565069508643, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 31973 + }, + { + "epoch": 0.27885437198025503, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 31974 + }, + { + "epoch": 0.2788630932654236, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 31975 + }, + { + "epoch": 0.2788718145505922, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 31976 + }, + { + "epoch": 0.2788805358357608, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 31977 + }, + { + "epoch": 0.2788892571209293, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 31978 + }, + { + "epoch": 0.2788979784060979, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 31979 + }, + { + "epoch": 0.2789066996912665, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 31980 + }, + { + "epoch": 0.27891542097643507, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 31981 + }, + { + "epoch": 0.27892414226160367, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 31982 + }, + { + "epoch": 0.27893286354677227, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 31983 + }, + { + "epoch": 0.2789415848319408, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 31984 + }, + { + "epoch": 0.2789503061171094, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 31985 + }, + { + "epoch": 0.278959027402278, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 31986 + }, + { + "epoch": 0.27896774868744656, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 31987 + }, + { + "epoch": 0.27897646997261516, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 31988 + }, + { + "epoch": 0.27898519125778376, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 31989 + }, + { + "epoch": 0.2789939125429523, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 31990 + }, + { + "epoch": 0.2790026338281209, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 31991 + }, + { + "epoch": 0.2790113551132895, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 31992 + }, + { + "epoch": 0.27902007639845805, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 31993 + }, + { + "epoch": 0.27902879768362665, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 31994 + }, + { + "epoch": 0.27903751896879525, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 31995 + }, + { + "epoch": 0.27904624025396385, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 31996 + }, + { + "epoch": 0.2790549615391324, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 31997 + }, + { + "epoch": 0.279063682824301, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 31998 + }, + { + "epoch": 0.2790724041094696, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 31999 + }, + { + "epoch": 0.27908112539463814, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 32000 + }, + { + "epoch": 0.27908984667980674, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 32001 + }, + { + "epoch": 0.27909856796497534, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 32002 + }, + { + "epoch": 0.2791072892501439, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 32003 + }, + { + "epoch": 0.2791160105353125, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 32004 + }, + { + "epoch": 0.2791247318204811, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 32005 + }, + { + "epoch": 0.27913345310564963, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 32006 + }, + { + "epoch": 0.27914217439081823, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 32007 + }, + { + "epoch": 0.27915089567598683, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 32008 + }, + { + "epoch": 0.2791596169611554, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 32009 + }, + { + "epoch": 0.279168338246324, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 32010 + }, + { + "epoch": 0.2791770595314926, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 32011 + }, + { + "epoch": 0.2791857808166611, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 32012 + }, + { + "epoch": 0.2791945021018297, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 32013 + }, + { + "epoch": 0.2792032233869983, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 32014 + }, + { + "epoch": 0.27921194467216687, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 32015 + }, + { + "epoch": 0.27922066595733547, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 32016 + }, + { + "epoch": 0.27922938724250407, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 32017 + }, + { + "epoch": 0.2792381085276726, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 32018 + }, + { + "epoch": 0.2792468298128412, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 32019 + }, + { + "epoch": 0.2792555510980098, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 32020 + }, + { + "epoch": 0.27926427238317836, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 32021 + }, + { + "epoch": 0.27927299366834696, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 32022 + }, + { + "epoch": 0.27928171495351556, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 32023 + }, + { + "epoch": 0.27929043623868416, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 32024 + }, + { + "epoch": 0.2792991575238527, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 32025 + }, + { + "epoch": 0.2793078788090213, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 32026 + }, + { + "epoch": 0.2793166000941899, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 32027 + }, + { + "epoch": 0.27932532137935845, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 32028 + }, + { + "epoch": 0.27933404266452705, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 32029 + }, + { + "epoch": 0.27934276394969565, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 32030 + }, + { + "epoch": 0.2793514852348642, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 32031 + }, + { + "epoch": 0.2793602065200328, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9857, + "step": 32032 + }, + { + "epoch": 0.2793689278052014, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 32033 + }, + { + "epoch": 0.27937764909036994, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 32034 + }, + { + "epoch": 0.27938637037553854, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 32035 + }, + { + "epoch": 0.27939509166070714, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 32036 + }, + { + "epoch": 0.2794038129458757, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 32037 + }, + { + "epoch": 0.2794125342310443, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 32038 + }, + { + "epoch": 0.2794212555162129, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 32039 + }, + { + "epoch": 0.27942997680138143, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 32040 + }, + { + "epoch": 0.27943869808655003, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 32041 + }, + { + "epoch": 0.27944741937171863, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 32042 + }, + { + "epoch": 0.2794561406568872, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 32043 + }, + { + "epoch": 0.2794648619420558, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 32044 + }, + { + "epoch": 0.2794735832272244, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 32045 + }, + { + "epoch": 0.2794823045123929, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 32046 + }, + { + "epoch": 0.2794910257975615, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 32047 + }, + { + "epoch": 0.2794997470827301, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 32048 + }, + { + "epoch": 0.27950846836789867, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 32049 + }, + { + "epoch": 0.27951718965306727, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 32050 + }, + { + "epoch": 0.27952591093823587, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 32051 + }, + { + "epoch": 0.27953463222340447, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 32052 + }, + { + "epoch": 0.279543353508573, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 32053 + }, + { + "epoch": 0.2795520747937416, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 32054 + }, + { + "epoch": 0.2795607960789102, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 32055 + }, + { + "epoch": 0.27956951736407876, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 32056 + }, + { + "epoch": 0.27957823864924736, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 32057 + }, + { + "epoch": 0.27958695993441596, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 32058 + }, + { + "epoch": 0.2795956812195845, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 32059 + }, + { + "epoch": 0.2796044025047531, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 32060 + }, + { + "epoch": 0.2796131237899217, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 32061 + }, + { + "epoch": 0.27962184507509025, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 32062 + }, + { + "epoch": 0.27963056636025885, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 32063 + }, + { + "epoch": 0.27963928764542745, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 32064 + }, + { + "epoch": 0.279648008930596, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 32065 + }, + { + "epoch": 0.2796567302157646, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 32066 + }, + { + "epoch": 0.2796654515009332, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 32067 + }, + { + "epoch": 0.27967417278610174, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 32068 + }, + { + "epoch": 0.27968289407127034, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 32069 + }, + { + "epoch": 0.27969161535643894, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 32070 + }, + { + "epoch": 0.2797003366416075, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 32071 + }, + { + "epoch": 0.2797090579267761, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 32072 + }, + { + "epoch": 0.2797177792119447, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 32073 + }, + { + "epoch": 0.27972650049711323, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 32074 + }, + { + "epoch": 0.27973522178228183, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 32075 + }, + { + "epoch": 0.27974394306745043, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 32076 + }, + { + "epoch": 0.27975266435261903, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 32077 + }, + { + "epoch": 0.2797613856377876, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 32078 + }, + { + "epoch": 0.2797701069229562, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 32079 + }, + { + "epoch": 0.2797788282081248, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 32080 + }, + { + "epoch": 0.2797875494932933, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 32081 + }, + { + "epoch": 0.2797962707784619, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 32082 + }, + { + "epoch": 0.2798049920636305, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 32083 + }, + { + "epoch": 0.27981371334879906, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 32084 + }, + { + "epoch": 0.27982243463396766, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 32085 + }, + { + "epoch": 0.27983115591913627, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 32086 + }, + { + "epoch": 0.2798398772043048, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 32087 + }, + { + "epoch": 0.2798485984894734, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 32088 + }, + { + "epoch": 0.279857319774642, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 32089 + }, + { + "epoch": 0.27986604105981056, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 32090 + }, + { + "epoch": 0.27987476234497916, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 32091 + }, + { + "epoch": 0.27988348363014776, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 32092 + }, + { + "epoch": 0.2798922049153163, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 32093 + }, + { + "epoch": 0.2799009262004849, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 32094 + }, + { + "epoch": 0.2799096474856535, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 32095 + }, + { + "epoch": 0.27991836877082205, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 32096 + }, + { + "epoch": 0.27992709005599065, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 32097 + }, + { + "epoch": 0.27993581134115925, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 32098 + }, + { + "epoch": 0.2799445326263278, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 32099 + }, + { + "epoch": 0.2799532539114964, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 32100 + }, + { + "epoch": 0.279961975196665, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 32101 + }, + { + "epoch": 0.27997069648183354, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 32102 + }, + { + "epoch": 0.27997941776700214, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 32103 + }, + { + "epoch": 0.27998813905217074, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 32104 + }, + { + "epoch": 0.27999686033733934, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 32105 + }, + { + "epoch": 0.2800055816225079, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.981, + "step": 32106 + }, + { + "epoch": 0.2800143029076765, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 32107 + }, + { + "epoch": 0.2800230241928451, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 32108 + }, + { + "epoch": 0.2800317454780136, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 32109 + }, + { + "epoch": 0.2800404667631822, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 32110 + }, + { + "epoch": 0.28004918804835083, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 32111 + }, + { + "epoch": 0.2800579093335194, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 32112 + }, + { + "epoch": 0.280066630618688, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 32113 + }, + { + "epoch": 0.2800753519038566, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 32114 + }, + { + "epoch": 0.2800840731890251, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 32115 + }, + { + "epoch": 0.2800927944741937, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 32116 + }, + { + "epoch": 0.2801015157593623, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 32117 + }, + { + "epoch": 0.28011023704453086, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 32118 + }, + { + "epoch": 0.28011895832969946, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 32119 + }, + { + "epoch": 0.28012767961486806, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 32120 + }, + { + "epoch": 0.2801364009000366, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 32121 + }, + { + "epoch": 0.2801451221852052, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 32122 + }, + { + "epoch": 0.2801538434703738, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 32123 + }, + { + "epoch": 0.28016256475554235, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 32124 + }, + { + "epoch": 0.28017128604071095, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 32125 + }, + { + "epoch": 0.28018000732587955, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 32126 + }, + { + "epoch": 0.2801887286110481, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 32127 + }, + { + "epoch": 0.2801974498962167, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 32128 + }, + { + "epoch": 0.2802061711813853, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 32129 + }, + { + "epoch": 0.28021489246655384, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 32130 + }, + { + "epoch": 0.28022361375172244, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 32131 + }, + { + "epoch": 0.28023233503689104, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 32132 + }, + { + "epoch": 0.28024105632205965, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 0.9736, + "step": 32133 + }, + { + "epoch": 0.2802497776072282, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 32134 + }, + { + "epoch": 0.2802584988923968, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 32135 + }, + { + "epoch": 0.2802672201775654, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 32136 + }, + { + "epoch": 0.28027594146273394, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 32137 + }, + { + "epoch": 0.28028466274790254, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 32138 + }, + { + "epoch": 0.28029338403307114, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 32139 + }, + { + "epoch": 0.2803021053182397, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 32140 + }, + { + "epoch": 0.2803108266034083, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 32141 + }, + { + "epoch": 0.2803195478885769, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 32142 + }, + { + "epoch": 0.2803282691737454, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 32143 + }, + { + "epoch": 0.280336990458914, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 32144 + }, + { + "epoch": 0.2803457117440826, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 32145 + }, + { + "epoch": 0.28035443302925117, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 32146 + }, + { + "epoch": 0.28036315431441977, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 32147 + }, + { + "epoch": 0.28037187559958837, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 32148 + }, + { + "epoch": 0.2803805968847569, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 32149 + }, + { + "epoch": 0.2803893181699255, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 32150 + }, + { + "epoch": 0.2803980394550941, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 32151 + }, + { + "epoch": 0.28040676074026266, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 32152 + }, + { + "epoch": 0.28041548202543126, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 32153 + }, + { + "epoch": 0.28042420331059986, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 32154 + }, + { + "epoch": 0.2804329245957684, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 32155 + }, + { + "epoch": 0.280441645880937, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 32156 + }, + { + "epoch": 0.2804503671661056, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 32157 + }, + { + "epoch": 0.28045908845127415, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 32158 + }, + { + "epoch": 0.28046780973644275, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 32159 + }, + { + "epoch": 0.28047653102161135, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 32160 + }, + { + "epoch": 0.28048525230677995, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 32161 + }, + { + "epoch": 0.2804939735919485, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 32162 + }, + { + "epoch": 0.2805026948771171, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 32163 + }, + { + "epoch": 0.2805114161622857, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 32164 + }, + { + "epoch": 0.28052013744745424, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 32165 + }, + { + "epoch": 0.28052885873262284, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 32166 + }, + { + "epoch": 0.28053758001779144, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 32167 + }, + { + "epoch": 0.28054630130296, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 32168 + }, + { + "epoch": 0.2805550225881286, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 32169 + }, + { + "epoch": 0.2805637438732972, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 32170 + }, + { + "epoch": 0.28057246515846573, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 32171 + }, + { + "epoch": 0.28058118644363433, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 32172 + }, + { + "epoch": 0.28058990772880293, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 32173 + }, + { + "epoch": 0.2805986290139715, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 32174 + }, + { + "epoch": 0.2806073502991401, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 32175 + }, + { + "epoch": 0.2806160715843087, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 32176 + }, + { + "epoch": 0.2806247928694772, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9788, + "step": 32177 + }, + { + "epoch": 0.2806335141546458, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 32178 + }, + { + "epoch": 0.2806422354398144, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 32179 + }, + { + "epoch": 0.28065095672498297, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 32180 + }, + { + "epoch": 0.28065967801015157, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 32181 + }, + { + "epoch": 0.28066839929532017, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 32182 + }, + { + "epoch": 0.2806771205804887, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 32183 + }, + { + "epoch": 0.2806858418656573, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 32184 + }, + { + "epoch": 0.2806945631508259, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 32185 + }, + { + "epoch": 0.2807032844359945, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 32186 + }, + { + "epoch": 0.28071200572116306, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 32187 + }, + { + "epoch": 0.28072072700633166, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 32188 + }, + { + "epoch": 0.28072944829150026, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 32189 + }, + { + "epoch": 0.2807381695766688, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 32190 + }, + { + "epoch": 0.2807468908618374, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 32191 + }, + { + "epoch": 0.280755612147006, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 32192 + }, + { + "epoch": 0.28076433343217455, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 32193 + }, + { + "epoch": 0.28077305471734315, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 32194 + }, + { + "epoch": 0.28078177600251175, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 32195 + }, + { + "epoch": 0.2807904972876803, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 32196 + }, + { + "epoch": 0.2807992185728489, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 32197 + }, + { + "epoch": 0.2808079398580175, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 32198 + }, + { + "epoch": 0.28081666114318604, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 32199 + }, + { + "epoch": 0.28082538242835464, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 32200 + }, + { + "epoch": 0.28083410371352324, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 32201 + }, + { + "epoch": 0.2808428249986918, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 32202 + }, + { + "epoch": 0.2808515462838604, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 32203 + }, + { + "epoch": 0.280860267569029, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 32204 + }, + { + "epoch": 0.28086898885419753, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 32205 + }, + { + "epoch": 0.28087771013936613, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 32206 + }, + { + "epoch": 0.28088643142453473, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 32207 + }, + { + "epoch": 0.2808951527097033, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 32208 + }, + { + "epoch": 0.2809038739948719, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 32209 + }, + { + "epoch": 0.2809125952800405, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 32210 + }, + { + "epoch": 0.280921316565209, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 32211 + }, + { + "epoch": 0.2809300378503776, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 32212 + }, + { + "epoch": 0.2809387591355462, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 32213 + }, + { + "epoch": 0.2809474804207148, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 32214 + }, + { + "epoch": 0.28095620170588337, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 32215 + }, + { + "epoch": 0.28096492299105197, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 32216 + }, + { + "epoch": 0.28097364427622057, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 32217 + }, + { + "epoch": 0.2809823655613891, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 32218 + }, + { + "epoch": 0.2809910868465577, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 32219 + }, + { + "epoch": 0.2809998081317263, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 32220 + }, + { + "epoch": 0.28100852941689486, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 32221 + }, + { + "epoch": 0.28101725070206346, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 32222 + }, + { + "epoch": 0.28102597198723206, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 32223 + }, + { + "epoch": 0.2810346932724006, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 32224 + }, + { + "epoch": 0.2810434145575692, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 32225 + }, + { + "epoch": 0.2810521358427378, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 32226 + }, + { + "epoch": 0.28106085712790635, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 32227 + }, + { + "epoch": 0.28106957841307495, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 32228 + }, + { + "epoch": 0.28107829969824355, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 32229 + }, + { + "epoch": 0.2810870209834121, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 32230 + }, + { + "epoch": 0.2810957422685807, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 32231 + }, + { + "epoch": 0.2811044635537493, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 32232 + }, + { + "epoch": 0.28111318483891784, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 32233 + }, + { + "epoch": 0.28112190612408644, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 32234 + }, + { + "epoch": 0.28113062740925504, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 32235 + }, + { + "epoch": 0.2811393486944236, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 32236 + }, + { + "epoch": 0.2811480699795922, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 32237 + }, + { + "epoch": 0.2811567912647608, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 32238 + }, + { + "epoch": 0.28116551254992933, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 32239 + }, + { + "epoch": 0.28117423383509793, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 32240 + }, + { + "epoch": 0.28118295512026653, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 32241 + }, + { + "epoch": 0.28119167640543513, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 32242 + }, + { + "epoch": 0.2812003976906037, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 32243 + }, + { + "epoch": 0.2812091189757723, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 32244 + }, + { + "epoch": 0.2812178402609409, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 32245 + }, + { + "epoch": 0.2812265615461094, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 32246 + }, + { + "epoch": 0.281235282831278, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 32247 + }, + { + "epoch": 0.2812440041164466, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 32248 + }, + { + "epoch": 0.28125272540161517, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 32249 + }, + { + "epoch": 0.28126144668678377, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 32250 + }, + { + "epoch": 0.28127016797195237, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 32251 + }, + { + "epoch": 0.2812788892571209, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 32252 + }, + { + "epoch": 0.2812876105422895, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 32253 + }, + { + "epoch": 0.2812963318274581, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 32254 + }, + { + "epoch": 0.28130505311262666, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 32255 + }, + { + "epoch": 0.28131377439779526, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 32256 + }, + { + "epoch": 0.28132249568296386, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 32257 + }, + { + "epoch": 0.2813312169681324, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 32258 + }, + { + "epoch": 0.281339938253301, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 32259 + }, + { + "epoch": 0.2813486595384696, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 32260 + }, + { + "epoch": 0.28135738082363815, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 32261 + }, + { + "epoch": 0.28136610210880675, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 32262 + }, + { + "epoch": 0.28137482339397535, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 32263 + }, + { + "epoch": 0.2813835446791439, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 32264 + }, + { + "epoch": 0.2813922659643125, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 32265 + }, + { + "epoch": 0.2814009872494811, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 32266 + }, + { + "epoch": 0.28140970853464964, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 32267 + }, + { + "epoch": 0.28141842981981824, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 32268 + }, + { + "epoch": 0.28142715110498684, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 32269 + }, + { + "epoch": 0.28143587239015544, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 32270 + }, + { + "epoch": 0.281444593675324, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 32271 + }, + { + "epoch": 0.2814533149604926, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 32272 + }, + { + "epoch": 0.2814620362456612, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 32273 + }, + { + "epoch": 0.28147075753082973, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9809, + "step": 32274 + }, + { + "epoch": 0.28147947881599833, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 32275 + }, + { + "epoch": 0.28148820010116693, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 32276 + }, + { + "epoch": 0.2814969213863355, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 32277 + }, + { + "epoch": 0.2815056426715041, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 32278 + }, + { + "epoch": 0.2815143639566727, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 32279 + }, + { + "epoch": 0.2815230852418412, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 32280 + }, + { + "epoch": 0.2815318065270098, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 32281 + }, + { + "epoch": 0.2815405278121784, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 32282 + }, + { + "epoch": 0.28154924909734697, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 32283 + }, + { + "epoch": 0.28155797038251557, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 32284 + }, + { + "epoch": 0.28156669166768417, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 32285 + }, + { + "epoch": 0.2815754129528527, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 32286 + }, + { + "epoch": 0.2815841342380213, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 32287 + }, + { + "epoch": 0.2815928555231899, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 32288 + }, + { + "epoch": 0.28160157680835846, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 32289 + }, + { + "epoch": 0.28161029809352706, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 32290 + }, + { + "epoch": 0.28161901937869566, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 32291 + }, + { + "epoch": 0.2816277406638642, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 32292 + }, + { + "epoch": 0.2816364619490328, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 32293 + }, + { + "epoch": 0.2816451832342014, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 32294 + }, + { + "epoch": 0.28165390451937, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 32295 + }, + { + "epoch": 0.28166262580453855, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 32296 + }, + { + "epoch": 0.28167134708970715, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 32297 + }, + { + "epoch": 0.28168006837487575, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 32298 + }, + { + "epoch": 0.2816887896600443, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 32299 + }, + { + "epoch": 0.2816975109452129, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 32300 + }, + { + "epoch": 0.2817062322303815, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 32301 + }, + { + "epoch": 0.28171495351555004, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 32302 + }, + { + "epoch": 0.28172367480071864, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 32303 + }, + { + "epoch": 0.28173239608588724, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 32304 + }, + { + "epoch": 0.2817411173710558, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 32305 + }, + { + "epoch": 0.2817498386562244, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 32306 + }, + { + "epoch": 0.281758559941393, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 32307 + }, + { + "epoch": 0.28176728122656153, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 32308 + }, + { + "epoch": 0.28177600251173013, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 32309 + }, + { + "epoch": 0.28178472379689873, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 32310 + }, + { + "epoch": 0.2817934450820673, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 32311 + }, + { + "epoch": 0.2818021663672359, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 32312 + }, + { + "epoch": 0.2818108876524045, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 32313 + }, + { + "epoch": 0.281819608937573, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 32314 + }, + { + "epoch": 0.2818283302227416, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 32315 + }, + { + "epoch": 0.2818370515079102, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 32316 + }, + { + "epoch": 0.28184577279307876, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 32317 + }, + { + "epoch": 0.28185449407824736, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 32318 + }, + { + "epoch": 0.28186321536341596, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 32319 + }, + { + "epoch": 0.2818719366485845, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 32320 + }, + { + "epoch": 0.2818806579337531, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 32321 + }, + { + "epoch": 0.2818893792189217, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 32322 + }, + { + "epoch": 0.2818981005040903, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 32323 + }, + { + "epoch": 0.28190682178925885, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 32324 + }, + { + "epoch": 0.28191554307442745, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 32325 + }, + { + "epoch": 0.28192426435959606, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 32326 + }, + { + "epoch": 0.2819329856447646, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 32327 + }, + { + "epoch": 0.2819417069299332, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 32328 + }, + { + "epoch": 0.2819504282151018, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 32329 + }, + { + "epoch": 0.28195914950027035, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 32330 + }, + { + "epoch": 0.28196787078543895, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 32331 + }, + { + "epoch": 0.28197659207060755, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 32332 + }, + { + "epoch": 0.2819853133557761, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 32333 + }, + { + "epoch": 0.2819940346409447, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 32334 + }, + { + "epoch": 0.2820027559261133, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 32335 + }, + { + "epoch": 0.28201147721128184, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 32336 + }, + { + "epoch": 0.28202019849645044, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 32337 + }, + { + "epoch": 0.28202891978161904, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 32338 + }, + { + "epoch": 0.2820376410667876, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 32339 + }, + { + "epoch": 0.2820463623519562, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 32340 + }, + { + "epoch": 0.2820550836371248, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 32341 + }, + { + "epoch": 0.2820638049222933, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 32342 + }, + { + "epoch": 0.2820725262074619, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 32343 + }, + { + "epoch": 0.2820812474926305, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 32344 + }, + { + "epoch": 0.28208996877779907, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 32345 + }, + { + "epoch": 0.28209869006296767, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 32346 + }, + { + "epoch": 0.28210741134813627, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 32347 + }, + { + "epoch": 0.2821161326333048, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 32348 + }, + { + "epoch": 0.2821248539184734, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 32349 + }, + { + "epoch": 0.282133575203642, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 32350 + }, + { + "epoch": 0.2821422964888106, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 32351 + }, + { + "epoch": 0.28215101777397916, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 32352 + }, + { + "epoch": 0.28215973905914776, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 32353 + }, + { + "epoch": 0.28216846034431636, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 32354 + }, + { + "epoch": 0.2821771816294849, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 32355 + }, + { + "epoch": 0.2821859029146535, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 32356 + }, + { + "epoch": 0.2821946241998221, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 32357 + }, + { + "epoch": 0.28220334548499065, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 32358 + }, + { + "epoch": 0.28221206677015925, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 32359 + }, + { + "epoch": 0.28222078805532785, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 32360 + }, + { + "epoch": 0.2822295093404964, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 32361 + }, + { + "epoch": 0.282238230625665, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 32362 + }, + { + "epoch": 0.2822469519108336, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 32363 + }, + { + "epoch": 0.28225567319600214, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 32364 + }, + { + "epoch": 0.28226439448117074, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 32365 + }, + { + "epoch": 0.28227311576633934, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 32366 + }, + { + "epoch": 0.2822818370515079, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 32367 + }, + { + "epoch": 0.2822905583366765, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 32368 + }, + { + "epoch": 0.2822992796218451, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 32369 + }, + { + "epoch": 0.28230800090701363, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 32370 + }, + { + "epoch": 0.28231672219218223, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 32371 + }, + { + "epoch": 0.28232544347735083, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 32372 + }, + { + "epoch": 0.2823341647625194, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 32373 + }, + { + "epoch": 0.282342886047688, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 32374 + }, + { + "epoch": 0.2823516073328566, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 32375 + }, + { + "epoch": 0.2823603286180251, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 32376 + }, + { + "epoch": 0.2823690499031937, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 32377 + }, + { + "epoch": 0.2823777711883623, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 32378 + }, + { + "epoch": 0.2823864924735309, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 32379 + }, + { + "epoch": 0.28239521375869947, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 32380 + }, + { + "epoch": 0.28240393504386807, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 32381 + }, + { + "epoch": 0.28241265632903667, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 32382 + }, + { + "epoch": 0.2824213776142052, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 32383 + }, + { + "epoch": 0.2824300988993738, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 32384 + }, + { + "epoch": 0.2824388201845424, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 32385 + }, + { + "epoch": 0.28244754146971096, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 32386 + }, + { + "epoch": 0.28245626275487956, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 32387 + }, + { + "epoch": 0.28246498404004816, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 32388 + }, + { + "epoch": 0.2824737053252167, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 32389 + }, + { + "epoch": 0.2824824266103853, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 32390 + }, + { + "epoch": 0.2824911478955539, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 32391 + }, + { + "epoch": 0.28249986918072245, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 32392 + }, + { + "epoch": 0.28250859046589105, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 32393 + }, + { + "epoch": 0.28251731175105965, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 32394 + }, + { + "epoch": 0.2825260330362282, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 32395 + }, + { + "epoch": 0.2825347543213968, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 32396 + }, + { + "epoch": 0.2825434756065654, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 32397 + }, + { + "epoch": 0.28255219689173394, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 32398 + }, + { + "epoch": 0.28256091817690254, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 32399 + }, + { + "epoch": 0.28256963946207114, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 32400 + }, + { + "epoch": 0.2825783607472397, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 32401 + }, + { + "epoch": 0.2825870820324083, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 32402 + }, + { + "epoch": 0.2825958033175769, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 32403 + }, + { + "epoch": 0.2826045246027455, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 32404 + }, + { + "epoch": 0.28261324588791403, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 32405 + }, + { + "epoch": 0.28262196717308263, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 32406 + }, + { + "epoch": 0.28263068845825123, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 32407 + }, + { + "epoch": 0.2826394097434198, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 32408 + }, + { + "epoch": 0.2826481310285884, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 32409 + }, + { + "epoch": 0.282656852313757, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 32410 + }, + { + "epoch": 0.2826655735989255, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 32411 + }, + { + "epoch": 0.2826742948840941, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 32412 + }, + { + "epoch": 0.2826830161692627, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 32413 + }, + { + "epoch": 0.28269173745443127, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 32414 + }, + { + "epoch": 0.28270045873959987, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 32415 + }, + { + "epoch": 0.28270918002476847, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 32416 + }, + { + "epoch": 0.282717901309937, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 32417 + }, + { + "epoch": 0.2827266225951056, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 32418 + }, + { + "epoch": 0.2827353438802742, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 32419 + }, + { + "epoch": 0.28274406516544276, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 32420 + }, + { + "epoch": 0.28275278645061136, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 32421 + }, + { + "epoch": 0.28276150773577996, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 32422 + }, + { + "epoch": 0.2827702290209485, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 32423 + }, + { + "epoch": 0.2827789503061171, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 32424 + }, + { + "epoch": 0.2827876715912857, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 32425 + }, + { + "epoch": 0.28279639287645425, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 32426 + }, + { + "epoch": 0.28280511416162285, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 32427 + }, + { + "epoch": 0.28281383544679145, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 32428 + }, + { + "epoch": 0.28282255673196, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 32429 + }, + { + "epoch": 0.2828312780171286, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 32430 + }, + { + "epoch": 0.2828399993022972, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 32431 + }, + { + "epoch": 0.2828487205874658, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 32432 + }, + { + "epoch": 0.28285744187263434, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 32433 + }, + { + "epoch": 0.28286616315780294, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 32434 + }, + { + "epoch": 0.28287488444297154, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 32435 + }, + { + "epoch": 0.2828836057281401, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 32436 + }, + { + "epoch": 0.2828923270133087, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 32437 + }, + { + "epoch": 0.2829010482984773, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9828, + "step": 32438 + }, + { + "epoch": 0.28290976958364583, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 32439 + }, + { + "epoch": 0.28291849086881443, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 32440 + }, + { + "epoch": 0.28292721215398303, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 32441 + }, + { + "epoch": 0.2829359334391516, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 32442 + }, + { + "epoch": 0.2829446547243202, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 32443 + }, + { + "epoch": 0.2829533760094888, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 32444 + }, + { + "epoch": 0.2829620972946573, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 32445 + }, + { + "epoch": 0.2829708185798259, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 32446 + }, + { + "epoch": 0.2829795398649945, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 32447 + }, + { + "epoch": 0.28298826115016307, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 32448 + }, + { + "epoch": 0.28299698243533167, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 32449 + }, + { + "epoch": 0.28300570372050027, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 32450 + }, + { + "epoch": 0.2830144250056688, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 32451 + }, + { + "epoch": 0.2830231462908374, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 32452 + }, + { + "epoch": 0.283031867576006, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 32453 + }, + { + "epoch": 0.28304058886117456, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 32454 + }, + { + "epoch": 0.28304931014634316, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 32455 + }, + { + "epoch": 0.28305803143151176, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 32456 + }, + { + "epoch": 0.2830667527166803, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 32457 + }, + { + "epoch": 0.2830754740018489, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 32458 + }, + { + "epoch": 0.2830841952870175, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 32459 + }, + { + "epoch": 0.2830929165721861, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9851, + "step": 32460 + }, + { + "epoch": 0.28310163785735465, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 32461 + }, + { + "epoch": 0.28311035914252325, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 32462 + }, + { + "epoch": 0.28311908042769185, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 32463 + }, + { + "epoch": 0.2831278017128604, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 32464 + }, + { + "epoch": 0.283136522998029, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 32465 + }, + { + "epoch": 0.2831452442831976, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 32466 + }, + { + "epoch": 0.28315396556836614, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 32467 + }, + { + "epoch": 0.28316268685353474, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 32468 + }, + { + "epoch": 0.28317140813870334, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 32469 + }, + { + "epoch": 0.2831801294238719, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 32470 + }, + { + "epoch": 0.2831888507090405, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 32471 + }, + { + "epoch": 0.2831975719942091, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 32472 + }, + { + "epoch": 0.28320629327937763, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 32473 + }, + { + "epoch": 0.28321501456454623, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 32474 + }, + { + "epoch": 0.28322373584971483, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 32475 + }, + { + "epoch": 0.2832324571348834, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 32476 + }, + { + "epoch": 0.283241178420052, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 32477 + }, + { + "epoch": 0.2832498997052206, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 32478 + }, + { + "epoch": 0.2832586209903891, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 32479 + }, + { + "epoch": 0.2832673422755577, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 32480 + }, + { + "epoch": 0.2832760635607263, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 32481 + }, + { + "epoch": 0.28328478484589487, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 32482 + }, + { + "epoch": 0.28329350613106347, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 32483 + }, + { + "epoch": 0.28330222741623207, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 32484 + }, + { + "epoch": 0.2833109487014006, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 32485 + }, + { + "epoch": 0.2833196699865692, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 32486 + }, + { + "epoch": 0.2833283912717378, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 32487 + }, + { + "epoch": 0.2833371125569064, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 32488 + }, + { + "epoch": 0.28334583384207496, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 32489 + }, + { + "epoch": 0.28335455512724356, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 32490 + }, + { + "epoch": 0.28336327641241216, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 32491 + }, + { + "epoch": 0.2833719976975807, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 32492 + }, + { + "epoch": 0.2833807189827493, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 32493 + }, + { + "epoch": 0.2833894402679179, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 32494 + }, + { + "epoch": 0.28339816155308645, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 32495 + }, + { + "epoch": 0.28340688283825505, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 32496 + }, + { + "epoch": 0.28341560412342365, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 32497 + }, + { + "epoch": 0.2834243254085922, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 32498 + }, + { + "epoch": 0.2834330466937608, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 32499 + }, + { + "epoch": 0.2834417679789294, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 32500 + }, + { + "epoch": 0.28345048926409794, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 32501 + }, + { + "epoch": 0.28345921054926654, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 32502 + }, + { + "epoch": 0.28346793183443514, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 32503 + }, + { + "epoch": 0.2834766531196037, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 32504 + }, + { + "epoch": 0.2834853744047723, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 32505 + }, + { + "epoch": 0.2834940956899409, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 32506 + }, + { + "epoch": 0.28350281697510943, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 32507 + }, + { + "epoch": 0.28351153826027803, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 32508 + }, + { + "epoch": 0.28352025954544663, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 32509 + }, + { + "epoch": 0.2835289808306152, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 32510 + }, + { + "epoch": 0.2835377021157838, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 32511 + }, + { + "epoch": 0.2835464234009524, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 32512 + }, + { + "epoch": 0.2835551446861209, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 32513 + }, + { + "epoch": 0.2835638659712895, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 32514 + }, + { + "epoch": 0.2835725872564581, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 32515 + }, + { + "epoch": 0.2835813085416267, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 32516 + }, + { + "epoch": 0.28359002982679526, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 32517 + }, + { + "epoch": 0.28359875111196386, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 32518 + }, + { + "epoch": 0.28360747239713247, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 32519 + }, + { + "epoch": 0.283616193682301, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 32520 + }, + { + "epoch": 0.2836249149674696, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 32521 + }, + { + "epoch": 0.2836336362526382, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 32522 + }, + { + "epoch": 0.28364235753780676, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 32523 + }, + { + "epoch": 0.28365107882297536, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 32524 + }, + { + "epoch": 0.28365980010814396, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 32525 + }, + { + "epoch": 0.2836685213933125, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 32526 + }, + { + "epoch": 0.2836772426784811, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 32527 + }, + { + "epoch": 0.2836859639636497, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 32528 + }, + { + "epoch": 0.28369468524881825, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 32529 + }, + { + "epoch": 0.28370340653398685, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 32530 + }, + { + "epoch": 0.28371212781915545, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 32531 + }, + { + "epoch": 0.283720849104324, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 32532 + }, + { + "epoch": 0.2837295703894926, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 32533 + }, + { + "epoch": 0.2837382916746612, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 32534 + }, + { + "epoch": 0.28374701295982974, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 32535 + }, + { + "epoch": 0.28375573424499834, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 32536 + }, + { + "epoch": 0.28376445553016694, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 32537 + }, + { + "epoch": 0.2837731768153355, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 32538 + }, + { + "epoch": 0.2837818981005041, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 32539 + }, + { + "epoch": 0.2837906193856727, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 32540 + }, + { + "epoch": 0.2837993406708413, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 32541 + }, + { + "epoch": 0.2838080619560098, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 32542 + }, + { + "epoch": 0.2838167832411784, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 32543 + }, + { + "epoch": 0.28382550452634703, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 32544 + }, + { + "epoch": 0.2838342258115156, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 32545 + }, + { + "epoch": 0.2838429470966842, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 32546 + }, + { + "epoch": 0.2838516683818528, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 32547 + }, + { + "epoch": 0.2838603896670213, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 32548 + }, + { + "epoch": 0.2838691109521899, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 32549 + }, + { + "epoch": 0.2838778322373585, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 32550 + }, + { + "epoch": 0.28388655352252706, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 32551 + }, + { + "epoch": 0.28389527480769566, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 32552 + }, + { + "epoch": 0.28390399609286426, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 32553 + }, + { + "epoch": 0.2839127173780328, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 32554 + }, + { + "epoch": 0.2839214386632014, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 32555 + }, + { + "epoch": 0.28393015994837, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 32556 + }, + { + "epoch": 0.28393888123353855, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 32557 + }, + { + "epoch": 0.28394760251870715, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 32558 + }, + { + "epoch": 0.28395632380387575, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 32559 + }, + { + "epoch": 0.2839650450890443, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9845, + "step": 32560 + }, + { + "epoch": 0.2839737663742129, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 32561 + }, + { + "epoch": 0.2839824876593815, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 32562 + }, + { + "epoch": 0.28399120894455004, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 32563 + }, + { + "epoch": 0.28399993022971864, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 32564 + }, + { + "epoch": 0.28400865151488724, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 32565 + }, + { + "epoch": 0.2840173728000558, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 32566 + }, + { + "epoch": 0.2840260940852244, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 32567 + }, + { + "epoch": 0.284034815370393, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 32568 + }, + { + "epoch": 0.2840435366555616, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 32569 + }, + { + "epoch": 0.28405225794073014, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 32570 + }, + { + "epoch": 0.28406097922589874, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 32571 + }, + { + "epoch": 0.28406970051106734, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 32572 + }, + { + "epoch": 0.2840784217962359, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 32573 + }, + { + "epoch": 0.2840871430814045, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 32574 + }, + { + "epoch": 0.2840958643665731, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 32575 + }, + { + "epoch": 0.2841045856517416, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 32576 + }, + { + "epoch": 0.2841133069369102, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 32577 + }, + { + "epoch": 0.2841220282220788, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 32578 + }, + { + "epoch": 0.28413074950724737, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 32579 + }, + { + "epoch": 0.28413947079241597, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 32580 + }, + { + "epoch": 0.28414819207758457, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 32581 + }, + { + "epoch": 0.2841569133627531, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 32582 + }, + { + "epoch": 0.2841656346479217, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 32583 + }, + { + "epoch": 0.2841743559330903, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 32584 + }, + { + "epoch": 0.28418307721825886, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 32585 + }, + { + "epoch": 0.28419179850342746, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 32586 + }, + { + "epoch": 0.28420051978859606, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 32587 + }, + { + "epoch": 0.2842092410737646, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 32588 + }, + { + "epoch": 0.2842179623589332, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 32589 + }, + { + "epoch": 0.2842266836441018, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 32590 + }, + { + "epoch": 0.28423540492927035, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 32591 + }, + { + "epoch": 0.28424412621443895, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 32592 + }, + { + "epoch": 0.28425284749960755, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 32593 + }, + { + "epoch": 0.2842615687847761, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 32594 + }, + { + "epoch": 0.2842702900699447, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 32595 + }, + { + "epoch": 0.2842790113551133, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 32596 + }, + { + "epoch": 0.2842877326402819, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 32597 + }, + { + "epoch": 0.28429645392545044, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 32598 + }, + { + "epoch": 0.28430517521061904, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 32599 + }, + { + "epoch": 0.28431389649578764, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 32600 + }, + { + "epoch": 0.2843226177809562, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 32601 + }, + { + "epoch": 0.2843313390661248, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 32602 + }, + { + "epoch": 0.2843400603512934, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 32603 + }, + { + "epoch": 0.28434878163646193, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 32604 + }, + { + "epoch": 0.28435750292163053, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 32605 + }, + { + "epoch": 0.28436622420679913, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 32606 + }, + { + "epoch": 0.2843749454919677, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 32607 + }, + { + "epoch": 0.2843836667771363, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 32608 + }, + { + "epoch": 0.2843923880623049, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 32609 + }, + { + "epoch": 0.2844011093474734, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 32610 + }, + { + "epoch": 0.284409830632642, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 32611 + }, + { + "epoch": 0.2844185519178106, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 32612 + }, + { + "epoch": 0.28442727320297917, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 32613 + }, + { + "epoch": 0.28443599448814777, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 32614 + }, + { + "epoch": 0.28444471577331637, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 32615 + }, + { + "epoch": 0.2844534370584849, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 32616 + }, + { + "epoch": 0.2844621583436535, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 32617 + }, + { + "epoch": 0.2844708796288221, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 32618 + }, + { + "epoch": 0.28447960091399066, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 32619 + }, + { + "epoch": 0.28448832219915926, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 32620 + }, + { + "epoch": 0.28449704348432786, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 32621 + }, + { + "epoch": 0.2845057647694964, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 32622 + }, + { + "epoch": 0.284514486054665, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 32623 + }, + { + "epoch": 0.2845232073398336, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 32624 + }, + { + "epoch": 0.2845319286250022, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 32625 + }, + { + "epoch": 0.28454064991017075, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 32626 + }, + { + "epoch": 0.28454937119533935, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 32627 + }, + { + "epoch": 0.28455809248050795, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 32628 + }, + { + "epoch": 0.2845668137656765, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 32629 + }, + { + "epoch": 0.2845755350508451, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 32630 + }, + { + "epoch": 0.2845842563360137, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 32631 + }, + { + "epoch": 0.28459297762118224, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 32632 + }, + { + "epoch": 0.28460169890635084, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 32633 + }, + { + "epoch": 0.28461042019151944, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 32634 + }, + { + "epoch": 0.284619141476688, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 32635 + }, + { + "epoch": 0.2846278627618566, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 32636 + }, + { + "epoch": 0.2846365840470252, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 32637 + }, + { + "epoch": 0.28464530533219373, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 32638 + }, + { + "epoch": 0.28465402661736233, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 32639 + }, + { + "epoch": 0.28466274790253093, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 32640 + }, + { + "epoch": 0.2846714691876995, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 32641 + }, + { + "epoch": 0.2846801904728681, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9857, + "step": 32642 + }, + { + "epoch": 0.2846889117580367, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 32643 + }, + { + "epoch": 0.2846976330432052, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 32644 + }, + { + "epoch": 0.2847063543283738, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 32645 + }, + { + "epoch": 0.2847150756135424, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 32646 + }, + { + "epoch": 0.28472379689871097, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 32647 + }, + { + "epoch": 0.28473251818387957, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 32648 + }, + { + "epoch": 0.28474123946904817, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 32649 + }, + { + "epoch": 0.28474996075421677, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 32650 + }, + { + "epoch": 0.2847586820393853, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 32651 + }, + { + "epoch": 0.2847674033245539, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 32652 + }, + { + "epoch": 0.2847761246097225, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 32653 + }, + { + "epoch": 0.28478484589489106, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 32654 + }, + { + "epoch": 0.28479356718005966, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 32655 + }, + { + "epoch": 0.28480228846522826, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 32656 + }, + { + "epoch": 0.2848110097503968, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 32657 + }, + { + "epoch": 0.2848197310355654, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 32658 + }, + { + "epoch": 0.284828452320734, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 32659 + }, + { + "epoch": 0.28483717360590255, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 32660 + }, + { + "epoch": 0.28484589489107115, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 32661 + }, + { + "epoch": 0.28485461617623975, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 32662 + }, + { + "epoch": 0.2848633374614083, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 32663 + }, + { + "epoch": 0.2848720587465769, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 32664 + }, + { + "epoch": 0.2848807800317455, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 32665 + }, + { + "epoch": 0.28488950131691404, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 32666 + }, + { + "epoch": 0.28489822260208264, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 32667 + }, + { + "epoch": 0.28490694388725124, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 32668 + }, + { + "epoch": 0.2849156651724198, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 32669 + }, + { + "epoch": 0.2849243864575884, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 32670 + }, + { + "epoch": 0.284933107742757, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 32671 + }, + { + "epoch": 0.28494182902792553, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 32672 + }, + { + "epoch": 0.28495055031309413, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 32673 + }, + { + "epoch": 0.28495927159826273, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 32674 + }, + { + "epoch": 0.2849679928834313, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 32675 + }, + { + "epoch": 0.2849767141685999, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 32676 + }, + { + "epoch": 0.2849854354537685, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 32677 + }, + { + "epoch": 0.2849941567389371, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 32678 + }, + { + "epoch": 0.2850028780241056, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 32679 + }, + { + "epoch": 0.2850115993092742, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 32680 + }, + { + "epoch": 0.2850203205944428, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 32681 + }, + { + "epoch": 0.28502904187961137, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 32682 + }, + { + "epoch": 0.28503776316477997, + "grad_norm": 0.41015625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 32683 + }, + { + "epoch": 0.28504648444994857, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 32684 + }, + { + "epoch": 0.2850552057351171, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 32685 + }, + { + "epoch": 0.2850639270202857, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 32686 + }, + { + "epoch": 0.2850726483054543, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 32687 + }, + { + "epoch": 0.28508136959062286, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 32688 + }, + { + "epoch": 0.28509009087579146, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 32689 + }, + { + "epoch": 0.28509881216096006, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 32690 + }, + { + "epoch": 0.2851075334461286, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 32691 + }, + { + "epoch": 0.2851162547312972, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 32692 + }, + { + "epoch": 0.2851249760164658, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 32693 + }, + { + "epoch": 0.28513369730163435, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 32694 + }, + { + "epoch": 0.28514241858680295, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 32695 + }, + { + "epoch": 0.28515113987197155, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 32696 + }, + { + "epoch": 0.2851598611571401, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 32697 + }, + { + "epoch": 0.2851685824423087, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 32698 + }, + { + "epoch": 0.2851773037274773, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 32699 + }, + { + "epoch": 0.28518602501264584, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 32700 + }, + { + "epoch": 0.28519474629781444, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 32701 + }, + { + "epoch": 0.28520346758298304, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 32702 + }, + { + "epoch": 0.2852121888681516, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 32703 + }, + { + "epoch": 0.2852209101533202, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 32704 + }, + { + "epoch": 0.2852296314384888, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 32705 + }, + { + "epoch": 0.2852383527236574, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 32706 + }, + { + "epoch": 0.28524707400882593, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 32707 + }, + { + "epoch": 0.28525579529399453, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 32708 + }, + { + "epoch": 0.28526451657916313, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 32709 + }, + { + "epoch": 0.2852732378643317, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 32710 + }, + { + "epoch": 0.2852819591495003, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 32711 + }, + { + "epoch": 0.2852906804346689, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 32712 + }, + { + "epoch": 0.2852994017198374, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 32713 + }, + { + "epoch": 0.285308123005006, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 32714 + }, + { + "epoch": 0.2853168442901746, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 32715 + }, + { + "epoch": 0.28532556557534317, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 32716 + }, + { + "epoch": 0.28533428686051177, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 32717 + }, + { + "epoch": 0.28534300814568037, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 32718 + }, + { + "epoch": 0.2853517294308489, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 32719 + }, + { + "epoch": 0.2853604507160175, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 32720 + }, + { + "epoch": 0.2853691720011861, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 32721 + }, + { + "epoch": 0.28537789328635466, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 32722 + }, + { + "epoch": 0.28538661457152326, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 32723 + }, + { + "epoch": 0.28539533585669186, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 32724 + }, + { + "epoch": 0.2854040571418604, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 32725 + }, + { + "epoch": 0.285412778427029, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 32726 + }, + { + "epoch": 0.2854214997121976, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 32727 + }, + { + "epoch": 0.28543022099736615, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 32728 + }, + { + "epoch": 0.28543894228253475, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 32729 + }, + { + "epoch": 0.28544766356770335, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 32730 + }, + { + "epoch": 0.2854563848528719, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 32731 + }, + { + "epoch": 0.2854651061380405, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 32732 + }, + { + "epoch": 0.2854738274232091, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 32733 + }, + { + "epoch": 0.2854825487083777, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 32734 + }, + { + "epoch": 0.28549126999354624, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 32735 + }, + { + "epoch": 0.28549999127871484, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 32736 + }, + { + "epoch": 0.28550871256388344, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 32737 + }, + { + "epoch": 0.285517433849052, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 32738 + }, + { + "epoch": 0.2855261551342206, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 32739 + }, + { + "epoch": 0.2855348764193892, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 32740 + }, + { + "epoch": 0.28554359770455773, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 32741 + }, + { + "epoch": 0.28555231898972633, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 32742 + }, + { + "epoch": 0.28556104027489493, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 32743 + }, + { + "epoch": 0.2855697615600635, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 32744 + }, + { + "epoch": 0.2855784828452321, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 32745 + }, + { + "epoch": 0.2855872041304007, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 32746 + }, + { + "epoch": 0.2855959254155692, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 32747 + }, + { + "epoch": 0.2856046467007378, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 32748 + }, + { + "epoch": 0.2856133679859064, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 32749 + }, + { + "epoch": 0.28562208927107496, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 32750 + }, + { + "epoch": 0.28563081055624356, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 32751 + }, + { + "epoch": 0.28563953184141216, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 32752 + }, + { + "epoch": 0.2856482531265807, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 32753 + }, + { + "epoch": 0.2856569744117493, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 32754 + }, + { + "epoch": 0.2856656956969179, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 32755 + }, + { + "epoch": 0.28567441698208645, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 32756 + }, + { + "epoch": 0.28568313826725505, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 32757 + }, + { + "epoch": 0.28569185955242365, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 32758 + }, + { + "epoch": 0.28570058083759226, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 32759 + }, + { + "epoch": 0.2857093021227608, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 32760 + }, + { + "epoch": 0.2857180234079294, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 32761 + }, + { + "epoch": 0.285726744693098, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 32762 + }, + { + "epoch": 0.28573546597826655, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 32763 + }, + { + "epoch": 0.28574418726343515, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 32764 + }, + { + "epoch": 0.28575290854860375, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 32765 + }, + { + "epoch": 0.2857616298337723, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 32766 + }, + { + "epoch": 0.2857703511189409, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 32767 + }, + { + "epoch": 0.2857790724041095, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 32768 + }, + { + "epoch": 0.28578779368927804, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 32769 + }, + { + "epoch": 0.28579651497444664, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 32770 + }, + { + "epoch": 0.28580523625961524, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 32771 + }, + { + "epoch": 0.2858139575447838, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 32772 + }, + { + "epoch": 0.2858226788299524, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 32773 + }, + { + "epoch": 0.285831400115121, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 32774 + }, + { + "epoch": 0.2858401214002895, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 32775 + }, + { + "epoch": 0.2858488426854581, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 32776 + }, + { + "epoch": 0.2858575639706267, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 32777 + }, + { + "epoch": 0.28586628525579527, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 32778 + }, + { + "epoch": 0.28587500654096387, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 32779 + }, + { + "epoch": 0.28588372782613247, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 32780 + }, + { + "epoch": 0.285892449111301, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 32781 + }, + { + "epoch": 0.2859011703964696, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 32782 + }, + { + "epoch": 0.2859098916816382, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 32783 + }, + { + "epoch": 0.28591861296680676, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 32784 + }, + { + "epoch": 0.28592733425197536, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 32785 + }, + { + "epoch": 0.28593605553714396, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 32786 + }, + { + "epoch": 0.28594477682231256, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 32787 + }, + { + "epoch": 0.2859534981074811, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 32788 + }, + { + "epoch": 0.2859622193926497, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 32789 + }, + { + "epoch": 0.2859709406778183, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 32790 + }, + { + "epoch": 0.28597966196298685, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 32791 + }, + { + "epoch": 0.28598838324815545, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 32792 + }, + { + "epoch": 0.28599710453332405, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 32793 + }, + { + "epoch": 0.2860058258184926, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 32794 + }, + { + "epoch": 0.2860145471036612, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 32795 + }, + { + "epoch": 0.2860232683888298, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 32796 + }, + { + "epoch": 0.28603198967399834, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 32797 + }, + { + "epoch": 0.28604071095916694, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 32798 + }, + { + "epoch": 0.28604943224433554, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 32799 + }, + { + "epoch": 0.2860581535295041, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 32800 + }, + { + "epoch": 0.2860668748146727, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 32801 + }, + { + "epoch": 0.2860755960998413, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 32802 + }, + { + "epoch": 0.28608431738500983, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 32803 + }, + { + "epoch": 0.28609303867017843, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 32804 + }, + { + "epoch": 0.28610175995534703, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 32805 + }, + { + "epoch": 0.2861104812405156, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 32806 + }, + { + "epoch": 0.2861192025256842, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 32807 + }, + { + "epoch": 0.2861279238108528, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 32808 + }, + { + "epoch": 0.2861366450960213, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 32809 + }, + { + "epoch": 0.2861453663811899, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 32810 + }, + { + "epoch": 0.2861540876663585, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 32811 + }, + { + "epoch": 0.28616280895152707, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 32812 + }, + { + "epoch": 0.28617153023669567, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 32813 + }, + { + "epoch": 0.28618025152186427, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 32814 + }, + { + "epoch": 0.28618897280703287, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 32815 + }, + { + "epoch": 0.2861976940922014, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 32816 + }, + { + "epoch": 0.28620641537737, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 32817 + }, + { + "epoch": 0.2862151366625386, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 32818 + }, + { + "epoch": 0.28622385794770716, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 32819 + }, + { + "epoch": 0.28623257923287576, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 32820 + }, + { + "epoch": 0.28624130051804436, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 32821 + }, + { + "epoch": 0.2862500218032129, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 32822 + }, + { + "epoch": 0.2862587430883815, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 32823 + }, + { + "epoch": 0.2862674643735501, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 32824 + }, + { + "epoch": 0.28627618565871865, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 32825 + }, + { + "epoch": 0.28628490694388725, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 32826 + }, + { + "epoch": 0.28629362822905585, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 32827 + }, + { + "epoch": 0.2863023495142244, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 32828 + }, + { + "epoch": 0.286311070799393, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 32829 + }, + { + "epoch": 0.2863197920845616, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 32830 + }, + { + "epoch": 0.28632851336973014, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 32831 + }, + { + "epoch": 0.28633723465489874, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 32832 + }, + { + "epoch": 0.28634595594006734, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 32833 + }, + { + "epoch": 0.2863546772252359, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 32834 + }, + { + "epoch": 0.2863633985104045, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 32835 + }, + { + "epoch": 0.2863721197955731, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 32836 + }, + { + "epoch": 0.28638084108074163, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 32837 + }, + { + "epoch": 0.28638956236591023, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 32838 + }, + { + "epoch": 0.28639828365107883, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 32839 + }, + { + "epoch": 0.2864070049362474, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 32840 + }, + { + "epoch": 0.286415726221416, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 32841 + }, + { + "epoch": 0.2864244475065846, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 32842 + }, + { + "epoch": 0.2864331687917532, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 32843 + }, + { + "epoch": 0.2864418900769217, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 32844 + }, + { + "epoch": 0.2864506113620903, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 32845 + }, + { + "epoch": 0.2864593326472589, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 32846 + }, + { + "epoch": 0.28646805393242747, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 32847 + }, + { + "epoch": 0.28647677521759607, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 32848 + }, + { + "epoch": 0.28648549650276467, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 32849 + }, + { + "epoch": 0.2864942177879332, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 32850 + }, + { + "epoch": 0.2865029390731018, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 32851 + }, + { + "epoch": 0.2865116603582704, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 32852 + }, + { + "epoch": 0.28652038164343896, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 32853 + }, + { + "epoch": 0.28652910292860756, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 32854 + }, + { + "epoch": 0.28653782421377616, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 32855 + }, + { + "epoch": 0.2865465454989447, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 32856 + }, + { + "epoch": 0.2865552667841133, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 32857 + }, + { + "epoch": 0.2865639880692819, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 32858 + }, + { + "epoch": 0.28657270935445045, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 32859 + }, + { + "epoch": 0.28658143063961905, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 32860 + }, + { + "epoch": 0.28659015192478765, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 32861 + }, + { + "epoch": 0.2865988732099562, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 32862 + }, + { + "epoch": 0.2866075944951248, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 32863 + }, + { + "epoch": 0.2866163157802934, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 32864 + }, + { + "epoch": 0.28662503706546194, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 32865 + }, + { + "epoch": 0.28663375835063054, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 32866 + }, + { + "epoch": 0.28664247963579914, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 32867 + }, + { + "epoch": 0.28665120092096774, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 32868 + }, + { + "epoch": 0.2866599222061363, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 32869 + }, + { + "epoch": 0.2866686434913049, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 32870 + }, + { + "epoch": 0.2866773647764735, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 32871 + }, + { + "epoch": 0.28668608606164203, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 32872 + }, + { + "epoch": 0.28669480734681063, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 32873 + }, + { + "epoch": 0.28670352863197923, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 32874 + }, + { + "epoch": 0.2867122499171478, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 32875 + }, + { + "epoch": 0.2867209712023164, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 32876 + }, + { + "epoch": 0.286729692487485, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 32877 + }, + { + "epoch": 0.2867384137726535, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 32878 + }, + { + "epoch": 0.2867471350578221, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 32879 + }, + { + "epoch": 0.2867558563429907, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 32880 + }, + { + "epoch": 0.28676457762815927, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 32881 + }, + { + "epoch": 0.28677329891332787, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 32882 + }, + { + "epoch": 0.28678202019849647, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 32883 + }, + { + "epoch": 0.286790741483665, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 32884 + }, + { + "epoch": 0.2867994627688336, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 32885 + }, + { + "epoch": 0.2868081840540022, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 32886 + }, + { + "epoch": 0.28681690533917076, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 32887 + }, + { + "epoch": 0.28682562662433936, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 32888 + }, + { + "epoch": 0.28683434790950796, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 32889 + }, + { + "epoch": 0.2868430691946765, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 32890 + }, + { + "epoch": 0.2868517904798451, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 32891 + }, + { + "epoch": 0.2868605117650137, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 32892 + }, + { + "epoch": 0.28686923305018225, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 32893 + }, + { + "epoch": 0.28687795433535085, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 32894 + }, + { + "epoch": 0.28688667562051945, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 32895 + }, + { + "epoch": 0.28689539690568805, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 32896 + }, + { + "epoch": 0.2869041181908566, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 32897 + }, + { + "epoch": 0.2869128394760252, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 32898 + }, + { + "epoch": 0.2869215607611938, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 32899 + }, + { + "epoch": 0.28693028204636234, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.986, + "step": 32900 + }, + { + "epoch": 0.28693900333153094, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 32901 + }, + { + "epoch": 0.28694772461669954, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 32902 + }, + { + "epoch": 0.2869564459018681, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 32903 + }, + { + "epoch": 0.2869651671870367, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 32904 + }, + { + "epoch": 0.2869738884722053, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 32905 + }, + { + "epoch": 0.28698260975737383, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 32906 + }, + { + "epoch": 0.28699133104254243, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 32907 + }, + { + "epoch": 0.28700005232771103, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 32908 + }, + { + "epoch": 0.2870087736128796, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 32909 + }, + { + "epoch": 0.2870174948980482, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 32910 + }, + { + "epoch": 0.2870262161832168, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 32911 + }, + { + "epoch": 0.2870349374683853, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 32912 + }, + { + "epoch": 0.2870436587535539, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 32913 + }, + { + "epoch": 0.2870523800387225, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 32914 + }, + { + "epoch": 0.28706110132389107, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 32915 + }, + { + "epoch": 0.28706982260905967, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 32916 + }, + { + "epoch": 0.28707854389422827, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 32917 + }, + { + "epoch": 0.2870872651793968, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 32918 + }, + { + "epoch": 0.2870959864645654, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 32919 + }, + { + "epoch": 0.287104707749734, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 32920 + }, + { + "epoch": 0.28711342903490256, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 32921 + }, + { + "epoch": 0.28712215032007116, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 32922 + }, + { + "epoch": 0.28713087160523976, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 32923 + }, + { + "epoch": 0.28713959289040836, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 32924 + }, + { + "epoch": 0.2871483141755769, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 32925 + }, + { + "epoch": 0.2871570354607455, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 32926 + }, + { + "epoch": 0.2871657567459141, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 32927 + }, + { + "epoch": 0.28717447803108265, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 32928 + }, + { + "epoch": 0.28718319931625125, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 32929 + }, + { + "epoch": 0.28719192060141985, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 32930 + }, + { + "epoch": 0.2872006418865884, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 32931 + }, + { + "epoch": 0.287209363171757, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 32932 + }, + { + "epoch": 0.2872180844569256, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 32933 + }, + { + "epoch": 0.28722680574209414, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 32934 + }, + { + "epoch": 0.28723552702726274, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 32935 + }, + { + "epoch": 0.28724424831243134, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 32936 + }, + { + "epoch": 0.2872529695975999, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 32937 + }, + { + "epoch": 0.2872616908827685, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 32938 + }, + { + "epoch": 0.2872704121679371, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 32939 + }, + { + "epoch": 0.28727913345310563, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 32940 + }, + { + "epoch": 0.28728785473827423, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 32941 + }, + { + "epoch": 0.28729657602344283, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 32942 + }, + { + "epoch": 0.2873052973086114, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9854, + "step": 32943 + }, + { + "epoch": 0.28731401859378, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 32944 + }, + { + "epoch": 0.2873227398789486, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 32945 + }, + { + "epoch": 0.2873314611641171, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 32946 + }, + { + "epoch": 0.2873401824492857, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 32947 + }, + { + "epoch": 0.2873489037344543, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 32948 + }, + { + "epoch": 0.28735762501962286, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 32949 + }, + { + "epoch": 0.28736634630479146, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 32950 + }, + { + "epoch": 0.28737506758996006, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 32951 + }, + { + "epoch": 0.28738378887512867, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 32952 + }, + { + "epoch": 0.2873925101602972, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 32953 + }, + { + "epoch": 0.2874012314454658, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 32954 + }, + { + "epoch": 0.2874099527306344, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 32955 + }, + { + "epoch": 0.28741867401580296, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 32956 + }, + { + "epoch": 0.28742739530097156, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 32957 + }, + { + "epoch": 0.28743611658614016, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 32958 + }, + { + "epoch": 0.2874448378713087, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 32959 + }, + { + "epoch": 0.2874535591564773, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 32960 + }, + { + "epoch": 0.2874622804416459, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 32961 + }, + { + "epoch": 0.28747100172681445, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 32962 + }, + { + "epoch": 0.28747972301198305, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 32963 + }, + { + "epoch": 0.28748844429715165, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 32964 + }, + { + "epoch": 0.2874971655823202, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 32965 + }, + { + "epoch": 0.2875058868674888, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 32966 + }, + { + "epoch": 0.2875146081526574, + "grad_norm": 0.06982421875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 32967 + }, + { + "epoch": 0.28752332943782594, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 32968 + }, + { + "epoch": 0.28753205072299454, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 32969 + }, + { + "epoch": 0.28754077200816314, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 32970 + }, + { + "epoch": 0.2875494932933317, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 32971 + }, + { + "epoch": 0.2875582145785003, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 32972 + }, + { + "epoch": 0.2875669358636689, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 32973 + }, + { + "epoch": 0.2875756571488374, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 32974 + }, + { + "epoch": 0.287584378434006, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 32975 + }, + { + "epoch": 0.2875930997191746, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 32976 + }, + { + "epoch": 0.2876018210043432, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 32977 + }, + { + "epoch": 0.2876105422895118, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 32978 + }, + { + "epoch": 0.2876192635746804, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 32979 + }, + { + "epoch": 0.287627984859849, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 32980 + }, + { + "epoch": 0.2876367061450175, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 32981 + }, + { + "epoch": 0.2876454274301861, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 32982 + }, + { + "epoch": 0.2876541487153547, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 32983 + }, + { + "epoch": 0.28766287000052326, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 32984 + }, + { + "epoch": 0.28767159128569186, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 32985 + }, + { + "epoch": 0.28768031257086046, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 32986 + }, + { + "epoch": 0.287689033856029, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 32987 + }, + { + "epoch": 0.2876977551411976, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 32988 + }, + { + "epoch": 0.2877064764263662, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 32989 + }, + { + "epoch": 0.28771519771153475, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 32990 + }, + { + "epoch": 0.28772391899670335, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 32991 + }, + { + "epoch": 0.28773264028187195, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 32992 + }, + { + "epoch": 0.2877413615670405, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 32993 + }, + { + "epoch": 0.2877500828522091, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 32994 + }, + { + "epoch": 0.2877588041373777, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 32995 + }, + { + "epoch": 0.28776752542254624, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 32996 + }, + { + "epoch": 0.28777624670771484, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 32997 + }, + { + "epoch": 0.28778496799288344, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 32998 + }, + { + "epoch": 0.287793689278052, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 32999 + }, + { + "epoch": 0.2878024105632206, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 33000 + }, + { + "epoch": 0.2878111318483892, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 33001 + }, + { + "epoch": 0.28781985313355773, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 33002 + }, + { + "epoch": 0.28782857441872634, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 33003 + }, + { + "epoch": 0.28783729570389494, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 33004 + }, + { + "epoch": 0.28784601698906354, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 33005 + }, + { + "epoch": 0.2878547382742321, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 33006 + }, + { + "epoch": 0.2878634595594007, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 33007 + }, + { + "epoch": 0.2878721808445693, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 33008 + }, + { + "epoch": 0.2878809021297378, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 33009 + }, + { + "epoch": 0.2878896234149064, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 33010 + }, + { + "epoch": 0.287898344700075, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 33011 + }, + { + "epoch": 0.28790706598524357, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 33012 + }, + { + "epoch": 0.28791578727041217, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 33013 + }, + { + "epoch": 0.28792450855558077, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 33014 + }, + { + "epoch": 0.2879332298407493, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 33015 + }, + { + "epoch": 0.2879419511259179, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 33016 + }, + { + "epoch": 0.2879506724110865, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 33017 + }, + { + "epoch": 0.28795939369625506, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 33018 + }, + { + "epoch": 0.28796811498142366, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 33019 + }, + { + "epoch": 0.28797683626659226, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 33020 + }, + { + "epoch": 0.2879855575517608, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 33021 + }, + { + "epoch": 0.2879942788369294, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 33022 + }, + { + "epoch": 0.288003000122098, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 33023 + }, + { + "epoch": 0.28801172140726655, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 33024 + }, + { + "epoch": 0.28802044269243515, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 33025 + }, + { + "epoch": 0.28802916397760375, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 33026 + }, + { + "epoch": 0.2880378852627723, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 33027 + }, + { + "epoch": 0.2880466065479409, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 33028 + }, + { + "epoch": 0.2880553278331095, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 33029 + }, + { + "epoch": 0.28806404911827804, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 33030 + }, + { + "epoch": 0.28807277040344664, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 33031 + }, + { + "epoch": 0.28808149168861524, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 33032 + }, + { + "epoch": 0.28809021297378384, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 33033 + }, + { + "epoch": 0.2880989342589524, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 33034 + }, + { + "epoch": 0.288107655544121, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 33035 + }, + { + "epoch": 0.2881163768292896, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 33036 + }, + { + "epoch": 0.28812509811445813, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 33037 + }, + { + "epoch": 0.28813381939962673, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 33038 + }, + { + "epoch": 0.28814254068479533, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 33039 + }, + { + "epoch": 0.2881512619699639, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 33040 + }, + { + "epoch": 0.2881599832551325, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 33041 + }, + { + "epoch": 0.2881687045403011, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 33042 + }, + { + "epoch": 0.2881774258254696, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 33043 + }, + { + "epoch": 0.2881861471106382, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 33044 + }, + { + "epoch": 0.2881948683958068, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 33045 + }, + { + "epoch": 0.28820358968097537, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 33046 + }, + { + "epoch": 0.28821231096614397, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 33047 + }, + { + "epoch": 0.28822103225131257, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 33048 + }, + { + "epoch": 0.2882297535364811, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 33049 + }, + { + "epoch": 0.2882384748216497, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 33050 + }, + { + "epoch": 0.2882471961068183, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 33051 + }, + { + "epoch": 0.28825591739198686, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 33052 + }, + { + "epoch": 0.28826463867715546, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 33053 + }, + { + "epoch": 0.28827335996232406, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 33054 + }, + { + "epoch": 0.2882820812474926, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9783, + "step": 33055 + }, + { + "epoch": 0.2882908025326612, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 33056 + }, + { + "epoch": 0.2882995238178298, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 33057 + }, + { + "epoch": 0.28830824510299835, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 33058 + }, + { + "epoch": 0.28831696638816695, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 33059 + }, + { + "epoch": 0.28832568767333555, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 33060 + }, + { + "epoch": 0.28833440895850415, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 33061 + }, + { + "epoch": 0.2883431302436727, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 33062 + }, + { + "epoch": 0.2883518515288413, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 33063 + }, + { + "epoch": 0.2883605728140099, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 33064 + }, + { + "epoch": 0.28836929409917844, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 33065 + }, + { + "epoch": 0.28837801538434704, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 33066 + }, + { + "epoch": 0.28838673666951564, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 33067 + }, + { + "epoch": 0.2883954579546842, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 33068 + }, + { + "epoch": 0.2884041792398528, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 33069 + }, + { + "epoch": 0.2884129005250214, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 33070 + }, + { + "epoch": 0.28842162181018993, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 33071 + }, + { + "epoch": 0.28843034309535853, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 33072 + }, + { + "epoch": 0.28843906438052713, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 33073 + }, + { + "epoch": 0.2884477856656957, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 33074 + }, + { + "epoch": 0.2884565069508643, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 33075 + }, + { + "epoch": 0.2884652282360329, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 33076 + }, + { + "epoch": 0.2884739495212014, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 33077 + }, + { + "epoch": 0.28848267080637, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 33078 + }, + { + "epoch": 0.2884913920915386, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 33079 + }, + { + "epoch": 0.28850011337670717, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 33080 + }, + { + "epoch": 0.28850883466187577, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 33081 + }, + { + "epoch": 0.28851755594704437, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 33082 + }, + { + "epoch": 0.2885262772322129, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 33083 + }, + { + "epoch": 0.2885349985173815, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 33084 + }, + { + "epoch": 0.2885437198025501, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 33085 + }, + { + "epoch": 0.28855244108771866, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 33086 + }, + { + "epoch": 0.28856116237288726, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 33087 + }, + { + "epoch": 0.28856988365805586, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 33088 + }, + { + "epoch": 0.28857860494322446, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 33089 + }, + { + "epoch": 0.288587326228393, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9854, + "step": 33090 + }, + { + "epoch": 0.2885960475135616, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 33091 + }, + { + "epoch": 0.2886047687987302, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 33092 + }, + { + "epoch": 0.28861349008389875, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 33093 + }, + { + "epoch": 0.28862221136906735, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 33094 + }, + { + "epoch": 0.28863093265423595, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 33095 + }, + { + "epoch": 0.2886396539394045, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 33096 + }, + { + "epoch": 0.2886483752245731, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 33097 + }, + { + "epoch": 0.2886570965097417, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 33098 + }, + { + "epoch": 0.28866581779491024, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 33099 + }, + { + "epoch": 0.28867453908007884, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 33100 + }, + { + "epoch": 0.28868326036524744, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 33101 + }, + { + "epoch": 0.288691981650416, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 33102 + }, + { + "epoch": 0.2887007029355846, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 33103 + }, + { + "epoch": 0.2887094242207532, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 33104 + }, + { + "epoch": 0.28871814550592173, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 33105 + }, + { + "epoch": 0.28872686679109033, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 33106 + }, + { + "epoch": 0.28873558807625893, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 33107 + }, + { + "epoch": 0.2887443093614275, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 33108 + }, + { + "epoch": 0.2887530306465961, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 33109 + }, + { + "epoch": 0.2887617519317647, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 33110 + }, + { + "epoch": 0.2887704732169332, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 33111 + }, + { + "epoch": 0.2887791945021018, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9816, + "step": 33112 + }, + { + "epoch": 0.2887879157872704, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 33113 + }, + { + "epoch": 0.288796637072439, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 33114 + }, + { + "epoch": 0.28880535835760757, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 33115 + }, + { + "epoch": 0.28881407964277617, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 33116 + }, + { + "epoch": 0.28882280092794477, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 33117 + }, + { + "epoch": 0.2888315222131133, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 33118 + }, + { + "epoch": 0.2888402434982819, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 33119 + }, + { + "epoch": 0.2888489647834505, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 33120 + }, + { + "epoch": 0.28885768606861906, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 33121 + }, + { + "epoch": 0.28886640735378766, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 33122 + }, + { + "epoch": 0.28887512863895626, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 33123 + }, + { + "epoch": 0.2888838499241248, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 33124 + }, + { + "epoch": 0.2888925712092934, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 33125 + }, + { + "epoch": 0.288901292494462, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 33126 + }, + { + "epoch": 0.28891001377963055, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 33127 + }, + { + "epoch": 0.28891873506479915, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 33128 + }, + { + "epoch": 0.28892745634996775, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 33129 + }, + { + "epoch": 0.2889361776351363, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 33130 + }, + { + "epoch": 0.2889448989203049, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 33131 + }, + { + "epoch": 0.2889536202054735, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 33132 + }, + { + "epoch": 0.28896234149064204, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 33133 + }, + { + "epoch": 0.28897106277581064, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 33134 + }, + { + "epoch": 0.28897978406097924, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 33135 + }, + { + "epoch": 0.2889885053461478, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 33136 + }, + { + "epoch": 0.2889972266313164, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 33137 + }, + { + "epoch": 0.289005947916485, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 33138 + }, + { + "epoch": 0.28901466920165353, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 33139 + }, + { + "epoch": 0.28902339048682213, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 33140 + }, + { + "epoch": 0.28903211177199073, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 33141 + }, + { + "epoch": 0.28904083305715933, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 33142 + }, + { + "epoch": 0.2890495543423279, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 33143 + }, + { + "epoch": 0.2890582756274965, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 33144 + }, + { + "epoch": 0.2890669969126651, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 33145 + }, + { + "epoch": 0.2890757181978336, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 33146 + }, + { + "epoch": 0.2890844394830022, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 33147 + }, + { + "epoch": 0.2890931607681708, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 33148 + }, + { + "epoch": 0.28910188205333937, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9863, + "step": 33149 + }, + { + "epoch": 0.28911060333850797, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 33150 + }, + { + "epoch": 0.28911932462367657, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 33151 + }, + { + "epoch": 0.2891280459088451, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 33152 + }, + { + "epoch": 0.2891367671940137, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 33153 + }, + { + "epoch": 0.2891454884791823, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 33154 + }, + { + "epoch": 0.28915420976435086, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 33155 + }, + { + "epoch": 0.28916293104951946, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 33156 + }, + { + "epoch": 0.28917165233468806, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 33157 + }, + { + "epoch": 0.2891803736198566, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 33158 + }, + { + "epoch": 0.2891890949050252, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 33159 + }, + { + "epoch": 0.2891978161901938, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 33160 + }, + { + "epoch": 0.28920653747536235, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 33161 + }, + { + "epoch": 0.28921525876053095, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 33162 + }, + { + "epoch": 0.28922398004569955, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 33163 + }, + { + "epoch": 0.2892327013308681, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 33164 + }, + { + "epoch": 0.2892414226160367, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 33165 + }, + { + "epoch": 0.2892501439012053, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 33166 + }, + { + "epoch": 0.28925886518637384, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 33167 + }, + { + "epoch": 0.28926758647154244, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 33168 + }, + { + "epoch": 0.28927630775671104, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 33169 + }, + { + "epoch": 0.28928502904187964, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 33170 + }, + { + "epoch": 0.2892937503270482, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 33171 + }, + { + "epoch": 0.2893024716122168, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 33172 + }, + { + "epoch": 0.2893111928973854, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 33173 + }, + { + "epoch": 0.28931991418255393, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 33174 + }, + { + "epoch": 0.28932863546772253, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 33175 + }, + { + "epoch": 0.28933735675289113, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 33176 + }, + { + "epoch": 0.2893460780380597, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 33177 + }, + { + "epoch": 0.2893547993232283, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 33178 + }, + { + "epoch": 0.2893635206083969, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 33179 + }, + { + "epoch": 0.2893722418935654, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 33180 + }, + { + "epoch": 0.289380963178734, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 33181 + }, + { + "epoch": 0.2893896844639026, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 33182 + }, + { + "epoch": 0.28939840574907116, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 33183 + }, + { + "epoch": 0.28940712703423976, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 33184 + }, + { + "epoch": 0.28941584831940836, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 33185 + }, + { + "epoch": 0.2894245696045769, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 33186 + }, + { + "epoch": 0.2894332908897455, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 33187 + }, + { + "epoch": 0.2894420121749141, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 33188 + }, + { + "epoch": 0.28945073346008265, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 33189 + }, + { + "epoch": 0.28945945474525125, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 33190 + }, + { + "epoch": 0.28946817603041985, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 33191 + }, + { + "epoch": 0.2894768973155884, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 33192 + }, + { + "epoch": 0.289485618600757, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 33193 + }, + { + "epoch": 0.2894943398859256, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 33194 + }, + { + "epoch": 0.28950306117109414, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 33195 + }, + { + "epoch": 0.28951178245626275, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 33196 + }, + { + "epoch": 0.28952050374143135, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 33197 + }, + { + "epoch": 0.28952922502659995, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 33198 + }, + { + "epoch": 0.2895379463117685, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 33199 + }, + { + "epoch": 0.2895466675969371, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 33200 + }, + { + "epoch": 0.2895553888821057, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 33201 + }, + { + "epoch": 0.28956411016727424, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 33202 + }, + { + "epoch": 0.28957283145244284, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 33203 + }, + { + "epoch": 0.28958155273761144, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 33204 + }, + { + "epoch": 0.28959027402278, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 33205 + }, + { + "epoch": 0.2895989953079486, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 33206 + }, + { + "epoch": 0.2896077165931172, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 33207 + }, + { + "epoch": 0.2896164378782857, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 33208 + }, + { + "epoch": 0.2896251591634543, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 33209 + }, + { + "epoch": 0.2896338804486229, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 33210 + }, + { + "epoch": 0.28964260173379147, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 33211 + }, + { + "epoch": 0.28965132301896007, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 33212 + }, + { + "epoch": 0.28966004430412867, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 33213 + }, + { + "epoch": 0.2896687655892972, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 33214 + }, + { + "epoch": 0.2896774868744658, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 33215 + }, + { + "epoch": 0.2896862081596344, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 33216 + }, + { + "epoch": 0.28969492944480296, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 33217 + }, + { + "epoch": 0.28970365072997156, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 33218 + }, + { + "epoch": 0.28971237201514016, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 33219 + }, + { + "epoch": 0.2897210933003087, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 33220 + }, + { + "epoch": 0.2897298145854773, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 33221 + }, + { + "epoch": 0.2897385358706459, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 33222 + }, + { + "epoch": 0.2897472571558145, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 33223 + }, + { + "epoch": 0.28975597844098305, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 33224 + }, + { + "epoch": 0.28976469972615165, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 33225 + }, + { + "epoch": 0.28977342101132025, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 33226 + }, + { + "epoch": 0.2897821422964888, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 33227 + }, + { + "epoch": 0.2897908635816574, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 33228 + }, + { + "epoch": 0.289799584866826, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 33229 + }, + { + "epoch": 0.28980830615199454, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 33230 + }, + { + "epoch": 0.28981702743716314, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 33231 + }, + { + "epoch": 0.28982574872233174, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 33232 + }, + { + "epoch": 0.2898344700075003, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 33233 + }, + { + "epoch": 0.2898431912926689, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 33234 + }, + { + "epoch": 0.2898519125778375, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 33235 + }, + { + "epoch": 0.28986063386300603, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 33236 + }, + { + "epoch": 0.28986935514817463, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 33237 + }, + { + "epoch": 0.28987807643334323, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 33238 + }, + { + "epoch": 0.2898867977185118, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 33239 + }, + { + "epoch": 0.2898955190036804, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 33240 + }, + { + "epoch": 0.289904240288849, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 33241 + }, + { + "epoch": 0.2899129615740175, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 33242 + }, + { + "epoch": 0.2899216828591861, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 33243 + }, + { + "epoch": 0.2899304041443547, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 33244 + }, + { + "epoch": 0.28993912542952327, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9832, + "step": 33245 + }, + { + "epoch": 0.28994784671469187, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 33246 + }, + { + "epoch": 0.28995656799986047, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 33247 + }, + { + "epoch": 0.289965289285029, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 33248 + }, + { + "epoch": 0.2899740105701976, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 33249 + }, + { + "epoch": 0.2899827318553662, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 33250 + }, + { + "epoch": 0.2899914531405348, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 33251 + }, + { + "epoch": 0.29000017442570336, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 33252 + }, + { + "epoch": 0.29000889571087196, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 33253 + }, + { + "epoch": 0.29001761699604056, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 33254 + }, + { + "epoch": 0.2900263382812091, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 33255 + }, + { + "epoch": 0.2900350595663777, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 33256 + }, + { + "epoch": 0.2900437808515463, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 33257 + }, + { + "epoch": 0.29005250213671485, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 33258 + }, + { + "epoch": 0.29006122342188345, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 33259 + }, + { + "epoch": 0.29006994470705205, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 33260 + }, + { + "epoch": 0.2900786659922206, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 33261 + }, + { + "epoch": 0.2900873872773892, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 33262 + }, + { + "epoch": 0.2900961085625578, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 33263 + }, + { + "epoch": 0.29010482984772634, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 33264 + }, + { + "epoch": 0.29011355113289494, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 33265 + }, + { + "epoch": 0.29012227241806354, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 33266 + }, + { + "epoch": 0.2901309937032321, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 33267 + }, + { + "epoch": 0.2901397149884007, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 33268 + }, + { + "epoch": 0.2901484362735693, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 33269 + }, + { + "epoch": 0.29015715755873783, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 33270 + }, + { + "epoch": 0.29016587884390643, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 33271 + }, + { + "epoch": 0.29017460012907503, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 33272 + }, + { + "epoch": 0.2901833214142436, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 33273 + }, + { + "epoch": 0.2901920426994122, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 33274 + }, + { + "epoch": 0.2902007639845808, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 33275 + }, + { + "epoch": 0.2902094852697493, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 33276 + }, + { + "epoch": 0.2902182065549179, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 33277 + }, + { + "epoch": 0.2902269278400865, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 33278 + }, + { + "epoch": 0.2902356491252551, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 33279 + }, + { + "epoch": 0.29024437041042367, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 33280 + }, + { + "epoch": 0.29025309169559227, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 33281 + }, + { + "epoch": 0.29026181298076087, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 33282 + }, + { + "epoch": 0.2902705342659294, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 33283 + }, + { + "epoch": 0.290279255551098, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 33284 + }, + { + "epoch": 0.2902879768362666, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 33285 + }, + { + "epoch": 0.29029669812143516, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 33286 + }, + { + "epoch": 0.29030541940660376, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 33287 + }, + { + "epoch": 0.29031414069177236, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 33288 + }, + { + "epoch": 0.2903228619769409, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 33289 + }, + { + "epoch": 0.2903315832621095, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 33290 + }, + { + "epoch": 0.2903403045472781, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 33291 + }, + { + "epoch": 0.29034902583244665, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 33292 + }, + { + "epoch": 0.29035774711761525, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 33293 + }, + { + "epoch": 0.29036646840278385, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 33294 + }, + { + "epoch": 0.2903751896879524, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 33295 + }, + { + "epoch": 0.290383910973121, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 33296 + }, + { + "epoch": 0.2903926322582896, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 33297 + }, + { + "epoch": 0.29040135354345814, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 33298 + }, + { + "epoch": 0.29041007482862674, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 33299 + }, + { + "epoch": 0.29041879611379534, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 33300 + }, + { + "epoch": 0.2904275173989639, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 33301 + }, + { + "epoch": 0.2904362386841325, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 33302 + }, + { + "epoch": 0.2904449599693011, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 33303 + }, + { + "epoch": 0.29045368125446963, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 33304 + }, + { + "epoch": 0.29046240253963823, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 33305 + }, + { + "epoch": 0.29047112382480683, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 33306 + }, + { + "epoch": 0.29047984510997543, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 33307 + }, + { + "epoch": 0.290488566395144, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 33308 + }, + { + "epoch": 0.2904972876803126, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 33309 + }, + { + "epoch": 0.2905060089654812, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 33310 + }, + { + "epoch": 0.2905147302506497, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 33311 + }, + { + "epoch": 0.2905234515358183, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 33312 + }, + { + "epoch": 0.2905321728209869, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 33313 + }, + { + "epoch": 0.29054089410615547, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 33314 + }, + { + "epoch": 0.29054961539132407, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 33315 + }, + { + "epoch": 0.29055833667649267, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 33316 + }, + { + "epoch": 0.2905670579616612, + "grad_norm": 0.314453125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 33317 + }, + { + "epoch": 0.2905757792468298, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 33318 + }, + { + "epoch": 0.2905845005319984, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 33319 + }, + { + "epoch": 0.29059322181716696, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 33320 + }, + { + "epoch": 0.29060194310233556, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 33321 + }, + { + "epoch": 0.29061066438750416, + "grad_norm": 0.326171875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 33322 + }, + { + "epoch": 0.2906193856726727, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 33323 + }, + { + "epoch": 0.2906281069578413, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 33324 + }, + { + "epoch": 0.2906368282430099, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 33325 + }, + { + "epoch": 0.29064554952817845, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 33326 + }, + { + "epoch": 0.29065427081334705, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 33327 + }, + { + "epoch": 0.29066299209851565, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 33328 + }, + { + "epoch": 0.2906717133836842, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 33329 + }, + { + "epoch": 0.2906804346688528, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 33330 + }, + { + "epoch": 0.2906891559540214, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 33331 + }, + { + "epoch": 0.29069787723919, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 33332 + }, + { + "epoch": 0.29070659852435854, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 33333 + }, + { + "epoch": 0.29071531980952714, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 33334 + }, + { + "epoch": 0.29072404109469574, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 33335 + }, + { + "epoch": 0.2907327623798643, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 33336 + }, + { + "epoch": 0.2907414836650329, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 33337 + }, + { + "epoch": 0.2907502049502015, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 33338 + }, + { + "epoch": 0.29075892623537003, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 33339 + }, + { + "epoch": 0.29076764752053863, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 33340 + }, + { + "epoch": 0.29077636880570723, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 33341 + }, + { + "epoch": 0.2907850900908758, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 33342 + }, + { + "epoch": 0.2907938113760444, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 33343 + }, + { + "epoch": 0.290802532661213, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 33344 + }, + { + "epoch": 0.2908112539463815, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 33345 + }, + { + "epoch": 0.2908199752315501, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 33346 + }, + { + "epoch": 0.2908286965167187, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 33347 + }, + { + "epoch": 0.29083741780188727, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 33348 + }, + { + "epoch": 0.29084613908705587, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 33349 + }, + { + "epoch": 0.29085486037222447, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 33350 + }, + { + "epoch": 0.290863581657393, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 33351 + }, + { + "epoch": 0.2908723029425616, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 33352 + }, + { + "epoch": 0.2908810242277302, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9827, + "step": 33353 + }, + { + "epoch": 0.29088974551289876, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 33354 + }, + { + "epoch": 0.29089846679806736, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 33355 + }, + { + "epoch": 0.29090718808323596, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 33356 + }, + { + "epoch": 0.2909159093684045, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 33357 + }, + { + "epoch": 0.2909246306535731, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 33358 + }, + { + "epoch": 0.2909333519387417, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 33359 + }, + { + "epoch": 0.2909420732239103, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 33360 + }, + { + "epoch": 0.29095079450907885, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 33361 + }, + { + "epoch": 0.29095951579424745, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 33362 + }, + { + "epoch": 0.29096823707941605, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 33363 + }, + { + "epoch": 0.2909769583645846, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 33364 + }, + { + "epoch": 0.2909856796497532, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 33365 + }, + { + "epoch": 0.2909944009349218, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 33366 + }, + { + "epoch": 0.29100312222009034, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 33367 + }, + { + "epoch": 0.29101184350525894, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 33368 + }, + { + "epoch": 0.29102056479042754, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 33369 + }, + { + "epoch": 0.2910292860755961, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 33370 + }, + { + "epoch": 0.2910380073607647, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 33371 + }, + { + "epoch": 0.2910467286459333, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 33372 + }, + { + "epoch": 0.29105544993110183, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 33373 + }, + { + "epoch": 0.29106417121627043, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 33374 + }, + { + "epoch": 0.29107289250143903, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 33375 + }, + { + "epoch": 0.2910816137866076, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 33376 + }, + { + "epoch": 0.2910903350717762, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 33377 + }, + { + "epoch": 0.2910990563569448, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 33378 + }, + { + "epoch": 0.2911077776421133, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 33379 + }, + { + "epoch": 0.2911164989272819, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 33380 + }, + { + "epoch": 0.2911252202124505, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 33381 + }, + { + "epoch": 0.29113394149761906, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 33382 + }, + { + "epoch": 0.29114266278278766, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 33383 + }, + { + "epoch": 0.29115138406795626, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 33384 + }, + { + "epoch": 0.2911601053531248, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 33385 + }, + { + "epoch": 0.2911688266382934, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 33386 + }, + { + "epoch": 0.291177547923462, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 33387 + }, + { + "epoch": 0.2911862692086306, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 33388 + }, + { + "epoch": 0.29119499049379916, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 33389 + }, + { + "epoch": 0.29120371177896776, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 33390 + }, + { + "epoch": 0.29121243306413636, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 33391 + }, + { + "epoch": 0.2912211543493049, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 33392 + }, + { + "epoch": 0.2912298756344735, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 33393 + }, + { + "epoch": 0.2912385969196421, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 33394 + }, + { + "epoch": 0.29124731820481065, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 33395 + }, + { + "epoch": 0.29125603948997925, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 33396 + }, + { + "epoch": 0.29126476077514785, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 33397 + }, + { + "epoch": 0.2912734820603164, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 33398 + }, + { + "epoch": 0.291282203345485, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 33399 + }, + { + "epoch": 0.2912909246306536, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 33400 + }, + { + "epoch": 0.29129964591582214, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 33401 + }, + { + "epoch": 0.29130836720099074, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 33402 + }, + { + "epoch": 0.29131708848615934, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 33403 + }, + { + "epoch": 0.2913258097713279, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 33404 + }, + { + "epoch": 0.2913345310564965, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 33405 + }, + { + "epoch": 0.2913432523416651, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 33406 + }, + { + "epoch": 0.2913519736268336, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 33407 + }, + { + "epoch": 0.2913606949120022, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 33408 + }, + { + "epoch": 0.2913694161971708, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 33409 + }, + { + "epoch": 0.29137813748233937, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 33410 + }, + { + "epoch": 0.291386858767508, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 33411 + }, + { + "epoch": 0.2913955800526766, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 33412 + }, + { + "epoch": 0.2914043013378451, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 33413 + }, + { + "epoch": 0.2914130226230137, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 33414 + }, + { + "epoch": 0.2914217439081823, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9783, + "step": 33415 + }, + { + "epoch": 0.2914304651933509, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 33416 + }, + { + "epoch": 0.29143918647851946, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 33417 + }, + { + "epoch": 0.29144790776368806, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 33418 + }, + { + "epoch": 0.29145662904885666, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 33419 + }, + { + "epoch": 0.2914653503340252, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 33420 + }, + { + "epoch": 0.2914740716191938, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 33421 + }, + { + "epoch": 0.2914827929043624, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 33422 + }, + { + "epoch": 0.29149151418953095, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 33423 + }, + { + "epoch": 0.29150023547469955, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 33424 + }, + { + "epoch": 0.29150895675986815, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 33425 + }, + { + "epoch": 0.2915176780450367, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 33426 + }, + { + "epoch": 0.2915263993302053, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 33427 + }, + { + "epoch": 0.2915351206153739, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 33428 + }, + { + "epoch": 0.29154384190054244, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 33429 + }, + { + "epoch": 0.29155256318571104, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 33430 + }, + { + "epoch": 0.29156128447087964, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 33431 + }, + { + "epoch": 0.2915700057560482, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 33432 + }, + { + "epoch": 0.2915787270412168, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 33433 + }, + { + "epoch": 0.2915874483263854, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 33434 + }, + { + "epoch": 0.29159616961155393, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 33435 + }, + { + "epoch": 0.29160489089672254, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 33436 + }, + { + "epoch": 0.29161361218189114, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 33437 + }, + { + "epoch": 0.2916223334670597, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 33438 + }, + { + "epoch": 0.2916310547522283, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 33439 + }, + { + "epoch": 0.2916397760373969, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 33440 + }, + { + "epoch": 0.2916484973225655, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 33441 + }, + { + "epoch": 0.291657218607734, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 33442 + }, + { + "epoch": 0.2916659398929026, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 33443 + }, + { + "epoch": 0.2916746611780712, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 33444 + }, + { + "epoch": 0.29168338246323977, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 33445 + }, + { + "epoch": 0.29169210374840837, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 33446 + }, + { + "epoch": 0.29170082503357697, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 33447 + }, + { + "epoch": 0.2917095463187455, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 33448 + }, + { + "epoch": 0.2917182676039141, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 33449 + }, + { + "epoch": 0.2917269888890827, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 33450 + }, + { + "epoch": 0.29173571017425126, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 33451 + }, + { + "epoch": 0.29174443145941986, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 33452 + }, + { + "epoch": 0.29175315274458846, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 33453 + }, + { + "epoch": 0.291761874029757, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 33454 + }, + { + "epoch": 0.2917705953149256, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 33455 + }, + { + "epoch": 0.2917793166000942, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 33456 + }, + { + "epoch": 0.29178803788526275, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 33457 + }, + { + "epoch": 0.29179675917043135, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 33458 + }, + { + "epoch": 0.29180548045559995, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 33459 + }, + { + "epoch": 0.2918142017407685, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 33460 + }, + { + "epoch": 0.2918229230259371, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 33461 + }, + { + "epoch": 0.2918316443111057, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 33462 + }, + { + "epoch": 0.29184036559627424, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 33463 + }, + { + "epoch": 0.29184908688144284, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 33464 + }, + { + "epoch": 0.29185780816661144, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 33465 + }, + { + "epoch": 0.29186652945178, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 33466 + }, + { + "epoch": 0.2918752507369486, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 33467 + }, + { + "epoch": 0.2918839720221172, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 33468 + }, + { + "epoch": 0.2918926933072858, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 33469 + }, + { + "epoch": 0.29190141459245433, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 33470 + }, + { + "epoch": 0.29191013587762293, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 33471 + }, + { + "epoch": 0.29191885716279153, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 33472 + }, + { + "epoch": 0.2919275784479601, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 33473 + }, + { + "epoch": 0.2919362997331287, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 33474 + }, + { + "epoch": 0.2919450210182973, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 33475 + }, + { + "epoch": 0.2919537423034658, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 33476 + }, + { + "epoch": 0.2919624635886344, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 33477 + }, + { + "epoch": 0.291971184873803, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 33478 + }, + { + "epoch": 0.29197990615897157, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 33479 + }, + { + "epoch": 0.29198862744414017, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 33480 + }, + { + "epoch": 0.29199734872930877, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 33481 + }, + { + "epoch": 0.2920060700144773, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 33482 + }, + { + "epoch": 0.2920147912996459, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 33483 + }, + { + "epoch": 0.2920235125848145, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 33484 + }, + { + "epoch": 0.29203223386998306, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 33485 + }, + { + "epoch": 0.29204095515515166, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 33486 + }, + { + "epoch": 0.29204967644032026, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 33487 + }, + { + "epoch": 0.2920583977254888, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 33488 + }, + { + "epoch": 0.2920671190106574, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 33489 + }, + { + "epoch": 0.292075840295826, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 33490 + }, + { + "epoch": 0.29208456158099455, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 33491 + }, + { + "epoch": 0.29209328286616315, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 33492 + }, + { + "epoch": 0.29210200415133175, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 33493 + }, + { + "epoch": 0.2921107254365003, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 33494 + }, + { + "epoch": 0.2921194467216689, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 33495 + }, + { + "epoch": 0.2921281680068375, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 33496 + }, + { + "epoch": 0.2921368892920061, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 33497 + }, + { + "epoch": 0.29214561057717464, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 33498 + }, + { + "epoch": 0.29215433186234324, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 33499 + }, + { + "epoch": 0.29216305314751184, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 33500 + }, + { + "epoch": 0.2921717744326804, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 33501 + }, + { + "epoch": 0.292180495717849, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 33502 + }, + { + "epoch": 0.2921892170030176, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 33503 + }, + { + "epoch": 0.29219793828818613, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 33504 + }, + { + "epoch": 0.29220665957335473, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 33505 + }, + { + "epoch": 0.29221538085852333, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 33506 + }, + { + "epoch": 0.2922241021436919, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 33507 + }, + { + "epoch": 0.2922328234288605, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 33508 + }, + { + "epoch": 0.2922415447140291, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 33509 + }, + { + "epoch": 0.2922502659991976, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 33510 + }, + { + "epoch": 0.2922589872843662, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 33511 + }, + { + "epoch": 0.2922677085695348, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 33512 + }, + { + "epoch": 0.29227642985470337, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 33513 + }, + { + "epoch": 0.29228515113987197, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 33514 + }, + { + "epoch": 0.29229387242504057, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 33515 + }, + { + "epoch": 0.2923025937102091, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 33516 + }, + { + "epoch": 0.2923113149953777, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 33517 + }, + { + "epoch": 0.2923200362805463, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 33518 + }, + { + "epoch": 0.29232875756571486, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 33519 + }, + { + "epoch": 0.29233747885088346, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9819, + "step": 33520 + }, + { + "epoch": 0.29234620013605206, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 33521 + }, + { + "epoch": 0.2923549214212206, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 33522 + }, + { + "epoch": 0.2923636427063892, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9792, + "step": 33523 + }, + { + "epoch": 0.2923723639915578, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 33524 + }, + { + "epoch": 0.2923810852767264, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 33525 + }, + { + "epoch": 0.29238980656189495, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 33526 + }, + { + "epoch": 0.29239852784706355, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 33527 + }, + { + "epoch": 0.29240724913223215, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 33528 + }, + { + "epoch": 0.2924159704174007, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 33529 + }, + { + "epoch": 0.2924246917025693, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 33530 + }, + { + "epoch": 0.2924334129877379, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 33531 + }, + { + "epoch": 0.29244213427290644, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 33532 + }, + { + "epoch": 0.29245085555807504, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 33533 + }, + { + "epoch": 0.29245957684324364, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 33534 + }, + { + "epoch": 0.2924682981284122, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 33535 + }, + { + "epoch": 0.2924770194135808, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 33536 + }, + { + "epoch": 0.2924857406987494, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 33537 + }, + { + "epoch": 0.29249446198391793, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 33538 + }, + { + "epoch": 0.29250318326908653, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 33539 + }, + { + "epoch": 0.29251190455425513, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9849, + "step": 33540 + }, + { + "epoch": 0.2925206258394237, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 33541 + }, + { + "epoch": 0.2925293471245923, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 33542 + }, + { + "epoch": 0.2925380684097609, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 33543 + }, + { + "epoch": 0.2925467896949294, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 33544 + }, + { + "epoch": 0.292555510980098, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 33545 + }, + { + "epoch": 0.2925642322652666, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 33546 + }, + { + "epoch": 0.29257295355043517, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 33547 + }, + { + "epoch": 0.29258167483560377, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 33548 + }, + { + "epoch": 0.29259039612077237, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 33549 + }, + { + "epoch": 0.29259911740594097, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 33550 + }, + { + "epoch": 0.2926078386911095, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 33551 + }, + { + "epoch": 0.2926165599762781, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 33552 + }, + { + "epoch": 0.2926252812614467, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 33553 + }, + { + "epoch": 0.29263400254661526, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 33554 + }, + { + "epoch": 0.29264272383178386, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 33555 + }, + { + "epoch": 0.29265144511695246, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 33556 + }, + { + "epoch": 0.292660166402121, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 33557 + }, + { + "epoch": 0.2926688876872896, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 33558 + }, + { + "epoch": 0.2926776089724582, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 33559 + }, + { + "epoch": 0.29268633025762675, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 33560 + }, + { + "epoch": 0.29269505154279535, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 33561 + }, + { + "epoch": 0.29270377282796395, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 33562 + }, + { + "epoch": 0.2927124941131325, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 33563 + }, + { + "epoch": 0.2927212153983011, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 33564 + }, + { + "epoch": 0.2927299366834697, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 33565 + }, + { + "epoch": 0.29273865796863824, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 33566 + }, + { + "epoch": 0.29274737925380684, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 33567 + }, + { + "epoch": 0.29275610053897544, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 33568 + }, + { + "epoch": 0.292764821824144, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 33569 + }, + { + "epoch": 0.2927735431093126, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 33570 + }, + { + "epoch": 0.2927822643944812, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 33571 + }, + { + "epoch": 0.29279098567964973, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 33572 + }, + { + "epoch": 0.29279970696481833, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 33573 + }, + { + "epoch": 0.29280842824998693, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 33574 + }, + { + "epoch": 0.2928171495351555, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 33575 + }, + { + "epoch": 0.2928258708203241, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 33576 + }, + { + "epoch": 0.2928345921054927, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 33577 + }, + { + "epoch": 0.2928433133906613, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 33578 + }, + { + "epoch": 0.2928520346758298, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 33579 + }, + { + "epoch": 0.2928607559609984, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 33580 + }, + { + "epoch": 0.292869477246167, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 33581 + }, + { + "epoch": 0.29287819853133557, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 33582 + }, + { + "epoch": 0.29288691981650417, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 33583 + }, + { + "epoch": 0.29289564110167277, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 33584 + }, + { + "epoch": 0.2929043623868413, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 33585 + }, + { + "epoch": 0.2929130836720099, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 33586 + }, + { + "epoch": 0.2929218049571785, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 33587 + }, + { + "epoch": 0.29293052624234706, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 33588 + }, + { + "epoch": 0.29293924752751566, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 33589 + }, + { + "epoch": 0.29294796881268426, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 33590 + }, + { + "epoch": 0.2929566900978528, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 33591 + }, + { + "epoch": 0.2929654113830214, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 33592 + }, + { + "epoch": 0.29297413266819, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 33593 + }, + { + "epoch": 0.29298285395335855, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 33594 + }, + { + "epoch": 0.29299157523852715, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 33595 + }, + { + "epoch": 0.29300029652369575, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 33596 + }, + { + "epoch": 0.2930090178088643, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 33597 + }, + { + "epoch": 0.2930177390940329, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 33598 + }, + { + "epoch": 0.2930264603792015, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 33599 + }, + { + "epoch": 0.29303518166437004, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 33600 + }, + { + "epoch": 0.29304390294953864, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 33601 + }, + { + "epoch": 0.29305262423470724, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 33602 + }, + { + "epoch": 0.2930613455198758, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 33603 + }, + { + "epoch": 0.2930700668050444, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 33604 + }, + { + "epoch": 0.293078788090213, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 33605 + }, + { + "epoch": 0.2930875093753816, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 33606 + }, + { + "epoch": 0.2930962306605501, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 33607 + }, + { + "epoch": 0.29310495194571873, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 33608 + }, + { + "epoch": 0.29311367323088733, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 33609 + }, + { + "epoch": 0.2931223945160559, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 33610 + }, + { + "epoch": 0.2931311158012245, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 33611 + }, + { + "epoch": 0.2931398370863931, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 33612 + }, + { + "epoch": 0.2931485583715616, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 33613 + }, + { + "epoch": 0.2931572796567302, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 33614 + }, + { + "epoch": 0.2931660009418988, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 33615 + }, + { + "epoch": 0.29317472222706736, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 33616 + }, + { + "epoch": 0.29318344351223596, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 33617 + }, + { + "epoch": 0.29319216479740456, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 33618 + }, + { + "epoch": 0.2932008860825731, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 33619 + }, + { + "epoch": 0.2932096073677417, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 33620 + }, + { + "epoch": 0.2932183286529103, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 33621 + }, + { + "epoch": 0.29322704993807885, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 33622 + }, + { + "epoch": 0.29323577122324745, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 33623 + }, + { + "epoch": 0.29324449250841605, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 33624 + }, + { + "epoch": 0.2932532137935846, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 33625 + }, + { + "epoch": 0.2932619350787532, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 33626 + }, + { + "epoch": 0.2932706563639218, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 33627 + }, + { + "epoch": 0.29327937764909034, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 33628 + }, + { + "epoch": 0.29328809893425895, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 33629 + }, + { + "epoch": 0.29329682021942755, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 33630 + }, + { + "epoch": 0.2933055415045961, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 33631 + }, + { + "epoch": 0.2933142627897647, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 33632 + }, + { + "epoch": 0.2933229840749333, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 33633 + }, + { + "epoch": 0.2933317053601019, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 33634 + }, + { + "epoch": 0.29334042664527044, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 33635 + }, + { + "epoch": 0.29334914793043904, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 33636 + }, + { + "epoch": 0.29335786921560764, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 33637 + }, + { + "epoch": 0.2933665905007762, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 33638 + }, + { + "epoch": 0.2933753117859448, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 33639 + }, + { + "epoch": 0.2933840330711134, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 33640 + }, + { + "epoch": 0.2933927543562819, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 33641 + }, + { + "epoch": 0.2934014756414505, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 33642 + }, + { + "epoch": 0.2934101969266191, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 33643 + }, + { + "epoch": 0.29341891821178767, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 33644 + }, + { + "epoch": 0.29342763949695627, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 33645 + }, + { + "epoch": 0.29343636078212487, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 33646 + }, + { + "epoch": 0.2934450820672934, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 33647 + }, + { + "epoch": 0.293453803352462, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 33648 + }, + { + "epoch": 0.2934625246376306, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 33649 + }, + { + "epoch": 0.29347124592279916, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 33650 + }, + { + "epoch": 0.29347996720796776, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 33651 + }, + { + "epoch": 0.29348868849313636, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 33652 + }, + { + "epoch": 0.2934974097783049, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 33653 + }, + { + "epoch": 0.2935061310634735, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 33654 + }, + { + "epoch": 0.2935148523486421, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 33655 + }, + { + "epoch": 0.29352357363381065, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 33656 + }, + { + "epoch": 0.29353229491897925, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 33657 + }, + { + "epoch": 0.29354101620414785, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 33658 + }, + { + "epoch": 0.2935497374893164, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 33659 + }, + { + "epoch": 0.293558458774485, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 33660 + }, + { + "epoch": 0.2935671800596536, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 33661 + }, + { + "epoch": 0.2935759013448222, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 33662 + }, + { + "epoch": 0.29358462262999074, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 33663 + }, + { + "epoch": 0.29359334391515934, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 33664 + }, + { + "epoch": 0.29360206520032794, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 33665 + }, + { + "epoch": 0.2936107864854965, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 33666 + }, + { + "epoch": 0.2936195077706651, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 33667 + }, + { + "epoch": 0.2936282290558337, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 33668 + }, + { + "epoch": 0.29363695034100223, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 33669 + }, + { + "epoch": 0.29364567162617083, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 33670 + }, + { + "epoch": 0.29365439291133943, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 33671 + }, + { + "epoch": 0.293663114196508, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 33672 + }, + { + "epoch": 0.2936718354816766, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 33673 + }, + { + "epoch": 0.2936805567668452, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 33674 + }, + { + "epoch": 0.2936892780520137, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 33675 + }, + { + "epoch": 0.2936979993371823, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 33676 + }, + { + "epoch": 0.2937067206223509, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 33677 + }, + { + "epoch": 0.29371544190751947, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 33678 + }, + { + "epoch": 0.29372416319268807, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 33679 + }, + { + "epoch": 0.29373288447785667, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 33680 + }, + { + "epoch": 0.2937416057630252, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 33681 + }, + { + "epoch": 0.2937503270481938, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 33682 + }, + { + "epoch": 0.2937590483333624, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 33683 + }, + { + "epoch": 0.29376776961853096, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 33684 + }, + { + "epoch": 0.29377649090369956, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 33685 + }, + { + "epoch": 0.29378521218886816, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 33686 + }, + { + "epoch": 0.29379393347403676, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 33687 + }, + { + "epoch": 0.2938026547592053, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 0.9831, + "step": 33688 + }, + { + "epoch": 0.2938113760443739, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 33689 + }, + { + "epoch": 0.2938200973295425, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 33690 + }, + { + "epoch": 0.29382881861471105, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 33691 + }, + { + "epoch": 0.29383753989987965, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 33692 + }, + { + "epoch": 0.29384626118504825, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 33693 + }, + { + "epoch": 0.2938549824702168, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 33694 + }, + { + "epoch": 0.2938637037553854, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 33695 + }, + { + "epoch": 0.293872425040554, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 33696 + }, + { + "epoch": 0.29388114632572254, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 33697 + }, + { + "epoch": 0.29388986761089114, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 33698 + }, + { + "epoch": 0.29389858889605974, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 33699 + }, + { + "epoch": 0.2939073101812283, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 33700 + }, + { + "epoch": 0.2939160314663969, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 33701 + }, + { + "epoch": 0.2939247527515655, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 33702 + }, + { + "epoch": 0.29393347403673403, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 33703 + }, + { + "epoch": 0.29394219532190263, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 33704 + }, + { + "epoch": 0.29395091660707123, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 33705 + }, + { + "epoch": 0.2939596378922398, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 33706 + }, + { + "epoch": 0.2939683591774084, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 33707 + }, + { + "epoch": 0.293977080462577, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 33708 + }, + { + "epoch": 0.2939858017477455, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 33709 + }, + { + "epoch": 0.2939945230329141, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 33710 + }, + { + "epoch": 0.2940032443180827, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 33711 + }, + { + "epoch": 0.29401196560325127, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 33712 + }, + { + "epoch": 0.29402068688841987, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 33713 + }, + { + "epoch": 0.29402940817358847, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 33714 + }, + { + "epoch": 0.29403812945875707, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 33715 + }, + { + "epoch": 0.2940468507439256, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 33716 + }, + { + "epoch": 0.2940555720290942, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 33717 + }, + { + "epoch": 0.2940642933142628, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 33718 + }, + { + "epoch": 0.29407301459943136, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 33719 + }, + { + "epoch": 0.29408173588459996, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 33720 + }, + { + "epoch": 0.29409045716976856, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 33721 + }, + { + "epoch": 0.2940991784549371, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 33722 + }, + { + "epoch": 0.2941078997401057, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 33723 + }, + { + "epoch": 0.2941166210252743, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 33724 + }, + { + "epoch": 0.29412534231044285, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 33725 + }, + { + "epoch": 0.29413406359561145, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 33726 + }, + { + "epoch": 0.29414278488078005, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 33727 + }, + { + "epoch": 0.2941515061659486, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 33728 + }, + { + "epoch": 0.2941602274511172, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 33729 + }, + { + "epoch": 0.2941689487362858, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 33730 + }, + { + "epoch": 0.29417767002145434, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 33731 + }, + { + "epoch": 0.29418639130662294, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 33732 + }, + { + "epoch": 0.29419511259179154, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 33733 + }, + { + "epoch": 0.2942038338769601, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 33734 + }, + { + "epoch": 0.2942125551621287, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 33735 + }, + { + "epoch": 0.2942212764472973, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 33736 + }, + { + "epoch": 0.29422999773246583, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 33737 + }, + { + "epoch": 0.29423871901763443, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9849, + "step": 33738 + }, + { + "epoch": 0.29424744030280303, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 33739 + }, + { + "epoch": 0.2942561615879716, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 33740 + }, + { + "epoch": 0.2942648828731402, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9825, + "step": 33741 + }, + { + "epoch": 0.2942736041583088, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 33742 + }, + { + "epoch": 0.2942823254434774, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 33743 + }, + { + "epoch": 0.2942910467286459, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 33744 + }, + { + "epoch": 0.2942997680138145, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 33745 + }, + { + "epoch": 0.2943084892989831, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 33746 + }, + { + "epoch": 0.29431721058415167, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 33747 + }, + { + "epoch": 0.29432593186932027, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 33748 + }, + { + "epoch": 0.29433465315448887, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 33749 + }, + { + "epoch": 0.2943433744396574, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 33750 + }, + { + "epoch": 0.294352095724826, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 33751 + }, + { + "epoch": 0.2943608170099946, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 33752 + }, + { + "epoch": 0.29436953829516316, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 33753 + }, + { + "epoch": 0.29437825958033176, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 33754 + }, + { + "epoch": 0.29438698086550036, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 33755 + }, + { + "epoch": 0.2943957021506689, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 33756 + }, + { + "epoch": 0.2944044234358375, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 33757 + }, + { + "epoch": 0.2944131447210061, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 33758 + }, + { + "epoch": 0.29442186600617465, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 33759 + }, + { + "epoch": 0.29443058729134325, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 33760 + }, + { + "epoch": 0.29443930857651185, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 33761 + }, + { + "epoch": 0.2944480298616804, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 33762 + }, + { + "epoch": 0.294456751146849, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 33763 + }, + { + "epoch": 0.2944654724320176, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 33764 + }, + { + "epoch": 0.29447419371718614, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 33765 + }, + { + "epoch": 0.29448291500235474, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 33766 + }, + { + "epoch": 0.29449163628752334, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 33767 + }, + { + "epoch": 0.2945003575726919, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 33768 + }, + { + "epoch": 0.2945090788578605, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 33769 + }, + { + "epoch": 0.2945178001430291, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 33770 + }, + { + "epoch": 0.2945265214281977, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 33771 + }, + { + "epoch": 0.29453524271336623, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 33772 + }, + { + "epoch": 0.29454396399853483, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 33773 + }, + { + "epoch": 0.29455268528370343, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 33774 + }, + { + "epoch": 0.294561406568872, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 33775 + }, + { + "epoch": 0.2945701278540406, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 33776 + }, + { + "epoch": 0.2945788491392092, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 33777 + }, + { + "epoch": 0.2945875704243777, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 33778 + }, + { + "epoch": 0.2945962917095463, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 33779 + }, + { + "epoch": 0.2946050129947149, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 33780 + }, + { + "epoch": 0.29461373427988347, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 33781 + }, + { + "epoch": 0.29462245556505207, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 33782 + }, + { + "epoch": 0.29463117685022067, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 33783 + }, + { + "epoch": 0.2946398981353892, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 33784 + }, + { + "epoch": 0.2946486194205578, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 33785 + }, + { + "epoch": 0.2946573407057264, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 33786 + }, + { + "epoch": 0.29466606199089496, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 33787 + }, + { + "epoch": 0.29467478327606356, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 33788 + }, + { + "epoch": 0.29468350456123216, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 33789 + }, + { + "epoch": 0.2946922258464007, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 33790 + }, + { + "epoch": 0.2947009471315693, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 33791 + }, + { + "epoch": 0.2947096684167379, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 33792 + }, + { + "epoch": 0.29471838970190645, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 33793 + }, + { + "epoch": 0.29472711098707505, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 33794 + }, + { + "epoch": 0.29473583227224365, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 33795 + }, + { + "epoch": 0.29474455355741225, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 33796 + }, + { + "epoch": 0.2947532748425808, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 33797 + }, + { + "epoch": 0.2947619961277494, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 33798 + }, + { + "epoch": 0.294770717412918, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 33799 + }, + { + "epoch": 0.29477943869808654, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 33800 + }, + { + "epoch": 0.29478815998325514, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 33801 + }, + { + "epoch": 0.29479688126842374, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 33802 + }, + { + "epoch": 0.2948056025535923, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 33803 + }, + { + "epoch": 0.2948143238387609, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 33804 + }, + { + "epoch": 0.2948230451239295, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 33805 + }, + { + "epoch": 0.29483176640909803, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 33806 + }, + { + "epoch": 0.29484048769426663, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 33807 + }, + { + "epoch": 0.29484920897943523, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 33808 + }, + { + "epoch": 0.2948579302646038, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 33809 + }, + { + "epoch": 0.2948666515497724, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 33810 + }, + { + "epoch": 0.294875372834941, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 33811 + }, + { + "epoch": 0.2948840941201095, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 33812 + }, + { + "epoch": 0.2948928154052781, + "grad_norm": 0.34375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 33813 + }, + { + "epoch": 0.2949015366904467, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 33814 + }, + { + "epoch": 0.29491025797561526, + "grad_norm": 0.33984375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 33815 + }, + { + "epoch": 0.29491897926078386, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 33816 + }, + { + "epoch": 0.29492770054595246, + "grad_norm": 0.361328125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 33817 + }, + { + "epoch": 0.294936421831121, + "grad_norm": 0.404296875, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 33818 + }, + { + "epoch": 0.2949451431162896, + "grad_norm": 0.375, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 33819 + }, + { + "epoch": 0.2949538644014582, + "grad_norm": 0.453125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 33820 + }, + { + "epoch": 0.29496258568662675, + "grad_norm": 0.46484375, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 33821 + }, + { + "epoch": 0.29497130697179536, + "grad_norm": 0.466796875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 33822 + }, + { + "epoch": 0.29498002825696396, + "grad_norm": 0.54296875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 33823 + }, + { + "epoch": 0.29498874954213256, + "grad_norm": 0.46875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 33824 + }, + { + "epoch": 0.2949974708273011, + "grad_norm": 0.5546875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 33825 + }, + { + "epoch": 0.2950061921124697, + "grad_norm": 0.5078125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 33826 + }, + { + "epoch": 0.2950149133976383, + "grad_norm": 0.52734375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 33827 + }, + { + "epoch": 0.29502363468280685, + "grad_norm": 0.578125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 33828 + }, + { + "epoch": 0.29503235596797545, + "grad_norm": 0.5625, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 33829 + }, + { + "epoch": 0.29504107725314405, + "grad_norm": 0.50390625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 33830 + }, + { + "epoch": 0.2950497985383126, + "grad_norm": 0.376953125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 33831 + }, + { + "epoch": 0.2950585198234812, + "grad_norm": 0.431640625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 33832 + }, + { + "epoch": 0.2950672411086498, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 33833 + }, + { + "epoch": 0.29507596239381834, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 33834 + }, + { + "epoch": 0.29508468367898694, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 33835 + }, + { + "epoch": 0.29509340496415554, + "grad_norm": 0.341796875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 33836 + }, + { + "epoch": 0.2951021262493241, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 33837 + }, + { + "epoch": 0.2951108475344927, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 33838 + }, + { + "epoch": 0.2951195688196613, + "grad_norm": 0.345703125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 33839 + }, + { + "epoch": 0.2951282901048298, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 33840 + }, + { + "epoch": 0.2951370113899984, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 33841 + }, + { + "epoch": 0.295145732675167, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 33842 + }, + { + "epoch": 0.29515445396033557, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 33843 + }, + { + "epoch": 0.2951631752455042, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 33844 + }, + { + "epoch": 0.2951718965306728, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 33845 + }, + { + "epoch": 0.2951806178158413, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 33846 + }, + { + "epoch": 0.2951893391010099, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 33847 + }, + { + "epoch": 0.2951980603861785, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 33848 + }, + { + "epoch": 0.29520678167134706, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 33849 + }, + { + "epoch": 0.29521550295651566, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 33850 + }, + { + "epoch": 0.29522422424168426, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 33851 + }, + { + "epoch": 0.29523294552685286, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 33852 + }, + { + "epoch": 0.2952416668120214, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 33853 + }, + { + "epoch": 0.29525038809719, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 33854 + }, + { + "epoch": 0.2952591093823586, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 33855 + }, + { + "epoch": 0.29526783066752715, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 33856 + }, + { + "epoch": 0.29527655195269575, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 33857 + }, + { + "epoch": 0.29528527323786435, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 33858 + }, + { + "epoch": 0.2952939945230329, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 33859 + }, + { + "epoch": 0.2953027158082015, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 33860 + }, + { + "epoch": 0.2953114370933701, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 33861 + }, + { + "epoch": 0.29532015837853864, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 33862 + }, + { + "epoch": 0.29532887966370724, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 0.9811, + "step": 33863 + }, + { + "epoch": 0.29533760094887584, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 33864 + }, + { + "epoch": 0.2953463222340444, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 33865 + }, + { + "epoch": 0.295355043519213, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 33866 + }, + { + "epoch": 0.2953637648043816, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 33867 + }, + { + "epoch": 0.29537248608955013, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 33868 + }, + { + "epoch": 0.29538120737471874, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 33869 + }, + { + "epoch": 0.29538992865988734, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 33870 + }, + { + "epoch": 0.2953986499450559, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 33871 + }, + { + "epoch": 0.2954073712302245, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 33872 + }, + { + "epoch": 0.2954160925153931, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 33873 + }, + { + "epoch": 0.2954248138005616, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 33874 + }, + { + "epoch": 0.2954335350857302, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 33875 + }, + { + "epoch": 0.2954422563708988, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 33876 + }, + { + "epoch": 0.29545097765606737, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 33877 + }, + { + "epoch": 0.29545969894123597, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 33878 + }, + { + "epoch": 0.29546842022640457, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 33879 + }, + { + "epoch": 0.29547714151157317, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 33880 + }, + { + "epoch": 0.2954858627967417, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 33881 + }, + { + "epoch": 0.2954945840819103, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 33882 + }, + { + "epoch": 0.2955033053670789, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 33883 + }, + { + "epoch": 0.29551202665224746, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 33884 + }, + { + "epoch": 0.29552074793741606, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 33885 + }, + { + "epoch": 0.29552946922258466, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 33886 + }, + { + "epoch": 0.2955381905077532, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 33887 + }, + { + "epoch": 0.2955469117929218, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 33888 + }, + { + "epoch": 0.2955556330780904, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 33889 + }, + { + "epoch": 0.29556435436325895, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 33890 + }, + { + "epoch": 0.29557307564842755, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 33891 + }, + { + "epoch": 0.29558179693359615, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 33892 + }, + { + "epoch": 0.2955905182187647, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 33893 + }, + { + "epoch": 0.2955992395039333, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 33894 + }, + { + "epoch": 0.2956079607891019, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 33895 + }, + { + "epoch": 0.29561668207427044, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 33896 + }, + { + "epoch": 0.29562540335943904, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 33897 + }, + { + "epoch": 0.29563412464460764, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 33898 + }, + { + "epoch": 0.2956428459297762, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 33899 + }, + { + "epoch": 0.2956515672149448, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 33900 + }, + { + "epoch": 0.2956602885001134, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 33901 + }, + { + "epoch": 0.29566900978528193, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 33902 + }, + { + "epoch": 0.29567773107045053, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 33903 + }, + { + "epoch": 0.29568645235561913, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 33904 + }, + { + "epoch": 0.29569517364078773, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 33905 + }, + { + "epoch": 0.2957038949259563, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 33906 + }, + { + "epoch": 0.2957126162111249, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 33907 + }, + { + "epoch": 0.2957213374962935, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 33908 + }, + { + "epoch": 0.295730058781462, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 33909 + }, + { + "epoch": 0.2957387800666306, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 33910 + }, + { + "epoch": 0.2957475013517992, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 33911 + }, + { + "epoch": 0.29575622263696777, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 33912 + }, + { + "epoch": 0.29576494392213637, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 33913 + }, + { + "epoch": 0.29577366520730497, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 33914 + }, + { + "epoch": 0.2957823864924735, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 33915 + }, + { + "epoch": 0.2957911077776421, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 33916 + }, + { + "epoch": 0.2957998290628107, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 33917 + }, + { + "epoch": 0.29580855034797926, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 33918 + }, + { + "epoch": 0.29581727163314786, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 33919 + }, + { + "epoch": 0.29582599291831646, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 33920 + }, + { + "epoch": 0.295834714203485, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 33921 + }, + { + "epoch": 0.2958434354886536, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 33922 + }, + { + "epoch": 0.2958521567738222, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 33923 + }, + { + "epoch": 0.29586087805899075, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 33924 + }, + { + "epoch": 0.29586959934415935, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 33925 + }, + { + "epoch": 0.29587832062932795, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 33926 + }, + { + "epoch": 0.2958870419144965, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 33927 + }, + { + "epoch": 0.2958957631996651, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 33928 + }, + { + "epoch": 0.2959044844848337, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 33929 + }, + { + "epoch": 0.29591320577000224, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 33930 + }, + { + "epoch": 0.29592192705517084, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 33931 + }, + { + "epoch": 0.29593064834033944, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 33932 + }, + { + "epoch": 0.29593936962550804, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 33933 + }, + { + "epoch": 0.2959480909106766, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 33934 + }, + { + "epoch": 0.2959568121958452, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 33935 + }, + { + "epoch": 0.2959655334810138, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 33936 + }, + { + "epoch": 0.29597425476618233, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 33937 + }, + { + "epoch": 0.29598297605135093, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 33938 + }, + { + "epoch": 0.29599169733651953, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 33939 + }, + { + "epoch": 0.2960004186216881, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9864, + "step": 33940 + }, + { + "epoch": 0.2960091399068567, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 33941 + }, + { + "epoch": 0.2960178611920253, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 33942 + }, + { + "epoch": 0.2960265824771938, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9848, + "step": 33943 + }, + { + "epoch": 0.2960353037623624, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 33944 + }, + { + "epoch": 0.296044025047531, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 33945 + }, + { + "epoch": 0.29605274633269957, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 33946 + }, + { + "epoch": 0.29606146761786817, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 33947 + }, + { + "epoch": 0.29607018890303677, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 33948 + }, + { + "epoch": 0.2960789101882053, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 33949 + }, + { + "epoch": 0.2960876314733739, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 33950 + }, + { + "epoch": 0.2960963527585425, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 33951 + }, + { + "epoch": 0.29610507404371106, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 33952 + }, + { + "epoch": 0.29611379532887966, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 33953 + }, + { + "epoch": 0.29612251661404826, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 33954 + }, + { + "epoch": 0.2961312378992168, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 33955 + }, + { + "epoch": 0.2961399591843854, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 33956 + }, + { + "epoch": 0.296148680469554, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 33957 + }, + { + "epoch": 0.29615740175472255, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 33958 + }, + { + "epoch": 0.29616612303989115, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 33959 + }, + { + "epoch": 0.29617484432505975, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 33960 + }, + { + "epoch": 0.29618356561022835, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 33961 + }, + { + "epoch": 0.2961922868953969, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 33962 + }, + { + "epoch": 0.2962010081805655, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 33963 + }, + { + "epoch": 0.2962097294657341, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 33964 + }, + { + "epoch": 0.29621845075090264, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 33965 + }, + { + "epoch": 0.29622717203607124, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 33966 + }, + { + "epoch": 0.29623589332123984, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 33967 + }, + { + "epoch": 0.2962446146064084, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 33968 + }, + { + "epoch": 0.296253335891577, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 33969 + }, + { + "epoch": 0.2962620571767456, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 33970 + }, + { + "epoch": 0.29627077846191413, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.9728, + "step": 33971 + }, + { + "epoch": 0.29627949974708273, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 33972 + }, + { + "epoch": 0.29628822103225133, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 33973 + }, + { + "epoch": 0.2962969423174199, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 33974 + }, + { + "epoch": 0.2963056636025885, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 33975 + }, + { + "epoch": 0.2963143848877571, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 33976 + }, + { + "epoch": 0.2963231061729256, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 33977 + }, + { + "epoch": 0.2963318274580942, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 33978 + }, + { + "epoch": 0.2963405487432628, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 33979 + }, + { + "epoch": 0.29634927002843137, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 33980 + }, + { + "epoch": 0.29635799131359997, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 33981 + }, + { + "epoch": 0.29636671259876857, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 33982 + }, + { + "epoch": 0.2963754338839371, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 33983 + }, + { + "epoch": 0.2963841551691057, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 33984 + }, + { + "epoch": 0.2963928764542743, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 33985 + }, + { + "epoch": 0.29640159773944286, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 33986 + }, + { + "epoch": 0.29641031902461146, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 33987 + }, + { + "epoch": 0.29641904030978006, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 33988 + }, + { + "epoch": 0.29642776159494866, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 33989 + }, + { + "epoch": 0.2964364828801172, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 33990 + }, + { + "epoch": 0.2964452041652858, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 33991 + }, + { + "epoch": 0.2964539254504544, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 33992 + }, + { + "epoch": 0.29646264673562295, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 33993 + }, + { + "epoch": 0.29647136802079155, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 33994 + }, + { + "epoch": 0.29648008930596015, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 33995 + }, + { + "epoch": 0.2964888105911287, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 33996 + }, + { + "epoch": 0.2964975318762973, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 33997 + }, + { + "epoch": 0.2965062531614659, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 33998 + }, + { + "epoch": 0.29651497444663444, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 33999 + }, + { + "epoch": 0.29652369573180304, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 34000 + }, + { + "epoch": 0.29653241701697164, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 34001 + }, + { + "epoch": 0.2965411383021402, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 34002 + }, + { + "epoch": 0.2965498595873088, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 34003 + }, + { + "epoch": 0.2965585808724774, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 34004 + }, + { + "epoch": 0.29656730215764593, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 34005 + }, + { + "epoch": 0.29657602344281453, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 34006 + }, + { + "epoch": 0.29658474472798313, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 34007 + }, + { + "epoch": 0.2965934660131517, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 34008 + }, + { + "epoch": 0.2966021872983203, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 34009 + }, + { + "epoch": 0.2966109085834889, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 34010 + }, + { + "epoch": 0.2966196298686574, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 34011 + }, + { + "epoch": 0.296628351153826, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 34012 + }, + { + "epoch": 0.2966370724389946, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 34013 + }, + { + "epoch": 0.2966457937241632, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 34014 + }, + { + "epoch": 0.29665451500933177, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 34015 + }, + { + "epoch": 0.29666323629450037, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 34016 + }, + { + "epoch": 0.29667195757966897, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 34017 + }, + { + "epoch": 0.2966806788648375, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 34018 + }, + { + "epoch": 0.2966894001500061, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 34019 + }, + { + "epoch": 0.2966981214351747, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 34020 + }, + { + "epoch": 0.29670684272034326, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 34021 + }, + { + "epoch": 0.29671556400551186, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 34022 + }, + { + "epoch": 0.29672428529068046, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 34023 + }, + { + "epoch": 0.296733006575849, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 34024 + }, + { + "epoch": 0.2967417278610176, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 34025 + }, + { + "epoch": 0.2967504491461862, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 34026 + }, + { + "epoch": 0.29675917043135475, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 34027 + }, + { + "epoch": 0.29676789171652335, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 34028 + }, + { + "epoch": 0.29677661300169195, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 34029 + }, + { + "epoch": 0.2967853342868605, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 34030 + }, + { + "epoch": 0.2967940555720291, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 34031 + }, + { + "epoch": 0.2968027768571977, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 34032 + }, + { + "epoch": 0.29681149814236624, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 34033 + }, + { + "epoch": 0.29682021942753484, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 34034 + }, + { + "epoch": 0.29682894071270344, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 34035 + }, + { + "epoch": 0.296837661997872, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 34036 + }, + { + "epoch": 0.2968463832830406, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 34037 + }, + { + "epoch": 0.2968551045682092, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 34038 + }, + { + "epoch": 0.2968638258533777, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 34039 + }, + { + "epoch": 0.2968725471385463, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 34040 + }, + { + "epoch": 0.29688126842371493, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 34041 + }, + { + "epoch": 0.29688998970888353, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 34042 + }, + { + "epoch": 0.2968987109940521, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 34043 + }, + { + "epoch": 0.2969074322792207, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 34044 + }, + { + "epoch": 0.2969161535643893, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 34045 + }, + { + "epoch": 0.2969248748495578, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 34046 + }, + { + "epoch": 0.2969335961347264, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 34047 + }, + { + "epoch": 0.296942317419895, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 34048 + }, + { + "epoch": 0.29695103870506356, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 34049 + }, + { + "epoch": 0.29695975999023216, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 34050 + }, + { + "epoch": 0.29696848127540076, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 34051 + }, + { + "epoch": 0.2969772025605693, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 34052 + }, + { + "epoch": 0.2969859238457379, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 34053 + }, + { + "epoch": 0.2969946451309065, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 34054 + }, + { + "epoch": 0.29700336641607505, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 34055 + }, + { + "epoch": 0.29701208770124365, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 34056 + }, + { + "epoch": 0.29702080898641225, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 34057 + }, + { + "epoch": 0.2970295302715808, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 34058 + }, + { + "epoch": 0.2970382515567494, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 34059 + }, + { + "epoch": 0.297046972841918, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 34060 + }, + { + "epoch": 0.29705569412708654, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 34061 + }, + { + "epoch": 0.29706441541225515, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 34062 + }, + { + "epoch": 0.29707313669742375, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 34063 + }, + { + "epoch": 0.2970818579825923, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 34064 + }, + { + "epoch": 0.2970905792677609, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9843, + "step": 34065 + }, + { + "epoch": 0.2970993005529295, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 34066 + }, + { + "epoch": 0.29710802183809804, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 34067 + }, + { + "epoch": 0.29711674312326664, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 34068 + }, + { + "epoch": 0.29712546440843524, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 34069 + }, + { + "epoch": 0.29713418569360384, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 34070 + }, + { + "epoch": 0.2971429069787724, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 34071 + }, + { + "epoch": 0.297151628263941, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 34072 + }, + { + "epoch": 0.2971603495491096, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 34073 + }, + { + "epoch": 0.2971690708342781, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 34074 + }, + { + "epoch": 0.2971777921194467, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 34075 + }, + { + "epoch": 0.2971865134046153, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 34076 + }, + { + "epoch": 0.29719523468978387, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 34077 + }, + { + "epoch": 0.29720395597495247, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 34078 + }, + { + "epoch": 0.29721267726012107, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 34079 + }, + { + "epoch": 0.2972213985452896, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 34080 + }, + { + "epoch": 0.2972301198304582, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 34081 + }, + { + "epoch": 0.2972388411156268, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 34082 + }, + { + "epoch": 0.29724756240079536, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 34083 + }, + { + "epoch": 0.29725628368596396, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 34084 + }, + { + "epoch": 0.29726500497113256, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 34085 + }, + { + "epoch": 0.2972737262563011, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 34086 + }, + { + "epoch": 0.2972824475414697, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 34087 + }, + { + "epoch": 0.2972911688266383, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 34088 + }, + { + "epoch": 0.29729989011180685, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 34089 + }, + { + "epoch": 0.29730861139697545, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 34090 + }, + { + "epoch": 0.29731733268214405, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 34091 + }, + { + "epoch": 0.2973260539673126, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 34092 + }, + { + "epoch": 0.2973347752524812, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 34093 + }, + { + "epoch": 0.2973434965376498, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 34094 + }, + { + "epoch": 0.29735221782281834, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 34095 + }, + { + "epoch": 0.29736093910798694, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 34096 + }, + { + "epoch": 0.29736966039315554, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 34097 + }, + { + "epoch": 0.29737838167832414, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 34098 + }, + { + "epoch": 0.2973871029634927, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 34099 + }, + { + "epoch": 0.2973958242486613, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 34100 + }, + { + "epoch": 0.2974045455338299, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 34101 + }, + { + "epoch": 0.29741326681899843, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 34102 + }, + { + "epoch": 0.29742198810416703, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 34103 + }, + { + "epoch": 0.29743070938933563, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 34104 + }, + { + "epoch": 0.2974394306745042, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 34105 + }, + { + "epoch": 0.2974481519596728, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 34106 + }, + { + "epoch": 0.2974568732448414, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 34107 + }, + { + "epoch": 0.2974655945300099, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 34108 + }, + { + "epoch": 0.2974743158151785, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 34109 + }, + { + "epoch": 0.2974830371003471, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 34110 + }, + { + "epoch": 0.29749175838551567, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 34111 + }, + { + "epoch": 0.29750047967068427, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 34112 + }, + { + "epoch": 0.29750920095585287, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 34113 + }, + { + "epoch": 0.2975179222410214, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 34114 + }, + { + "epoch": 0.29752664352619, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 34115 + }, + { + "epoch": 0.2975353648113586, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 34116 + }, + { + "epoch": 0.29754408609652716, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 34117 + }, + { + "epoch": 0.29755280738169576, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 34118 + }, + { + "epoch": 0.29756152866686436, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 34119 + }, + { + "epoch": 0.2975702499520329, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 34120 + }, + { + "epoch": 0.2975789712372015, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 34121 + }, + { + "epoch": 0.2975876925223701, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 34122 + }, + { + "epoch": 0.2975964138075387, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 34123 + }, + { + "epoch": 0.29760513509270725, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 34124 + }, + { + "epoch": 0.29761385637787585, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 34125 + }, + { + "epoch": 0.29762257766304445, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 34126 + }, + { + "epoch": 0.297631298948213, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 34127 + }, + { + "epoch": 0.2976400202333816, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 34128 + }, + { + "epoch": 0.2976487415185502, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 34129 + }, + { + "epoch": 0.29765746280371874, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 34130 + }, + { + "epoch": 0.29766618408888734, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 34131 + }, + { + "epoch": 0.29767490537405594, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 34132 + }, + { + "epoch": 0.2976836266592245, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 34133 + }, + { + "epoch": 0.2976923479443931, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 34134 + }, + { + "epoch": 0.2977010692295617, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 34135 + }, + { + "epoch": 0.29770979051473023, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 34136 + }, + { + "epoch": 0.29771851179989883, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 34137 + }, + { + "epoch": 0.29772723308506743, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 34138 + }, + { + "epoch": 0.297735954370236, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 34139 + }, + { + "epoch": 0.2977446756554046, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 34140 + }, + { + "epoch": 0.2977533969405732, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 34141 + }, + { + "epoch": 0.2977621182257417, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 34142 + }, + { + "epoch": 0.2977708395109103, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 34143 + }, + { + "epoch": 0.2977795607960789, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 34144 + }, + { + "epoch": 0.29778828208124747, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 34145 + }, + { + "epoch": 0.29779700336641607, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 34146 + }, + { + "epoch": 0.29780572465158467, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 34147 + }, + { + "epoch": 0.2978144459367532, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 34148 + }, + { + "epoch": 0.2978231672219218, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 0.9862, + "step": 34149 + }, + { + "epoch": 0.2978318885070904, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 34150 + }, + { + "epoch": 0.297840609792259, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 34151 + }, + { + "epoch": 0.29784933107742756, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 34152 + }, + { + "epoch": 0.29785805236259616, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 34153 + }, + { + "epoch": 0.29786677364776476, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 34154 + }, + { + "epoch": 0.2978754949329333, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 34155 + }, + { + "epoch": 0.2978842162181019, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 34156 + }, + { + "epoch": 0.2978929375032705, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 34157 + }, + { + "epoch": 0.29790165878843905, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 34158 + }, + { + "epoch": 0.29791038007360765, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 34159 + }, + { + "epoch": 0.29791910135877625, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 34160 + }, + { + "epoch": 0.2979278226439448, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 34161 + }, + { + "epoch": 0.2979365439291134, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 34162 + }, + { + "epoch": 0.297945265214282, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 34163 + }, + { + "epoch": 0.29795398649945054, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 34164 + }, + { + "epoch": 0.29796270778461914, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 34165 + }, + { + "epoch": 0.29797142906978774, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 34166 + }, + { + "epoch": 0.2979801503549563, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 34167 + }, + { + "epoch": 0.2979888716401249, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 34168 + }, + { + "epoch": 0.2979975929252935, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 34169 + }, + { + "epoch": 0.29800631421046203, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 34170 + }, + { + "epoch": 0.29801503549563063, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 34171 + }, + { + "epoch": 0.29802375678079923, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 34172 + }, + { + "epoch": 0.2980324780659678, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 34173 + }, + { + "epoch": 0.2980411993511364, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 34174 + }, + { + "epoch": 0.298049920636305, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 34175 + }, + { + "epoch": 0.2980586419214735, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 34176 + }, + { + "epoch": 0.2980673632066421, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 34177 + }, + { + "epoch": 0.2980760844918107, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 34178 + }, + { + "epoch": 0.2980848057769793, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 34179 + }, + { + "epoch": 0.29809352706214787, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 34180 + }, + { + "epoch": 0.29810224834731647, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 34181 + }, + { + "epoch": 0.29811096963248507, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 34182 + }, + { + "epoch": 0.2981196909176536, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 34183 + }, + { + "epoch": 0.2981284122028222, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 34184 + }, + { + "epoch": 0.2981371334879908, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 34185 + }, + { + "epoch": 0.29814585477315936, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 34186 + }, + { + "epoch": 0.29815457605832796, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 34187 + }, + { + "epoch": 0.29816329734349656, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 34188 + }, + { + "epoch": 0.2981720186286651, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 34189 + }, + { + "epoch": 0.2981807399138337, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 34190 + }, + { + "epoch": 0.2981894611990023, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 34191 + }, + { + "epoch": 0.29819818248417085, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 34192 + }, + { + "epoch": 0.29820690376933945, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 34193 + }, + { + "epoch": 0.29821562505450805, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 34194 + }, + { + "epoch": 0.2982243463396766, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 34195 + }, + { + "epoch": 0.2982330676248452, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 34196 + }, + { + "epoch": 0.2982417889100138, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 34197 + }, + { + "epoch": 0.29825051019518234, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 34198 + }, + { + "epoch": 0.29825923148035094, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 34199 + }, + { + "epoch": 0.29826795276551954, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 34200 + }, + { + "epoch": 0.2982766740506881, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 34201 + }, + { + "epoch": 0.2982853953358567, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 34202 + }, + { + "epoch": 0.2982941166210253, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 34203 + }, + { + "epoch": 0.29830283790619383, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 34204 + }, + { + "epoch": 0.29831155919136243, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 34205 + }, + { + "epoch": 0.29832028047653103, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 34206 + }, + { + "epoch": 0.29832900176169963, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 34207 + }, + { + "epoch": 0.2983377230468682, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 34208 + }, + { + "epoch": 0.2983464443320368, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 34209 + }, + { + "epoch": 0.2983551656172054, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 34210 + }, + { + "epoch": 0.2983638869023739, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 34211 + }, + { + "epoch": 0.2983726081875425, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 34212 + }, + { + "epoch": 0.2983813294727111, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 34213 + }, + { + "epoch": 0.29839005075787967, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 34214 + }, + { + "epoch": 0.29839877204304827, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 34215 + }, + { + "epoch": 0.29840749332821687, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 34216 + }, + { + "epoch": 0.2984162146133854, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 34217 + }, + { + "epoch": 0.298424935898554, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 34218 + }, + { + "epoch": 0.2984336571837226, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 34219 + }, + { + "epoch": 0.29844237846889116, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 34220 + }, + { + "epoch": 0.29845109975405976, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 34221 + }, + { + "epoch": 0.29845982103922836, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 34222 + }, + { + "epoch": 0.2984685423243969, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 34223 + }, + { + "epoch": 0.2984772636095655, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 34224 + }, + { + "epoch": 0.2984859848947341, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 34225 + }, + { + "epoch": 0.29849470617990265, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 34226 + }, + { + "epoch": 0.29850342746507125, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 34227 + }, + { + "epoch": 0.29851214875023985, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 34228 + }, + { + "epoch": 0.2985208700354084, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 34229 + }, + { + "epoch": 0.298529591320577, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 34230 + }, + { + "epoch": 0.2985383126057456, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 34231 + }, + { + "epoch": 0.29854703389091414, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 34232 + }, + { + "epoch": 0.29855575517608274, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 34233 + }, + { + "epoch": 0.29856447646125134, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 34234 + }, + { + "epoch": 0.29857319774641994, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 34235 + }, + { + "epoch": 0.2985819190315885, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 34236 + }, + { + "epoch": 0.2985906403167571, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 34237 + }, + { + "epoch": 0.2985993616019257, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 34238 + }, + { + "epoch": 0.29860808288709423, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 34239 + }, + { + "epoch": 0.29861680417226283, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 34240 + }, + { + "epoch": 0.29862552545743143, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 34241 + }, + { + "epoch": 0.2986342467426, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 34242 + }, + { + "epoch": 0.2986429680277686, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 34243 + }, + { + "epoch": 0.2986516893129372, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 34244 + }, + { + "epoch": 0.2986604105981057, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 34245 + }, + { + "epoch": 0.2986691318832743, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 34246 + }, + { + "epoch": 0.2986778531684429, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 34247 + }, + { + "epoch": 0.29868657445361146, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 34248 + }, + { + "epoch": 0.29869529573878006, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 34249 + }, + { + "epoch": 0.29870401702394866, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 34250 + }, + { + "epoch": 0.2987127383091172, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 34251 + }, + { + "epoch": 0.2987214595942858, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 34252 + }, + { + "epoch": 0.2987301808794544, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 34253 + }, + { + "epoch": 0.29873890216462295, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 34254 + }, + { + "epoch": 0.29874762344979156, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 34255 + }, + { + "epoch": 0.29875634473496016, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 34256 + }, + { + "epoch": 0.2987650660201287, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 34257 + }, + { + "epoch": 0.2987737873052973, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 34258 + }, + { + "epoch": 0.2987825085904659, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 34259 + }, + { + "epoch": 0.2987912298756345, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 34260 + }, + { + "epoch": 0.29879995116080305, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 34261 + }, + { + "epoch": 0.29880867244597165, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 34262 + }, + { + "epoch": 0.29881739373114025, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 34263 + }, + { + "epoch": 0.2988261150163088, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 34264 + }, + { + "epoch": 0.2988348363014774, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 34265 + }, + { + "epoch": 0.298843557586646, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 34266 + }, + { + "epoch": 0.29885227887181454, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 34267 + }, + { + "epoch": 0.29886100015698314, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 34268 + }, + { + "epoch": 0.29886972144215174, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 34269 + }, + { + "epoch": 0.2988784427273203, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 34270 + }, + { + "epoch": 0.2988871640124889, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 34271 + }, + { + "epoch": 0.2988958852976575, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 34272 + }, + { + "epoch": 0.298904606582826, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 34273 + }, + { + "epoch": 0.2989133278679946, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 34274 + }, + { + "epoch": 0.2989220491531632, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 34275 + }, + { + "epoch": 0.29893077043833177, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 34276 + }, + { + "epoch": 0.2989394917235004, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 34277 + }, + { + "epoch": 0.298948213008669, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 34278 + }, + { + "epoch": 0.2989569342938375, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 34279 + }, + { + "epoch": 0.2989656555790061, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 34280 + }, + { + "epoch": 0.2989743768641747, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 34281 + }, + { + "epoch": 0.29898309814934326, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 34282 + }, + { + "epoch": 0.29899181943451186, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 34283 + }, + { + "epoch": 0.29900054071968046, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 34284 + }, + { + "epoch": 0.299009262004849, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 34285 + }, + { + "epoch": 0.2990179832900176, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 34286 + }, + { + "epoch": 0.2990267045751862, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 34287 + }, + { + "epoch": 0.2990354258603548, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 34288 + }, + { + "epoch": 0.29904414714552335, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 34289 + }, + { + "epoch": 0.29905286843069195, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 34290 + }, + { + "epoch": 0.29906158971586055, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 34291 + }, + { + "epoch": 0.2990703110010291, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 34292 + }, + { + "epoch": 0.2990790322861977, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 34293 + }, + { + "epoch": 0.2990877535713663, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 34294 + }, + { + "epoch": 0.29909647485653484, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 34295 + }, + { + "epoch": 0.29910519614170344, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 34296 + }, + { + "epoch": 0.29911391742687204, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 34297 + }, + { + "epoch": 0.2991226387120406, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 34298 + }, + { + "epoch": 0.2991313599972092, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 34299 + }, + { + "epoch": 0.2991400812823778, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 34300 + }, + { + "epoch": 0.29914880256754633, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 34301 + }, + { + "epoch": 0.29915752385271493, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 34302 + }, + { + "epoch": 0.29916624513788354, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 34303 + }, + { + "epoch": 0.2991749664230521, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 34304 + }, + { + "epoch": 0.2991836877082207, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 34305 + }, + { + "epoch": 0.2991924089933893, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 34306 + }, + { + "epoch": 0.2992011302785578, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 34307 + }, + { + "epoch": 0.2992098515637264, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 34308 + }, + { + "epoch": 0.299218572848895, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 34309 + }, + { + "epoch": 0.29922729413406357, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 34310 + }, + { + "epoch": 0.29923601541923217, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 34311 + }, + { + "epoch": 0.29924473670440077, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 34312 + }, + { + "epoch": 0.2992534579895693, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 34313 + }, + { + "epoch": 0.2992621792747379, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 34314 + }, + { + "epoch": 0.2992709005599065, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 34315 + }, + { + "epoch": 0.2992796218450751, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 34316 + }, + { + "epoch": 0.29928834313024366, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 34317 + }, + { + "epoch": 0.29929706441541226, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 34318 + }, + { + "epoch": 0.29930578570058086, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 34319 + }, + { + "epoch": 0.2993145069857494, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 34320 + }, + { + "epoch": 0.299323228270918, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 34321 + }, + { + "epoch": 0.2993319495560866, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 34322 + }, + { + "epoch": 0.29934067084125515, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 34323 + }, + { + "epoch": 0.29934939212642375, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 34324 + }, + { + "epoch": 0.29935811341159235, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 34325 + }, + { + "epoch": 0.2993668346967609, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 34326 + }, + { + "epoch": 0.2993755559819295, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 34327 + }, + { + "epoch": 0.2993842772670981, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 34328 + }, + { + "epoch": 0.29939299855226664, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 34329 + }, + { + "epoch": 0.29940171983743524, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 34330 + }, + { + "epoch": 0.29941044112260384, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 34331 + }, + { + "epoch": 0.2994191624077724, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 34332 + }, + { + "epoch": 0.299427883692941, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 34333 + }, + { + "epoch": 0.2994366049781096, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 34334 + }, + { + "epoch": 0.29944532626327813, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 34335 + }, + { + "epoch": 0.29945404754844673, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 34336 + }, + { + "epoch": 0.29946276883361533, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 34337 + }, + { + "epoch": 0.2994714901187839, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 34338 + }, + { + "epoch": 0.2994802114039525, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 34339 + }, + { + "epoch": 0.2994889326891211, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 34340 + }, + { + "epoch": 0.2994976539742896, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 34341 + }, + { + "epoch": 0.2995063752594582, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 34342 + }, + { + "epoch": 0.2995150965446268, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 34343 + }, + { + "epoch": 0.2995238178297954, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 34344 + }, + { + "epoch": 0.29953253911496397, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 34345 + }, + { + "epoch": 0.29954126040013257, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 34346 + }, + { + "epoch": 0.29954998168530117, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 34347 + }, + { + "epoch": 0.2995587029704697, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 34348 + }, + { + "epoch": 0.2995674242556383, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 34349 + }, + { + "epoch": 0.2995761455408069, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 34350 + }, + { + "epoch": 0.29958486682597546, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 34351 + }, + { + "epoch": 0.29959358811114406, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 34352 + }, + { + "epoch": 0.29960230939631266, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 34353 + }, + { + "epoch": 0.2996110306814812, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 34354 + }, + { + "epoch": 0.2996197519666498, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 34355 + }, + { + "epoch": 0.2996284732518184, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 34356 + }, + { + "epoch": 0.29963719453698695, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 34357 + }, + { + "epoch": 0.29964591582215555, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 34358 + }, + { + "epoch": 0.29965463710732415, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 34359 + }, + { + "epoch": 0.2996633583924927, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 34360 + }, + { + "epoch": 0.2996720796776613, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 34361 + }, + { + "epoch": 0.2996808009628299, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 34362 + }, + { + "epoch": 0.29968952224799844, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 34363 + }, + { + "epoch": 0.29969824353316704, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 34364 + }, + { + "epoch": 0.29970696481833564, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 34365 + }, + { + "epoch": 0.2997156861035042, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 34366 + }, + { + "epoch": 0.2997244073886728, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 34367 + }, + { + "epoch": 0.2997331286738414, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 34368 + }, + { + "epoch": 0.29974184995901, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 34369 + }, + { + "epoch": 0.29975057124417853, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 34370 + }, + { + "epoch": 0.29975929252934713, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 34371 + }, + { + "epoch": 0.29976801381451573, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 34372 + }, + { + "epoch": 0.2997767350996843, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 34373 + }, + { + "epoch": 0.2997854563848529, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9852, + "step": 34374 + }, + { + "epoch": 0.2997941776700215, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 34375 + }, + { + "epoch": 0.29980289895519, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 34376 + }, + { + "epoch": 0.2998116202403586, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 34377 + }, + { + "epoch": 0.2998203415255272, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 34378 + }, + { + "epoch": 0.29982906281069577, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 34379 + }, + { + "epoch": 0.29983778409586437, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 34380 + }, + { + "epoch": 0.29984650538103297, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 34381 + }, + { + "epoch": 0.2998552266662015, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 34382 + }, + { + "epoch": 0.2998639479513701, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 34383 + }, + { + "epoch": 0.2998726692365387, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 34384 + }, + { + "epoch": 0.29988139052170726, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 34385 + }, + { + "epoch": 0.29989011180687586, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 34386 + }, + { + "epoch": 0.29989883309204446, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 34387 + }, + { + "epoch": 0.299907554377213, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 34388 + }, + { + "epoch": 0.2999162756623816, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 34389 + }, + { + "epoch": 0.2999249969475502, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 34390 + }, + { + "epoch": 0.29993371823271875, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 34391 + }, + { + "epoch": 0.29994243951788735, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 34392 + }, + { + "epoch": 0.29995116080305595, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 34393 + }, + { + "epoch": 0.2999598820882245, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 34394 + }, + { + "epoch": 0.2999686033733931, + "grad_norm": 0.34765625, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 34395 + }, + { + "epoch": 0.2999773246585617, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 34396 + }, + { + "epoch": 0.2999860459437303, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 34397 + }, + { + "epoch": 0.29999476722889884, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 34398 + }, + { + "epoch": 0.30000348851406744, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 34399 + }, + { + "epoch": 0.30001220979923604, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 34400 + }, + { + "epoch": 0.3000209310844046, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 34401 + }, + { + "epoch": 0.3000296523695732, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 34402 + }, + { + "epoch": 0.3000383736547418, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 34403 + }, + { + "epoch": 0.30004709493991033, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 34404 + }, + { + "epoch": 0.30005581622507893, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 34405 + }, + { + "epoch": 0.30006453751024753, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 34406 + }, + { + "epoch": 0.3000732587954161, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 34407 + }, + { + "epoch": 0.3000819800805847, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 34408 + }, + { + "epoch": 0.3000907013657533, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 34409 + }, + { + "epoch": 0.3000994226509218, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 34410 + }, + { + "epoch": 0.3001081439360904, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 34411 + }, + { + "epoch": 0.300116865221259, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 34412 + }, + { + "epoch": 0.30012558650642757, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 34413 + }, + { + "epoch": 0.30013430779159617, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 34414 + }, + { + "epoch": 0.30014302907676477, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 34415 + }, + { + "epoch": 0.3001517503619333, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 34416 + }, + { + "epoch": 0.3001604716471019, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 34417 + }, + { + "epoch": 0.3001691929322705, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 34418 + }, + { + "epoch": 0.30017791421743906, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 34419 + }, + { + "epoch": 0.30018663550260766, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 34420 + }, + { + "epoch": 0.30019535678777626, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 34421 + }, + { + "epoch": 0.3002040780729448, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 34422 + }, + { + "epoch": 0.3002127993581134, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 34423 + }, + { + "epoch": 0.300221520643282, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 34424 + }, + { + "epoch": 0.3002302419284506, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 34425 + }, + { + "epoch": 0.30023896321361915, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 34426 + }, + { + "epoch": 0.30024768449878775, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 34427 + }, + { + "epoch": 0.30025640578395635, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 34428 + }, + { + "epoch": 0.3002651270691249, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 34429 + }, + { + "epoch": 0.3002738483542935, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 34430 + }, + { + "epoch": 0.3002825696394621, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 34431 + }, + { + "epoch": 0.30029129092463064, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 34432 + }, + { + "epoch": 0.30030001220979924, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 34433 + }, + { + "epoch": 0.30030873349496784, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 34434 + }, + { + "epoch": 0.3003174547801364, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 34435 + }, + { + "epoch": 0.300326176065305, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 34436 + }, + { + "epoch": 0.3003348973504736, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 34437 + }, + { + "epoch": 0.30034361863564213, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 34438 + }, + { + "epoch": 0.30035233992081073, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 34439 + }, + { + "epoch": 0.30036106120597933, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 34440 + }, + { + "epoch": 0.3003697824911479, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 34441 + }, + { + "epoch": 0.3003785037763165, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 34442 + }, + { + "epoch": 0.3003872250614851, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 34443 + }, + { + "epoch": 0.3003959463466536, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 34444 + }, + { + "epoch": 0.3004046676318222, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 34445 + }, + { + "epoch": 0.3004133889169908, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 34446 + }, + { + "epoch": 0.30042211020215936, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 34447 + }, + { + "epoch": 0.30043083148732797, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 34448 + }, + { + "epoch": 0.30043955277249657, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 34449 + }, + { + "epoch": 0.3004482740576651, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 34450 + }, + { + "epoch": 0.3004569953428337, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 34451 + }, + { + "epoch": 0.3004657166280023, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 34452 + }, + { + "epoch": 0.3004744379131709, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 34453 + }, + { + "epoch": 0.30048315919833946, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 34454 + }, + { + "epoch": 0.30049188048350806, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 34455 + }, + { + "epoch": 0.30050060176867666, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 34456 + }, + { + "epoch": 0.3005093230538452, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 34457 + }, + { + "epoch": 0.3005180443390138, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 34458 + }, + { + "epoch": 0.3005267656241824, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 34459 + }, + { + "epoch": 0.30053548690935095, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 34460 + }, + { + "epoch": 0.30054420819451955, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 34461 + }, + { + "epoch": 0.30055292947968815, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 34462 + }, + { + "epoch": 0.3005616507648567, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 34463 + }, + { + "epoch": 0.3005703720500253, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 34464 + }, + { + "epoch": 0.3005790933351939, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 34465 + }, + { + "epoch": 0.30058781462036244, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 34466 + }, + { + "epoch": 0.30059653590553104, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 34467 + }, + { + "epoch": 0.30060525719069964, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 34468 + }, + { + "epoch": 0.3006139784758682, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 34469 + }, + { + "epoch": 0.3006226997610368, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 34470 + }, + { + "epoch": 0.3006314210462054, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 34471 + }, + { + "epoch": 0.3006401423313739, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 34472 + }, + { + "epoch": 0.3006488636165425, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 34473 + }, + { + "epoch": 0.30065758490171113, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 34474 + }, + { + "epoch": 0.3006663061868797, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 34475 + }, + { + "epoch": 0.3006750274720483, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 34476 + }, + { + "epoch": 0.3006837487572169, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 34477 + }, + { + "epoch": 0.3006924700423855, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 34478 + }, + { + "epoch": 0.300701191327554, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 34479 + }, + { + "epoch": 0.3007099126127226, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 34480 + }, + { + "epoch": 0.3007186338978912, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 34481 + }, + { + "epoch": 0.30072735518305976, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 34482 + }, + { + "epoch": 0.30073607646822836, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 34483 + }, + { + "epoch": 0.30074479775339696, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 34484 + }, + { + "epoch": 0.3007535190385655, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 34485 + }, + { + "epoch": 0.3007622403237341, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 34486 + }, + { + "epoch": 0.3007709616089027, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 34487 + }, + { + "epoch": 0.30077968289407125, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 34488 + }, + { + "epoch": 0.30078840417923985, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 34489 + }, + { + "epoch": 0.30079712546440845, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 34490 + }, + { + "epoch": 0.300805846749577, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 34491 + }, + { + "epoch": 0.3008145680347456, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 34492 + }, + { + "epoch": 0.3008232893199142, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 34493 + }, + { + "epoch": 0.30083201060508274, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 34494 + }, + { + "epoch": 0.30084073189025134, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 34495 + }, + { + "epoch": 0.30084945317541995, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 34496 + }, + { + "epoch": 0.3008581744605885, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 34497 + }, + { + "epoch": 0.3008668957457571, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 34498 + }, + { + "epoch": 0.3008756170309257, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 34499 + }, + { + "epoch": 0.30088433831609424, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 34500 + }, + { + "epoch": 0.30089305960126284, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 34501 + }, + { + "epoch": 0.30090178088643144, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 34502 + }, + { + "epoch": 0.3009105021716, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 34503 + }, + { + "epoch": 0.3009192234567686, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 34504 + }, + { + "epoch": 0.3009279447419372, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 34505 + }, + { + "epoch": 0.3009366660271058, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 34506 + }, + { + "epoch": 0.3009453873122743, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 34507 + }, + { + "epoch": 0.3009541085974429, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 34508 + }, + { + "epoch": 0.3009628298826115, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 34509 + }, + { + "epoch": 0.30097155116778007, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 34510 + }, + { + "epoch": 0.30098027245294867, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 34511 + }, + { + "epoch": 0.30098899373811727, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 34512 + }, + { + "epoch": 0.3009977150232858, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 34513 + }, + { + "epoch": 0.3010064363084544, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 34514 + }, + { + "epoch": 0.301015157593623, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 34515 + }, + { + "epoch": 0.30102387887879156, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 34516 + }, + { + "epoch": 0.30103260016396016, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 34517 + }, + { + "epoch": 0.30104132144912876, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 34518 + }, + { + "epoch": 0.3010500427342973, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 34519 + }, + { + "epoch": 0.3010587640194659, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 34520 + }, + { + "epoch": 0.3010674853046345, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 34521 + }, + { + "epoch": 0.30107620658980305, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 34522 + }, + { + "epoch": 0.30108492787497165, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 34523 + }, + { + "epoch": 0.30109364916014025, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 34524 + }, + { + "epoch": 0.3011023704453088, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 34525 + }, + { + "epoch": 0.3011110917304774, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 34526 + }, + { + "epoch": 0.301119813015646, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 34527 + }, + { + "epoch": 0.30112853430081454, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 34528 + }, + { + "epoch": 0.30113725558598314, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 34529 + }, + { + "epoch": 0.30114597687115174, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 34530 + }, + { + "epoch": 0.3011546981563203, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 34531 + }, + { + "epoch": 0.3011634194414889, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 34532 + }, + { + "epoch": 0.3011721407266575, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 34533 + }, + { + "epoch": 0.3011808620118261, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 34534 + }, + { + "epoch": 0.30118958329699463, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 34535 + }, + { + "epoch": 0.30119830458216323, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 34536 + }, + { + "epoch": 0.30120702586733183, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 34537 + }, + { + "epoch": 0.3012157471525004, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 34538 + }, + { + "epoch": 0.301224468437669, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 34539 + }, + { + "epoch": 0.3012331897228376, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 34540 + }, + { + "epoch": 0.3012419110080061, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 34541 + }, + { + "epoch": 0.3012506322931747, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 34542 + }, + { + "epoch": 0.3012593535783433, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 34543 + }, + { + "epoch": 0.30126807486351187, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 34544 + }, + { + "epoch": 0.30127679614868047, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 34545 + }, + { + "epoch": 0.30128551743384907, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 34546 + }, + { + "epoch": 0.3012942387190176, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 34547 + }, + { + "epoch": 0.3013029600041862, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 34548 + }, + { + "epoch": 0.3013116812893548, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 34549 + }, + { + "epoch": 0.30132040257452336, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 34550 + }, + { + "epoch": 0.30132912385969196, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 34551 + }, + { + "epoch": 0.30133784514486056, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 34552 + }, + { + "epoch": 0.3013465664300291, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 34553 + }, + { + "epoch": 0.3013552877151977, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 34554 + }, + { + "epoch": 0.3013640090003663, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 34555 + }, + { + "epoch": 0.30137273028553485, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 34556 + }, + { + "epoch": 0.30138145157070345, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 34557 + }, + { + "epoch": 0.30139017285587205, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 34558 + }, + { + "epoch": 0.3013988941410406, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 34559 + }, + { + "epoch": 0.3014076154262092, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 34560 + }, + { + "epoch": 0.3014163367113778, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 34561 + }, + { + "epoch": 0.3014250579965464, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 34562 + }, + { + "epoch": 0.30143377928171494, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 34563 + }, + { + "epoch": 0.30144250056688354, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 34564 + }, + { + "epoch": 0.30145122185205214, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9766, + "step": 34565 + }, + { + "epoch": 0.3014599431372207, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 34566 + }, + { + "epoch": 0.3014686644223893, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 34567 + }, + { + "epoch": 0.3014773857075579, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 34568 + }, + { + "epoch": 0.30148610699272643, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 34569 + }, + { + "epoch": 0.30149482827789503, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 34570 + }, + { + "epoch": 0.30150354956306363, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 34571 + }, + { + "epoch": 0.3015122708482322, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 34572 + }, + { + "epoch": 0.3015209921334008, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 34573 + }, + { + "epoch": 0.3015297134185694, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 34574 + }, + { + "epoch": 0.3015384347037379, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 34575 + }, + { + "epoch": 0.3015471559889065, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 34576 + }, + { + "epoch": 0.3015558772740751, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 34577 + }, + { + "epoch": 0.30156459855924367, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 34578 + }, + { + "epoch": 0.30157331984441227, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 34579 + }, + { + "epoch": 0.30158204112958087, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 34580 + }, + { + "epoch": 0.3015907624147494, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 34581 + }, + { + "epoch": 0.301599483699918, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 34582 + }, + { + "epoch": 0.3016082049850866, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 34583 + }, + { + "epoch": 0.30161692627025516, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 34584 + }, + { + "epoch": 0.30162564755542376, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 34585 + }, + { + "epoch": 0.30163436884059236, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 34586 + }, + { + "epoch": 0.30164309012576096, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 34587 + }, + { + "epoch": 0.3016518114109295, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 34588 + }, + { + "epoch": 0.3016605326960981, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 34589 + }, + { + "epoch": 0.3016692539812667, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 34590 + }, + { + "epoch": 0.30167797526643525, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 34591 + }, + { + "epoch": 0.30168669655160385, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 34592 + }, + { + "epoch": 0.30169541783677245, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 34593 + }, + { + "epoch": 0.301704139121941, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 34594 + }, + { + "epoch": 0.3017128604071096, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 34595 + }, + { + "epoch": 0.3017215816922782, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 34596 + }, + { + "epoch": 0.30173030297744674, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 34597 + }, + { + "epoch": 0.30173902426261534, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 34598 + }, + { + "epoch": 0.30174774554778394, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 34599 + }, + { + "epoch": 0.3017564668329525, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 34600 + }, + { + "epoch": 0.3017651881181211, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 34601 + }, + { + "epoch": 0.3017739094032897, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 34602 + }, + { + "epoch": 0.30178263068845823, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 34603 + }, + { + "epoch": 0.30179135197362683, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 34604 + }, + { + "epoch": 0.30180007325879543, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 34605 + }, + { + "epoch": 0.301808794543964, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 34606 + }, + { + "epoch": 0.3018175158291326, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 34607 + }, + { + "epoch": 0.3018262371143012, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 34608 + }, + { + "epoch": 0.3018349583994697, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 34609 + }, + { + "epoch": 0.3018436796846383, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 34610 + }, + { + "epoch": 0.3018524009698069, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 34611 + }, + { + "epoch": 0.30186112225497547, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 34612 + }, + { + "epoch": 0.30186984354014407, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 34613 + }, + { + "epoch": 0.30187856482531267, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 34614 + }, + { + "epoch": 0.30188728611048127, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 34615 + }, + { + "epoch": 0.3018960073956498, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 34616 + }, + { + "epoch": 0.3019047286808184, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 34617 + }, + { + "epoch": 0.301913449965987, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 34618 + }, + { + "epoch": 0.30192217125115556, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 34619 + }, + { + "epoch": 0.30193089253632416, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 34620 + }, + { + "epoch": 0.30193961382149276, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 34621 + }, + { + "epoch": 0.3019483351066613, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 34622 + }, + { + "epoch": 0.3019570563918299, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 34623 + }, + { + "epoch": 0.3019657776769985, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 34624 + }, + { + "epoch": 0.30197449896216705, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 34625 + }, + { + "epoch": 0.30198322024733565, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 34626 + }, + { + "epoch": 0.30199194153250425, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 34627 + }, + { + "epoch": 0.3020006628176728, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 34628 + }, + { + "epoch": 0.3020093841028414, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 34629 + }, + { + "epoch": 0.30201810538801, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 34630 + }, + { + "epoch": 0.30202682667317854, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 34631 + }, + { + "epoch": 0.30203554795834714, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 34632 + }, + { + "epoch": 0.30204426924351574, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 34633 + }, + { + "epoch": 0.3020529905286843, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 34634 + }, + { + "epoch": 0.3020617118138529, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 34635 + }, + { + "epoch": 0.3020704330990215, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 34636 + }, + { + "epoch": 0.30207915438419003, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 34637 + }, + { + "epoch": 0.30208787566935863, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 34638 + }, + { + "epoch": 0.30209659695452723, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 34639 + }, + { + "epoch": 0.3021053182396958, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 34640 + }, + { + "epoch": 0.3021140395248644, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 34641 + }, + { + "epoch": 0.302122760810033, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 34642 + }, + { + "epoch": 0.3021314820952016, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 34643 + }, + { + "epoch": 0.3021402033803701, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 34644 + }, + { + "epoch": 0.3021489246655387, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 34645 + }, + { + "epoch": 0.3021576459507073, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 34646 + }, + { + "epoch": 0.30216636723587587, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 34647 + }, + { + "epoch": 0.30217508852104447, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 34648 + }, + { + "epoch": 0.30218380980621307, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 34649 + }, + { + "epoch": 0.3021925310913816, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 34650 + }, + { + "epoch": 0.3022012523765502, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 34651 + }, + { + "epoch": 0.3022099736617188, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9832, + "step": 34652 + }, + { + "epoch": 0.30221869494688736, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 34653 + }, + { + "epoch": 0.30222741623205596, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 34654 + }, + { + "epoch": 0.30223613751722456, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 34655 + }, + { + "epoch": 0.3022448588023931, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 34656 + }, + { + "epoch": 0.3022535800875617, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 34657 + }, + { + "epoch": 0.3022623013727303, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 34658 + }, + { + "epoch": 0.30227102265789885, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 34659 + }, + { + "epoch": 0.30227974394306745, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 34660 + }, + { + "epoch": 0.30228846522823605, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 34661 + }, + { + "epoch": 0.3022971865134046, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 34662 + }, + { + "epoch": 0.3023059077985732, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 34663 + }, + { + "epoch": 0.3023146290837418, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 34664 + }, + { + "epoch": 0.30232335036891034, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 34665 + }, + { + "epoch": 0.30233207165407894, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 34666 + }, + { + "epoch": 0.30234079293924754, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 34667 + }, + { + "epoch": 0.3023495142244161, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 34668 + }, + { + "epoch": 0.3023582355095847, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 34669 + }, + { + "epoch": 0.3023669567947533, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 34670 + }, + { + "epoch": 0.3023756780799219, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 34671 + }, + { + "epoch": 0.30238439936509043, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 34672 + }, + { + "epoch": 0.30239312065025903, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 34673 + }, + { + "epoch": 0.30240184193542763, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 34674 + }, + { + "epoch": 0.3024105632205962, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 34675 + }, + { + "epoch": 0.3024192845057648, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 34676 + }, + { + "epoch": 0.3024280057909334, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 34677 + }, + { + "epoch": 0.3024367270761019, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 34678 + }, + { + "epoch": 0.3024454483612705, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 34679 + }, + { + "epoch": 0.3024541696464391, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 34680 + }, + { + "epoch": 0.30246289093160766, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 34681 + }, + { + "epoch": 0.30247161221677626, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 34682 + }, + { + "epoch": 0.30248033350194486, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 34683 + }, + { + "epoch": 0.3024890547871134, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 34684 + }, + { + "epoch": 0.302497776072282, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 34685 + }, + { + "epoch": 0.3025064973574506, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 34686 + }, + { + "epoch": 0.30251521864261915, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 34687 + }, + { + "epoch": 0.30252393992778775, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 34688 + }, + { + "epoch": 0.30253266121295636, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 34689 + }, + { + "epoch": 0.3025413824981249, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 34690 + }, + { + "epoch": 0.3025501037832935, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 34691 + }, + { + "epoch": 0.3025588250684621, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 34692 + }, + { + "epoch": 0.30256754635363065, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 34693 + }, + { + "epoch": 0.30257626763879925, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 34694 + }, + { + "epoch": 0.30258498892396785, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 34695 + }, + { + "epoch": 0.3025937102091364, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 34696 + }, + { + "epoch": 0.302602431494305, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 34697 + }, + { + "epoch": 0.3026111527794736, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 34698 + }, + { + "epoch": 0.3026198740646422, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 34699 + }, + { + "epoch": 0.30262859534981074, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 34700 + }, + { + "epoch": 0.30263731663497934, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 34701 + }, + { + "epoch": 0.30264603792014794, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 34702 + }, + { + "epoch": 0.3026547592053165, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9808, + "step": 34703 + }, + { + "epoch": 0.3026634804904851, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 34704 + }, + { + "epoch": 0.3026722017756537, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 34705 + }, + { + "epoch": 0.3026809230608222, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 34706 + }, + { + "epoch": 0.3026896443459908, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 34707 + }, + { + "epoch": 0.3026983656311594, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 34708 + }, + { + "epoch": 0.30270708691632797, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 34709 + }, + { + "epoch": 0.3027158082014966, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 34710 + }, + { + "epoch": 0.3027245294866652, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 34711 + }, + { + "epoch": 0.3027332507718337, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 34712 + }, + { + "epoch": 0.3027419720570023, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 34713 + }, + { + "epoch": 0.3027506933421709, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 34714 + }, + { + "epoch": 0.30275941462733946, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 34715 + }, + { + "epoch": 0.30276813591250806, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 34716 + }, + { + "epoch": 0.30277685719767666, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 34717 + }, + { + "epoch": 0.3027855784828452, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 34718 + }, + { + "epoch": 0.3027942997680138, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 34719 + }, + { + "epoch": 0.3028030210531824, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 34720 + }, + { + "epoch": 0.30281174233835095, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 34721 + }, + { + "epoch": 0.30282046362351955, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 34722 + }, + { + "epoch": 0.30282918490868815, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 34723 + }, + { + "epoch": 0.30283790619385675, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 34724 + }, + { + "epoch": 0.3028466274790253, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 34725 + }, + { + "epoch": 0.3028553487641939, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 34726 + }, + { + "epoch": 0.3028640700493625, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 34727 + }, + { + "epoch": 0.30287279133453104, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 34728 + }, + { + "epoch": 0.30288151261969964, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 34729 + }, + { + "epoch": 0.30289023390486824, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 34730 + }, + { + "epoch": 0.3028989551900368, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 34731 + }, + { + "epoch": 0.3029076764752054, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 34732 + }, + { + "epoch": 0.302916397760374, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 34733 + }, + { + "epoch": 0.30292511904554253, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 34734 + }, + { + "epoch": 0.30293384033071113, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 34735 + }, + { + "epoch": 0.30294256161587974, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 34736 + }, + { + "epoch": 0.3029512829010483, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 34737 + }, + { + "epoch": 0.3029600041862169, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 34738 + }, + { + "epoch": 0.3029687254713855, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 34739 + }, + { + "epoch": 0.302977446756554, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 34740 + }, + { + "epoch": 0.3029861680417226, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 34741 + }, + { + "epoch": 0.3029948893268912, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 34742 + }, + { + "epoch": 0.30300361061205977, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 34743 + }, + { + "epoch": 0.30301233189722837, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 34744 + }, + { + "epoch": 0.30302105318239697, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 34745 + }, + { + "epoch": 0.3030297744675655, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 34746 + }, + { + "epoch": 0.3030384957527341, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 34747 + }, + { + "epoch": 0.3030472170379027, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 34748 + }, + { + "epoch": 0.30305593832307126, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 34749 + }, + { + "epoch": 0.30306465960823986, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 34750 + }, + { + "epoch": 0.30307338089340846, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 34751 + }, + { + "epoch": 0.30308210217857706, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 34752 + }, + { + "epoch": 0.3030908234637456, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 34753 + }, + { + "epoch": 0.3030995447489142, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 34754 + }, + { + "epoch": 0.3031082660340828, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 34755 + }, + { + "epoch": 0.30311698731925135, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 34756 + }, + { + "epoch": 0.30312570860441995, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 34757 + }, + { + "epoch": 0.30313442988958855, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 34758 + }, + { + "epoch": 0.3031431511747571, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 34759 + }, + { + "epoch": 0.3031518724599257, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 34760 + }, + { + "epoch": 0.3031605937450943, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 34761 + }, + { + "epoch": 0.30316931503026284, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 34762 + }, + { + "epoch": 0.30317803631543144, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 34763 + }, + { + "epoch": 0.30318675760060004, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 34764 + }, + { + "epoch": 0.3031954788857686, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 34765 + }, + { + "epoch": 0.3032042001709372, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 34766 + }, + { + "epoch": 0.3032129214561058, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 34767 + }, + { + "epoch": 0.30322164274127433, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 34768 + }, + { + "epoch": 0.30323036402644293, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 34769 + }, + { + "epoch": 0.30323908531161153, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 34770 + }, + { + "epoch": 0.3032478065967801, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 34771 + }, + { + "epoch": 0.3032565278819487, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 34772 + }, + { + "epoch": 0.3032652491671173, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 34773 + }, + { + "epoch": 0.3032739704522858, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 34774 + }, + { + "epoch": 0.3032826917374544, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 34775 + }, + { + "epoch": 0.303291413022623, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 34776 + }, + { + "epoch": 0.30330013430779157, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 34777 + }, + { + "epoch": 0.30330885559296017, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 34778 + }, + { + "epoch": 0.30331757687812877, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 34779 + }, + { + "epoch": 0.30332629816329737, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 34780 + }, + { + "epoch": 0.3033350194484659, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 34781 + }, + { + "epoch": 0.3033437407336345, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 34782 + }, + { + "epoch": 0.3033524620188031, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 34783 + }, + { + "epoch": 0.30336118330397166, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 34784 + }, + { + "epoch": 0.30336990458914026, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 34785 + }, + { + "epoch": 0.30337862587430886, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 34786 + }, + { + "epoch": 0.3033873471594774, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 34787 + }, + { + "epoch": 0.303396068444646, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 34788 + }, + { + "epoch": 0.3034047897298146, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 34789 + }, + { + "epoch": 0.30341351101498315, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 34790 + }, + { + "epoch": 0.30342223230015175, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 34791 + }, + { + "epoch": 0.30343095358532035, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 34792 + }, + { + "epoch": 0.3034396748704889, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 34793 + }, + { + "epoch": 0.3034483961556575, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 34794 + }, + { + "epoch": 0.3034571174408261, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 34795 + }, + { + "epoch": 0.30346583872599464, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 34796 + }, + { + "epoch": 0.30347456001116324, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 34797 + }, + { + "epoch": 0.30348328129633184, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9835, + "step": 34798 + }, + { + "epoch": 0.3034920025815004, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 34799 + }, + { + "epoch": 0.303500723866669, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 34800 + }, + { + "epoch": 0.3035094451518376, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 34801 + }, + { + "epoch": 0.30351816643700613, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 34802 + }, + { + "epoch": 0.30352688772217473, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 34803 + }, + { + "epoch": 0.30353560900734333, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 34804 + }, + { + "epoch": 0.3035443302925119, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 34805 + }, + { + "epoch": 0.3035530515776805, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 34806 + }, + { + "epoch": 0.3035617728628491, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 34807 + }, + { + "epoch": 0.3035704941480177, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 34808 + }, + { + "epoch": 0.3035792154331862, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 34809 + }, + { + "epoch": 0.3035879367183548, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 34810 + }, + { + "epoch": 0.3035966580035234, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 34811 + }, + { + "epoch": 0.30360537928869197, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 34812 + }, + { + "epoch": 0.30361410057386057, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 34813 + }, + { + "epoch": 0.30362282185902917, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 34814 + }, + { + "epoch": 0.3036315431441977, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 34815 + }, + { + "epoch": 0.3036402644293663, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 34816 + }, + { + "epoch": 0.3036489857145349, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 34817 + }, + { + "epoch": 0.30365770699970346, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 34818 + }, + { + "epoch": 0.30366642828487206, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 34819 + }, + { + "epoch": 0.30367514957004066, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 34820 + }, + { + "epoch": 0.3036838708552092, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 34821 + }, + { + "epoch": 0.3036925921403778, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 34822 + }, + { + "epoch": 0.3037013134255464, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 34823 + }, + { + "epoch": 0.30371003471071495, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 34824 + }, + { + "epoch": 0.30371875599588355, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 34825 + }, + { + "epoch": 0.30372747728105215, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 34826 + }, + { + "epoch": 0.3037361985662207, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 34827 + }, + { + "epoch": 0.3037449198513893, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 34828 + }, + { + "epoch": 0.3037536411365579, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 34829 + }, + { + "epoch": 0.30376236242172644, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 34830 + }, + { + "epoch": 0.30377108370689504, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 34831 + }, + { + "epoch": 0.30377980499206364, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 34832 + }, + { + "epoch": 0.30378852627723224, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 34833 + }, + { + "epoch": 0.3037972475624008, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 34834 + }, + { + "epoch": 0.3038059688475694, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 34835 + }, + { + "epoch": 0.303814690132738, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 34836 + }, + { + "epoch": 0.30382341141790653, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 34837 + }, + { + "epoch": 0.30383213270307513, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 34838 + }, + { + "epoch": 0.30384085398824373, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 34839 + }, + { + "epoch": 0.3038495752734123, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 34840 + }, + { + "epoch": 0.3038582965585809, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 34841 + }, + { + "epoch": 0.3038670178437495, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 34842 + }, + { + "epoch": 0.303875739128918, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 34843 + }, + { + "epoch": 0.3038844604140866, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 34844 + }, + { + "epoch": 0.3038931816992552, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 34845 + }, + { + "epoch": 0.30390190298442377, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 34846 + }, + { + "epoch": 0.30391062426959237, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 34847 + }, + { + "epoch": 0.30391934555476097, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 34848 + }, + { + "epoch": 0.3039280668399295, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 34849 + }, + { + "epoch": 0.3039367881250981, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 34850 + }, + { + "epoch": 0.3039455094102667, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 34851 + }, + { + "epoch": 0.30395423069543526, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 34852 + }, + { + "epoch": 0.30396295198060386, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 34853 + }, + { + "epoch": 0.30397167326577246, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 34854 + }, + { + "epoch": 0.303980394550941, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 34855 + }, + { + "epoch": 0.3039891158361096, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 34856 + }, + { + "epoch": 0.3039978371212782, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 34857 + }, + { + "epoch": 0.30400655840644675, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 34858 + }, + { + "epoch": 0.30401527969161535, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 34859 + }, + { + "epoch": 0.30402400097678395, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 34860 + }, + { + "epoch": 0.30403272226195255, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 34861 + }, + { + "epoch": 0.3040414435471211, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 34862 + }, + { + "epoch": 0.3040501648322897, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 34863 + }, + { + "epoch": 0.3040588861174583, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 34864 + }, + { + "epoch": 0.30406760740262684, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 34865 + }, + { + "epoch": 0.30407632868779544, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 34866 + }, + { + "epoch": 0.30408504997296404, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 34867 + }, + { + "epoch": 0.3040937712581326, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 34868 + }, + { + "epoch": 0.3041024925433012, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 34869 + }, + { + "epoch": 0.3041112138284698, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 34870 + }, + { + "epoch": 0.30411993511363833, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 34871 + }, + { + "epoch": 0.30412865639880693, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 34872 + }, + { + "epoch": 0.30413737768397553, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 34873 + }, + { + "epoch": 0.3041460989691441, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 34874 + }, + { + "epoch": 0.3041548202543127, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 34875 + }, + { + "epoch": 0.3041635415394813, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 34876 + }, + { + "epoch": 0.3041722628246498, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 34877 + }, + { + "epoch": 0.3041809841098184, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 34878 + }, + { + "epoch": 0.304189705394987, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 34879 + }, + { + "epoch": 0.30419842668015556, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 34880 + }, + { + "epoch": 0.30420714796532417, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 34881 + }, + { + "epoch": 0.30421586925049277, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 34882 + }, + { + "epoch": 0.3042245905356613, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 34883 + }, + { + "epoch": 0.3042333118208299, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 34884 + }, + { + "epoch": 0.3042420331059985, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 34885 + }, + { + "epoch": 0.30425075439116706, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 34886 + }, + { + "epoch": 0.30425947567633566, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 34887 + }, + { + "epoch": 0.30426819696150426, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 34888 + }, + { + "epoch": 0.30427691824667286, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 34889 + }, + { + "epoch": 0.3042856395318414, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 34890 + }, + { + "epoch": 0.30429436081701, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 34891 + }, + { + "epoch": 0.3043030821021786, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 34892 + }, + { + "epoch": 0.30431180338734715, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 34893 + }, + { + "epoch": 0.30432052467251575, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 34894 + }, + { + "epoch": 0.30432924595768435, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 34895 + }, + { + "epoch": 0.3043379672428529, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 34896 + }, + { + "epoch": 0.3043466885280215, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 34897 + }, + { + "epoch": 0.3043554098131901, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 34898 + }, + { + "epoch": 0.30436413109835864, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 34899 + }, + { + "epoch": 0.30437285238352724, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 34900 + }, + { + "epoch": 0.30438157366869584, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 34901 + }, + { + "epoch": 0.3043902949538644, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 34902 + }, + { + "epoch": 0.304399016239033, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 34903 + }, + { + "epoch": 0.3044077375242016, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 34904 + }, + { + "epoch": 0.3044164588093701, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 34905 + }, + { + "epoch": 0.3044251800945387, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 34906 + }, + { + "epoch": 0.30443390137970733, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 34907 + }, + { + "epoch": 0.3044426226648759, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 34908 + }, + { + "epoch": 0.3044513439500445, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 34909 + }, + { + "epoch": 0.3044600652352131, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 34910 + }, + { + "epoch": 0.3044687865203816, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 34911 + }, + { + "epoch": 0.3044775078055502, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 34912 + }, + { + "epoch": 0.3044862290907188, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 34913 + }, + { + "epoch": 0.30449495037588736, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 34914 + }, + { + "epoch": 0.30450367166105596, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 34915 + }, + { + "epoch": 0.30451239294622456, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 34916 + }, + { + "epoch": 0.30452111423139316, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 34917 + }, + { + "epoch": 0.3045298355165617, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 34918 + }, + { + "epoch": 0.3045385568017303, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 34919 + }, + { + "epoch": 0.3045472780868989, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 34920 + }, + { + "epoch": 0.30455599937206745, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 34921 + }, + { + "epoch": 0.30456472065723605, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 34922 + }, + { + "epoch": 0.30457344194240465, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 34923 + }, + { + "epoch": 0.3045821632275732, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9873, + "step": 34924 + }, + { + "epoch": 0.3045908845127418, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 34925 + }, + { + "epoch": 0.3045996057979104, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 34926 + }, + { + "epoch": 0.30460832708307894, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 34927 + }, + { + "epoch": 0.30461704836824754, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 34928 + }, + { + "epoch": 0.30462576965341615, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 34929 + }, + { + "epoch": 0.3046344909385847, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 34930 + }, + { + "epoch": 0.3046432122237533, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 34931 + }, + { + "epoch": 0.3046519335089219, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 34932 + }, + { + "epoch": 0.30466065479409044, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 34933 + }, + { + "epoch": 0.30466937607925904, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 34934 + }, + { + "epoch": 0.30467809736442764, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 34935 + }, + { + "epoch": 0.3046868186495962, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 34936 + }, + { + "epoch": 0.3046955399347648, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 34937 + }, + { + "epoch": 0.3047042612199334, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 34938 + }, + { + "epoch": 0.3047129825051019, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 34939 + }, + { + "epoch": 0.3047217037902705, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 34940 + }, + { + "epoch": 0.3047304250754391, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 34941 + }, + { + "epoch": 0.3047391463606077, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 34942 + }, + { + "epoch": 0.30474786764577627, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 34943 + }, + { + "epoch": 0.30475658893094487, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 34944 + }, + { + "epoch": 0.30476531021611347, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 34945 + }, + { + "epoch": 0.304774031501282, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 34946 + }, + { + "epoch": 0.3047827527864506, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 34947 + }, + { + "epoch": 0.3047914740716192, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 34948 + }, + { + "epoch": 0.30480019535678776, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 34949 + }, + { + "epoch": 0.30480891664195636, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 34950 + }, + { + "epoch": 0.30481763792712496, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 34951 + }, + { + "epoch": 0.3048263592122935, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 34952 + }, + { + "epoch": 0.3048350804974621, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 34953 + }, + { + "epoch": 0.3048438017826307, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 34954 + }, + { + "epoch": 0.30485252306779925, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 34955 + }, + { + "epoch": 0.30486124435296785, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 34956 + }, + { + "epoch": 0.30486996563813645, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 34957 + }, + { + "epoch": 0.304878686923305, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 34958 + }, + { + "epoch": 0.3048874082084736, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 34959 + }, + { + "epoch": 0.3048961294936422, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 34960 + }, + { + "epoch": 0.30490485077881074, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 34961 + }, + { + "epoch": 0.30491357206397934, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 34962 + }, + { + "epoch": 0.30492229334914794, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 34963 + }, + { + "epoch": 0.3049310146343165, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 34964 + }, + { + "epoch": 0.3049397359194851, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 34965 + }, + { + "epoch": 0.3049484572046537, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 34966 + }, + { + "epoch": 0.30495717848982223, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 34967 + }, + { + "epoch": 0.30496589977499083, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 34968 + }, + { + "epoch": 0.30497462106015943, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 34969 + }, + { + "epoch": 0.30498334234532803, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 34970 + }, + { + "epoch": 0.3049920636304966, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 34971 + }, + { + "epoch": 0.3050007849156652, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 34972 + }, + { + "epoch": 0.3050095062008338, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 34973 + }, + { + "epoch": 0.3050182274860023, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 34974 + }, + { + "epoch": 0.3050269487711709, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 34975 + }, + { + "epoch": 0.3050356700563395, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 34976 + }, + { + "epoch": 0.30504439134150807, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 34977 + }, + { + "epoch": 0.30505311262667667, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 34978 + }, + { + "epoch": 0.30506183391184527, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 34979 + }, + { + "epoch": 0.3050705551970138, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 34980 + }, + { + "epoch": 0.3050792764821824, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 34981 + }, + { + "epoch": 0.305087997767351, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 34982 + }, + { + "epoch": 0.30509671905251956, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 34983 + }, + { + "epoch": 0.30510544033768816, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 34984 + }, + { + "epoch": 0.30511416162285676, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 34985 + }, + { + "epoch": 0.3051228829080253, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 34986 + }, + { + "epoch": 0.3051316041931939, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 34987 + }, + { + "epoch": 0.3051403254783625, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 34988 + }, + { + "epoch": 0.30514904676353105, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 34989 + }, + { + "epoch": 0.30515776804869965, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 34990 + }, + { + "epoch": 0.30516648933386825, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 34991 + }, + { + "epoch": 0.3051752106190368, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 34992 + }, + { + "epoch": 0.3051839319042054, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 34993 + }, + { + "epoch": 0.305192653189374, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 34994 + }, + { + "epoch": 0.30520137447454254, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 34995 + }, + { + "epoch": 0.30521009575971114, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 34996 + }, + { + "epoch": 0.30521881704487974, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 34997 + }, + { + "epoch": 0.30522753833004834, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 34998 + }, + { + "epoch": 0.3052362596152169, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 34999 + }, + { + "epoch": 0.3052449809003855, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 35000 + }, + { + "epoch": 0.3052537021855541, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 35001 + }, + { + "epoch": 0.30526242347072263, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 35002 + }, + { + "epoch": 0.30527114475589123, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 35003 + }, + { + "epoch": 0.30527986604105983, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 35004 + }, + { + "epoch": 0.3052885873262284, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 35005 + }, + { + "epoch": 0.305297308611397, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 35006 + }, + { + "epoch": 0.3053060298965656, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 35007 + }, + { + "epoch": 0.3053147511817341, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 35008 + }, + { + "epoch": 0.3053234724669027, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 35009 + }, + { + "epoch": 0.3053321937520713, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 35010 + }, + { + "epoch": 0.30534091503723987, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 35011 + }, + { + "epoch": 0.30534963632240847, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 35012 + }, + { + "epoch": 0.30535835760757707, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 35013 + }, + { + "epoch": 0.3053670788927456, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 35014 + }, + { + "epoch": 0.3053758001779142, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 35015 + }, + { + "epoch": 0.3053845214630828, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 35016 + }, + { + "epoch": 0.30539324274825136, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 35017 + }, + { + "epoch": 0.30540196403341996, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 35018 + }, + { + "epoch": 0.30541068531858856, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 35019 + }, + { + "epoch": 0.3054194066037571, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 35020 + }, + { + "epoch": 0.3054281278889257, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 35021 + }, + { + "epoch": 0.3054368491740943, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 35022 + }, + { + "epoch": 0.30544557045926285, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 35023 + }, + { + "epoch": 0.30545429174443145, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 35024 + }, + { + "epoch": 0.30546301302960005, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 35025 + }, + { + "epoch": 0.30547173431476865, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 35026 + }, + { + "epoch": 0.3054804555999372, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 35027 + }, + { + "epoch": 0.3054891768851058, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 35028 + }, + { + "epoch": 0.3054978981702744, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 35029 + }, + { + "epoch": 0.30550661945544294, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 35030 + }, + { + "epoch": 0.30551534074061154, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 35031 + }, + { + "epoch": 0.30552406202578014, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 35032 + }, + { + "epoch": 0.3055327833109487, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 35033 + }, + { + "epoch": 0.3055415045961173, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 35034 + }, + { + "epoch": 0.3055502258812859, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 35035 + }, + { + "epoch": 0.30555894716645443, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 35036 + }, + { + "epoch": 0.30556766845162303, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 35037 + }, + { + "epoch": 0.30557638973679163, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 35038 + }, + { + "epoch": 0.3055851110219602, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 35039 + }, + { + "epoch": 0.3055938323071288, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 35040 + }, + { + "epoch": 0.3056025535922974, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 35041 + }, + { + "epoch": 0.3056112748774659, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 35042 + }, + { + "epoch": 0.3056199961626345, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 35043 + }, + { + "epoch": 0.3056287174478031, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 35044 + }, + { + "epoch": 0.30563743873297167, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 35045 + }, + { + "epoch": 0.30564616001814027, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 35046 + }, + { + "epoch": 0.30565488130330887, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 35047 + }, + { + "epoch": 0.3056636025884774, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 35048 + }, + { + "epoch": 0.305672323873646, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 35049 + }, + { + "epoch": 0.3056810451588146, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 35050 + }, + { + "epoch": 0.3056897664439832, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 35051 + }, + { + "epoch": 0.30569848772915176, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 35052 + }, + { + "epoch": 0.30570720901432036, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 35053 + }, + { + "epoch": 0.30571593029948896, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 35054 + }, + { + "epoch": 0.3057246515846575, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 35055 + }, + { + "epoch": 0.3057333728698261, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9805, + "step": 35056 + }, + { + "epoch": 0.3057420941549947, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 35057 + }, + { + "epoch": 0.30575081544016325, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 35058 + }, + { + "epoch": 0.30575953672533185, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9702, + "step": 35059 + }, + { + "epoch": 0.30576825801050045, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 35060 + }, + { + "epoch": 0.305776979295669, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 35061 + }, + { + "epoch": 0.3057857005808376, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 35062 + }, + { + "epoch": 0.3057944218660062, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 35063 + }, + { + "epoch": 0.30580314315117474, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 35064 + }, + { + "epoch": 0.30581186443634334, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 35065 + }, + { + "epoch": 0.30582058572151194, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 35066 + }, + { + "epoch": 0.3058293070066805, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 35067 + }, + { + "epoch": 0.3058380282918491, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 35068 + }, + { + "epoch": 0.3058467495770177, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 35069 + }, + { + "epoch": 0.30585547086218623, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 35070 + }, + { + "epoch": 0.30586419214735483, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 35071 + }, + { + "epoch": 0.30587291343252343, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 35072 + }, + { + "epoch": 0.305881634717692, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 35073 + }, + { + "epoch": 0.3058903560028606, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 35074 + }, + { + "epoch": 0.3058990772880292, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 35075 + }, + { + "epoch": 0.3059077985731977, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 35076 + }, + { + "epoch": 0.3059165198583663, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 35077 + }, + { + "epoch": 0.3059252411435349, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 35078 + }, + { + "epoch": 0.3059339624287035, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 35079 + }, + { + "epoch": 0.30594268371387207, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 35080 + }, + { + "epoch": 0.30595140499904067, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 35081 + }, + { + "epoch": 0.30596012628420927, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 35082 + }, + { + "epoch": 0.3059688475693778, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 35083 + }, + { + "epoch": 0.3059775688545464, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 35084 + }, + { + "epoch": 0.305986290139715, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 35085 + }, + { + "epoch": 0.30599501142488356, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 35086 + }, + { + "epoch": 0.30600373271005216, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 35087 + }, + { + "epoch": 0.30601245399522076, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 35088 + }, + { + "epoch": 0.3060211752803893, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 35089 + }, + { + "epoch": 0.3060298965655579, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 35090 + }, + { + "epoch": 0.3060386178507265, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 35091 + }, + { + "epoch": 0.30604733913589505, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 35092 + }, + { + "epoch": 0.30605606042106365, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 35093 + }, + { + "epoch": 0.30606478170623225, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 35094 + }, + { + "epoch": 0.3060735029914008, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 35095 + }, + { + "epoch": 0.3060822242765694, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 35096 + }, + { + "epoch": 0.306090945561738, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 35097 + }, + { + "epoch": 0.30609966684690654, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 35098 + }, + { + "epoch": 0.30610838813207514, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 35099 + }, + { + "epoch": 0.30611710941724374, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 35100 + }, + { + "epoch": 0.3061258307024123, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 35101 + }, + { + "epoch": 0.3061345519875809, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 35102 + }, + { + "epoch": 0.3061432732727495, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 35103 + }, + { + "epoch": 0.30615199455791803, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 35104 + }, + { + "epoch": 0.30616071584308663, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 35105 + }, + { + "epoch": 0.30616943712825523, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 35106 + }, + { + "epoch": 0.30617815841342383, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 35107 + }, + { + "epoch": 0.3061868796985924, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 35108 + }, + { + "epoch": 0.306195600983761, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 35109 + }, + { + "epoch": 0.3062043222689296, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 35110 + }, + { + "epoch": 0.3062130435540981, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 35111 + }, + { + "epoch": 0.3062217648392667, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 35112 + }, + { + "epoch": 0.3062304861244353, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 35113 + }, + { + "epoch": 0.30623920740960386, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 35114 + }, + { + "epoch": 0.30624792869477246, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 35115 + }, + { + "epoch": 0.30625664997994106, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 35116 + }, + { + "epoch": 0.3062653712651096, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 35117 + }, + { + "epoch": 0.3062740925502782, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 35118 + }, + { + "epoch": 0.3062828138354468, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 35119 + }, + { + "epoch": 0.30629153512061535, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 35120 + }, + { + "epoch": 0.30630025640578395, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 35121 + }, + { + "epoch": 0.30630897769095256, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 35122 + }, + { + "epoch": 0.3063176989761211, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 35123 + }, + { + "epoch": 0.3063264202612897, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 35124 + }, + { + "epoch": 0.3063351415464583, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 35125 + }, + { + "epoch": 0.30634386283162685, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 35126 + }, + { + "epoch": 0.30635258411679545, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 35127 + }, + { + "epoch": 0.30636130540196405, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 35128 + }, + { + "epoch": 0.3063700266871326, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 35129 + }, + { + "epoch": 0.3063787479723012, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 35130 + }, + { + "epoch": 0.3063874692574698, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 35131 + }, + { + "epoch": 0.30639619054263834, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9864, + "step": 35132 + }, + { + "epoch": 0.30640491182780694, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 35133 + }, + { + "epoch": 0.30641363311297554, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 35134 + }, + { + "epoch": 0.30642235439814414, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 35135 + }, + { + "epoch": 0.3064310756833127, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 35136 + }, + { + "epoch": 0.3064397969684813, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 35137 + }, + { + "epoch": 0.3064485182536499, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 35138 + }, + { + "epoch": 0.3064572395388184, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 35139 + }, + { + "epoch": 0.306465960823987, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 35140 + }, + { + "epoch": 0.3064746821091556, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 35141 + }, + { + "epoch": 0.30648340339432417, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 35142 + }, + { + "epoch": 0.3064921246794928, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 35143 + }, + { + "epoch": 0.3065008459646614, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 35144 + }, + { + "epoch": 0.3065095672498299, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 35145 + }, + { + "epoch": 0.3065182885349985, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 35146 + }, + { + "epoch": 0.3065270098201671, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 35147 + }, + { + "epoch": 0.30653573110533566, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 35148 + }, + { + "epoch": 0.30654445239050426, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 35149 + }, + { + "epoch": 0.30655317367567286, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 35150 + }, + { + "epoch": 0.3065618949608414, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 35151 + }, + { + "epoch": 0.30657061624601, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 35152 + }, + { + "epoch": 0.3065793375311786, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 35153 + }, + { + "epoch": 0.30658805881634715, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 35154 + }, + { + "epoch": 0.30659678010151575, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 35155 + }, + { + "epoch": 0.30660550138668435, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 35156 + }, + { + "epoch": 0.3066142226718529, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 35157 + }, + { + "epoch": 0.3066229439570215, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 35158 + }, + { + "epoch": 0.3066316652421901, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 35159 + }, + { + "epoch": 0.3066403865273587, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 35160 + }, + { + "epoch": 0.30664910781252724, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 35161 + }, + { + "epoch": 0.30665782909769584, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 35162 + }, + { + "epoch": 0.30666655038286444, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 35163 + }, + { + "epoch": 0.306675271668033, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 35164 + }, + { + "epoch": 0.3066839929532016, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 35165 + }, + { + "epoch": 0.3066927142383702, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 35166 + }, + { + "epoch": 0.30670143552353873, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 35167 + }, + { + "epoch": 0.30671015680870733, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 35168 + }, + { + "epoch": 0.30671887809387594, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 35169 + }, + { + "epoch": 0.3067275993790445, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 35170 + }, + { + "epoch": 0.3067363206642131, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 35171 + }, + { + "epoch": 0.3067450419493817, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 35172 + }, + { + "epoch": 0.3067537632345502, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 35173 + }, + { + "epoch": 0.3067624845197188, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 35174 + }, + { + "epoch": 0.3067712058048874, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 35175 + }, + { + "epoch": 0.30677992709005597, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 35176 + }, + { + "epoch": 0.30678864837522457, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 35177 + }, + { + "epoch": 0.30679736966039317, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 35178 + }, + { + "epoch": 0.3068060909455617, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 35179 + }, + { + "epoch": 0.3068148122307303, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 35180 + }, + { + "epoch": 0.3068235335158989, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 35181 + }, + { + "epoch": 0.30683225480106746, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 35182 + }, + { + "epoch": 0.30684097608623606, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 35183 + }, + { + "epoch": 0.30684969737140466, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 35184 + }, + { + "epoch": 0.3068584186565732, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 35185 + }, + { + "epoch": 0.3068671399417418, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 35186 + }, + { + "epoch": 0.3068758612269104, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 35187 + }, + { + "epoch": 0.306884582512079, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 35188 + }, + { + "epoch": 0.30689330379724755, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 35189 + }, + { + "epoch": 0.30690202508241615, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 35190 + }, + { + "epoch": 0.30691074636758475, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 35191 + }, + { + "epoch": 0.3069194676527533, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 35192 + }, + { + "epoch": 0.3069281889379219, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 35193 + }, + { + "epoch": 0.3069369102230905, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 35194 + }, + { + "epoch": 0.30694563150825904, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 35195 + }, + { + "epoch": 0.30695435279342764, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 35196 + }, + { + "epoch": 0.30696307407859624, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 35197 + }, + { + "epoch": 0.3069717953637648, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 35198 + }, + { + "epoch": 0.3069805166489334, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 35199 + }, + { + "epoch": 0.306989237934102, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 35200 + }, + { + "epoch": 0.30699795921927053, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 35201 + }, + { + "epoch": 0.30700668050443913, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 35202 + }, + { + "epoch": 0.30701540178960773, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 35203 + }, + { + "epoch": 0.3070241230747763, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 35204 + }, + { + "epoch": 0.3070328443599449, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 35205 + }, + { + "epoch": 0.3070415656451135, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 35206 + }, + { + "epoch": 0.307050286930282, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 35207 + }, + { + "epoch": 0.3070590082154506, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 35208 + }, + { + "epoch": 0.3070677295006192, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 35209 + }, + { + "epoch": 0.30707645078578777, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 35210 + }, + { + "epoch": 0.30708517207095637, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 35211 + }, + { + "epoch": 0.30709389335612497, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 35212 + }, + { + "epoch": 0.3071026146412935, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 35213 + }, + { + "epoch": 0.3071113359264621, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 35214 + }, + { + "epoch": 0.3071200572116307, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 35215 + }, + { + "epoch": 0.3071287784967993, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 35216 + }, + { + "epoch": 0.30713749978196786, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 35217 + }, + { + "epoch": 0.30714622106713646, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 35218 + }, + { + "epoch": 0.30715494235230506, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 35219 + }, + { + "epoch": 0.3071636636374736, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 35220 + }, + { + "epoch": 0.3071723849226422, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 35221 + }, + { + "epoch": 0.3071811062078108, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 35222 + }, + { + "epoch": 0.30718982749297935, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 35223 + }, + { + "epoch": 0.30719854877814795, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 35224 + }, + { + "epoch": 0.30720727006331655, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 35225 + }, + { + "epoch": 0.3072159913484851, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 35226 + }, + { + "epoch": 0.3072247126336537, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 35227 + }, + { + "epoch": 0.3072334339188223, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 35228 + }, + { + "epoch": 0.30724215520399084, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 35229 + }, + { + "epoch": 0.30725087648915944, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 35230 + }, + { + "epoch": 0.30725959777432804, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 35231 + }, + { + "epoch": 0.3072683190594966, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 35232 + }, + { + "epoch": 0.3072770403446652, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 35233 + }, + { + "epoch": 0.3072857616298338, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 35234 + }, + { + "epoch": 0.30729448291500233, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 35235 + }, + { + "epoch": 0.30730320420017093, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 35236 + }, + { + "epoch": 0.30731192548533953, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 35237 + }, + { + "epoch": 0.3073206467705081, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9863, + "step": 35238 + }, + { + "epoch": 0.3073293680556767, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 35239 + }, + { + "epoch": 0.3073380893408453, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 35240 + }, + { + "epoch": 0.3073468106260138, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 35241 + }, + { + "epoch": 0.3073555319111824, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 35242 + }, + { + "epoch": 0.307364253196351, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 35243 + }, + { + "epoch": 0.3073729744815196, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 35244 + }, + { + "epoch": 0.30738169576668817, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 35245 + }, + { + "epoch": 0.30739041705185677, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 35246 + }, + { + "epoch": 0.30739913833702537, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9785, + "step": 35247 + }, + { + "epoch": 0.3074078596221939, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 35248 + }, + { + "epoch": 0.3074165809073625, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 35249 + }, + { + "epoch": 0.3074253021925311, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 35250 + }, + { + "epoch": 0.30743402347769966, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 35251 + }, + { + "epoch": 0.30744274476286826, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 35252 + }, + { + "epoch": 0.30745146604803686, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 35253 + }, + { + "epoch": 0.3074601873332054, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 35254 + }, + { + "epoch": 0.307468908618374, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 35255 + }, + { + "epoch": 0.3074776299035426, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 35256 + }, + { + "epoch": 0.30748635118871115, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 35257 + }, + { + "epoch": 0.30749507247387975, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 35258 + }, + { + "epoch": 0.30750379375904835, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 35259 + }, + { + "epoch": 0.3075125150442169, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 35260 + }, + { + "epoch": 0.3075212363293855, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 35261 + }, + { + "epoch": 0.3075299576145541, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 35262 + }, + { + "epoch": 0.30753867889972264, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 35263 + }, + { + "epoch": 0.30754740018489124, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 35264 + }, + { + "epoch": 0.30755612147005984, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 35265 + }, + { + "epoch": 0.3075648427552284, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 35266 + }, + { + "epoch": 0.307573564040397, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 35267 + }, + { + "epoch": 0.3075822853255656, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 35268 + }, + { + "epoch": 0.30759100661073413, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 35269 + }, + { + "epoch": 0.30759972789590273, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 35270 + }, + { + "epoch": 0.30760844918107133, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 35271 + }, + { + "epoch": 0.30761717046623993, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 35272 + }, + { + "epoch": 0.3076258917514085, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 35273 + }, + { + "epoch": 0.3076346130365771, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 35274 + }, + { + "epoch": 0.3076433343217457, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 35275 + }, + { + "epoch": 0.3076520556069142, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 35276 + }, + { + "epoch": 0.3076607768920828, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9843, + "step": 35277 + }, + { + "epoch": 0.3076694981772514, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 35278 + }, + { + "epoch": 0.30767821946241997, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 35279 + }, + { + "epoch": 0.30768694074758857, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 35280 + }, + { + "epoch": 0.30769566203275717, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 35281 + }, + { + "epoch": 0.3077043833179257, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 35282 + }, + { + "epoch": 0.3077131046030943, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 35283 + }, + { + "epoch": 0.3077218258882629, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 35284 + }, + { + "epoch": 0.30773054717343146, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 35285 + }, + { + "epoch": 0.30773926845860006, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 35286 + }, + { + "epoch": 0.30774798974376866, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 35287 + }, + { + "epoch": 0.3077567110289372, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 35288 + }, + { + "epoch": 0.3077654323141058, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 35289 + }, + { + "epoch": 0.3077741535992744, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 35290 + }, + { + "epoch": 0.30778287488444295, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 35291 + }, + { + "epoch": 0.30779159616961155, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 35292 + }, + { + "epoch": 0.30780031745478015, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 35293 + }, + { + "epoch": 0.3078090387399487, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 35294 + }, + { + "epoch": 0.3078177600251173, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 35295 + }, + { + "epoch": 0.3078264813102859, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 35296 + }, + { + "epoch": 0.3078352025954545, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 35297 + }, + { + "epoch": 0.30784392388062304, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 35298 + }, + { + "epoch": 0.30785264516579164, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 35299 + }, + { + "epoch": 0.30786136645096024, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 35300 + }, + { + "epoch": 0.3078700877361288, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 35301 + }, + { + "epoch": 0.3078788090212974, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 35302 + }, + { + "epoch": 0.307887530306466, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 35303 + }, + { + "epoch": 0.30789625159163453, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 35304 + }, + { + "epoch": 0.30790497287680313, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 35305 + }, + { + "epoch": 0.30791369416197173, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 35306 + }, + { + "epoch": 0.3079224154471403, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 35307 + }, + { + "epoch": 0.3079311367323089, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 35308 + }, + { + "epoch": 0.3079398580174775, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 35309 + }, + { + "epoch": 0.307948579302646, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 35310 + }, + { + "epoch": 0.3079573005878146, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 35311 + }, + { + "epoch": 0.3079660218729832, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 35312 + }, + { + "epoch": 0.30797474315815176, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 35313 + }, + { + "epoch": 0.30798346444332036, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 35314 + }, + { + "epoch": 0.30799218572848897, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 35315 + }, + { + "epoch": 0.3080009070136575, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 35316 + }, + { + "epoch": 0.3080096282988261, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 35317 + }, + { + "epoch": 0.3080183495839947, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 35318 + }, + { + "epoch": 0.30802707086916326, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 35319 + }, + { + "epoch": 0.30803579215433186, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 35320 + }, + { + "epoch": 0.30804451343950046, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 35321 + }, + { + "epoch": 0.308053234724669, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 35322 + }, + { + "epoch": 0.3080619560098376, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 35323 + }, + { + "epoch": 0.3080706772950062, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 35324 + }, + { + "epoch": 0.3080793985801748, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 35325 + }, + { + "epoch": 0.30808811986534335, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 35326 + }, + { + "epoch": 0.30809684115051195, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 35327 + }, + { + "epoch": 0.30810556243568055, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 35328 + }, + { + "epoch": 0.3081142837208491, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 35329 + }, + { + "epoch": 0.3081230050060177, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 35330 + }, + { + "epoch": 0.3081317262911863, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 35331 + }, + { + "epoch": 0.30814044757635484, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 35332 + }, + { + "epoch": 0.30814916886152344, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 35333 + }, + { + "epoch": 0.30815789014669204, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 35334 + }, + { + "epoch": 0.3081666114318606, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 35335 + }, + { + "epoch": 0.3081753327170292, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 35336 + }, + { + "epoch": 0.3081840540021978, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 35337 + }, + { + "epoch": 0.3081927752873663, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 35338 + }, + { + "epoch": 0.3082014965725349, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 35339 + }, + { + "epoch": 0.30821021785770353, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 35340 + }, + { + "epoch": 0.3082189391428721, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9778, + "step": 35341 + }, + { + "epoch": 0.3082276604280407, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 35342 + }, + { + "epoch": 0.3082363817132093, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 35343 + }, + { + "epoch": 0.3082451029983778, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 35344 + }, + { + "epoch": 0.3082538242835464, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 35345 + }, + { + "epoch": 0.308262545568715, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 35346 + }, + { + "epoch": 0.30827126685388356, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 35347 + }, + { + "epoch": 0.30827998813905216, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 35348 + }, + { + "epoch": 0.30828870942422076, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 35349 + }, + { + "epoch": 0.3082974307093893, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 35350 + }, + { + "epoch": 0.3083061519945579, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 35351 + }, + { + "epoch": 0.3083148732797265, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 35352 + }, + { + "epoch": 0.3083235945648951, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 35353 + }, + { + "epoch": 0.30833231585006365, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 35354 + }, + { + "epoch": 0.30834103713523225, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 35355 + }, + { + "epoch": 0.30834975842040085, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9864, + "step": 35356 + }, + { + "epoch": 0.3083584797055694, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9827, + "step": 35357 + }, + { + "epoch": 0.308367200990738, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 35358 + }, + { + "epoch": 0.3083759222759066, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 35359 + }, + { + "epoch": 0.30838464356107514, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 35360 + }, + { + "epoch": 0.30839336484624374, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 35361 + }, + { + "epoch": 0.30840208613141235, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 35362 + }, + { + "epoch": 0.3084108074165809, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 35363 + }, + { + "epoch": 0.3084195287017495, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 35364 + }, + { + "epoch": 0.3084282499869181, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 35365 + }, + { + "epoch": 0.30843697127208664, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 35366 + }, + { + "epoch": 0.30844569255725524, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 35367 + }, + { + "epoch": 0.30845441384242384, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 35368 + }, + { + "epoch": 0.3084631351275924, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 35369 + }, + { + "epoch": 0.308471856412761, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 35370 + }, + { + "epoch": 0.3084805776979296, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 35371 + }, + { + "epoch": 0.3084892989830981, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 35372 + }, + { + "epoch": 0.3084980202682667, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 35373 + }, + { + "epoch": 0.3085067415534353, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 35374 + }, + { + "epoch": 0.30851546283860387, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 35375 + }, + { + "epoch": 0.30852418412377247, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 35376 + }, + { + "epoch": 0.30853290540894107, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 35377 + }, + { + "epoch": 0.3085416266941096, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 35378 + }, + { + "epoch": 0.3085503479792782, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 35379 + }, + { + "epoch": 0.3085590692644468, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 35380 + }, + { + "epoch": 0.3085677905496154, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 35381 + }, + { + "epoch": 0.30857651183478396, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 35382 + }, + { + "epoch": 0.30858523311995256, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 35383 + }, + { + "epoch": 0.30859395440512116, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 35384 + }, + { + "epoch": 0.3086026756902897, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 35385 + }, + { + "epoch": 0.3086113969754583, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 35386 + }, + { + "epoch": 0.3086201182606269, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 35387 + }, + { + "epoch": 0.30862883954579545, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 35388 + }, + { + "epoch": 0.30863756083096405, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 35389 + }, + { + "epoch": 0.30864628211613265, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 35390 + }, + { + "epoch": 0.3086550034013012, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 35391 + }, + { + "epoch": 0.3086637246864698, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 35392 + }, + { + "epoch": 0.3086724459716384, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 35393 + }, + { + "epoch": 0.30868116725680694, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 35394 + }, + { + "epoch": 0.30868988854197554, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 35395 + }, + { + "epoch": 0.30869860982714414, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 35396 + }, + { + "epoch": 0.3087073311123127, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 35397 + }, + { + "epoch": 0.3087160523974813, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 35398 + }, + { + "epoch": 0.3087247736826499, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 35399 + }, + { + "epoch": 0.30873349496781843, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 35400 + }, + { + "epoch": 0.30874221625298703, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 35401 + }, + { + "epoch": 0.30875093753815563, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 35402 + }, + { + "epoch": 0.3087596588233242, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 35403 + }, + { + "epoch": 0.3087683801084928, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 35404 + }, + { + "epoch": 0.3087771013936614, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 35405 + }, + { + "epoch": 0.30878582267883, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 35406 + }, + { + "epoch": 0.3087945439639985, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 35407 + }, + { + "epoch": 0.3088032652491671, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 35408 + }, + { + "epoch": 0.3088119865343357, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 35409 + }, + { + "epoch": 0.30882070781950427, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 35410 + }, + { + "epoch": 0.30882942910467287, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 35411 + }, + { + "epoch": 0.30883815038984147, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 35412 + }, + { + "epoch": 0.30884687167501, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 35413 + }, + { + "epoch": 0.3088555929601786, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 35414 + }, + { + "epoch": 0.3088643142453472, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 35415 + }, + { + "epoch": 0.30887303553051576, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9818, + "step": 35416 + }, + { + "epoch": 0.30888175681568436, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 35417 + }, + { + "epoch": 0.30889047810085296, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 35418 + }, + { + "epoch": 0.3088991993860215, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 35419 + }, + { + "epoch": 0.3089079206711901, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 35420 + }, + { + "epoch": 0.3089166419563587, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 35421 + }, + { + "epoch": 0.30892536324152725, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 35422 + }, + { + "epoch": 0.30893408452669585, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 35423 + }, + { + "epoch": 0.30894280581186445, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 35424 + }, + { + "epoch": 0.308951527097033, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 35425 + }, + { + "epoch": 0.3089602483822016, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 35426 + }, + { + "epoch": 0.3089689696673702, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 35427 + }, + { + "epoch": 0.30897769095253874, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 35428 + }, + { + "epoch": 0.30898641223770734, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 35429 + }, + { + "epoch": 0.30899513352287594, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 35430 + }, + { + "epoch": 0.3090038548080445, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 35431 + }, + { + "epoch": 0.3090125760932131, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 35432 + }, + { + "epoch": 0.3090212973783817, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 35433 + }, + { + "epoch": 0.3090300186635503, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 35434 + }, + { + "epoch": 0.30903873994871883, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 35435 + }, + { + "epoch": 0.30904746123388743, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 35436 + }, + { + "epoch": 0.30905618251905603, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 35437 + }, + { + "epoch": 0.3090649038042246, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 35438 + }, + { + "epoch": 0.3090736250893932, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 35439 + }, + { + "epoch": 0.3090823463745618, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 35440 + }, + { + "epoch": 0.3090910676597303, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 35441 + }, + { + "epoch": 0.3090997889448989, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 35442 + }, + { + "epoch": 0.3091085102300675, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 35443 + }, + { + "epoch": 0.30911723151523607, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 35444 + }, + { + "epoch": 0.30912595280040467, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 35445 + }, + { + "epoch": 0.30913467408557327, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 35446 + }, + { + "epoch": 0.3091433953707418, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 35447 + }, + { + "epoch": 0.3091521166559104, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 35448 + }, + { + "epoch": 0.309160837941079, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 35449 + }, + { + "epoch": 0.30916955922624756, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 35450 + }, + { + "epoch": 0.30917828051141616, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 35451 + }, + { + "epoch": 0.30918700179658476, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 35452 + }, + { + "epoch": 0.3091957230817533, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 35453 + }, + { + "epoch": 0.3092044443669219, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 35454 + }, + { + "epoch": 0.3092131656520905, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 35455 + }, + { + "epoch": 0.30922188693725905, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 35456 + }, + { + "epoch": 0.30923060822242765, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 35457 + }, + { + "epoch": 0.30923932950759625, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 35458 + }, + { + "epoch": 0.3092480507927648, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 35459 + }, + { + "epoch": 0.3092567720779334, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 35460 + }, + { + "epoch": 0.309265493363102, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 35461 + }, + { + "epoch": 0.3092742146482706, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 35462 + }, + { + "epoch": 0.30928293593343914, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 35463 + }, + { + "epoch": 0.30929165721860774, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 35464 + }, + { + "epoch": 0.30930037850377634, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 35465 + }, + { + "epoch": 0.3093090997889449, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 35466 + }, + { + "epoch": 0.3093178210741135, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 35467 + }, + { + "epoch": 0.3093265423592821, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 35468 + }, + { + "epoch": 0.30933526364445063, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 35469 + }, + { + "epoch": 0.30934398492961923, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9829, + "step": 35470 + }, + { + "epoch": 0.30935270621478783, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 35471 + }, + { + "epoch": 0.3093614274999564, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 35472 + }, + { + "epoch": 0.309370148785125, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 35473 + }, + { + "epoch": 0.3093788700702936, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 35474 + }, + { + "epoch": 0.3093875913554621, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 35475 + }, + { + "epoch": 0.3093963126406307, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9873, + "step": 35476 + }, + { + "epoch": 0.3094050339257993, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 35477 + }, + { + "epoch": 0.30941375521096787, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 35478 + }, + { + "epoch": 0.30942247649613647, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 35479 + }, + { + "epoch": 0.30943119778130507, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 35480 + }, + { + "epoch": 0.3094399190664736, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 35481 + }, + { + "epoch": 0.3094486403516422, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 35482 + }, + { + "epoch": 0.3094573616368108, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 35483 + }, + { + "epoch": 0.30946608292197936, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 35484 + }, + { + "epoch": 0.30947480420714796, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 35485 + }, + { + "epoch": 0.30948352549231656, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 35486 + }, + { + "epoch": 0.3094922467774851, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 35487 + }, + { + "epoch": 0.3095009680626537, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 35488 + }, + { + "epoch": 0.3095096893478223, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 35489 + }, + { + "epoch": 0.3095184106329909, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 35490 + }, + { + "epoch": 0.30952713191815945, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 35491 + }, + { + "epoch": 0.30953585320332805, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 35492 + }, + { + "epoch": 0.30954457448849665, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 35493 + }, + { + "epoch": 0.3095532957736652, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 35494 + }, + { + "epoch": 0.3095620170588338, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 35495 + }, + { + "epoch": 0.3095707383440024, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 35496 + }, + { + "epoch": 0.30957945962917094, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 35497 + }, + { + "epoch": 0.30958818091433954, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 35498 + }, + { + "epoch": 0.30959690219950814, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 35499 + }, + { + "epoch": 0.3096056234846767, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 35500 + }, + { + "epoch": 0.3096143447698453, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 35501 + }, + { + "epoch": 0.3096230660550139, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 35502 + }, + { + "epoch": 0.30963178734018243, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 35503 + }, + { + "epoch": 0.30964050862535103, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 35504 + }, + { + "epoch": 0.30964922991051963, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 35505 + }, + { + "epoch": 0.3096579511956882, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 35506 + }, + { + "epoch": 0.3096666724808568, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 35507 + }, + { + "epoch": 0.3096753937660254, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 35508 + }, + { + "epoch": 0.3096841150511939, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 35509 + }, + { + "epoch": 0.3096928363363625, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 35510 + }, + { + "epoch": 0.3097015576215311, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 35511 + }, + { + "epoch": 0.30971027890669967, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 35512 + }, + { + "epoch": 0.30971900019186827, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 35513 + }, + { + "epoch": 0.30972772147703687, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 35514 + }, + { + "epoch": 0.30973644276220547, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 35515 + }, + { + "epoch": 0.309745164047374, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 35516 + }, + { + "epoch": 0.3097538853325426, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 35517 + }, + { + "epoch": 0.3097626066177112, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 35518 + }, + { + "epoch": 0.30977132790287976, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 35519 + }, + { + "epoch": 0.30978004918804836, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 35520 + }, + { + "epoch": 0.30978877047321696, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 35521 + }, + { + "epoch": 0.3097974917583855, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 35522 + }, + { + "epoch": 0.3098062130435541, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 35523 + }, + { + "epoch": 0.3098149343287227, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 35524 + }, + { + "epoch": 0.30982365561389125, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 35525 + }, + { + "epoch": 0.30983237689905985, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 35526 + }, + { + "epoch": 0.30984109818422845, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 35527 + }, + { + "epoch": 0.309849819469397, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 35528 + }, + { + "epoch": 0.3098585407545656, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 35529 + }, + { + "epoch": 0.3098672620397342, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 35530 + }, + { + "epoch": 0.30987598332490274, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 35531 + }, + { + "epoch": 0.30988470461007134, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 35532 + }, + { + "epoch": 0.30989342589523994, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 35533 + }, + { + "epoch": 0.3099021471804085, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 35534 + }, + { + "epoch": 0.3099108684655771, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 35535 + }, + { + "epoch": 0.3099195897507457, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 35536 + }, + { + "epoch": 0.30992831103591423, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 35537 + }, + { + "epoch": 0.30993703232108283, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 35538 + }, + { + "epoch": 0.30994575360625143, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 35539 + }, + { + "epoch": 0.30995447489142, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 35540 + }, + { + "epoch": 0.3099631961765886, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 35541 + }, + { + "epoch": 0.3099719174617572, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 35542 + }, + { + "epoch": 0.3099806387469258, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 35543 + }, + { + "epoch": 0.3099893600320943, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 35544 + }, + { + "epoch": 0.3099980813172629, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 35545 + }, + { + "epoch": 0.3100068026024315, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 35546 + }, + { + "epoch": 0.31001552388760006, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 35547 + }, + { + "epoch": 0.31002424517276866, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 35548 + }, + { + "epoch": 0.31003296645793726, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 35549 + }, + { + "epoch": 0.3100416877431058, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 35550 + }, + { + "epoch": 0.3100504090282744, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 35551 + }, + { + "epoch": 0.310059130313443, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 35552 + }, + { + "epoch": 0.31006785159861155, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 35553 + }, + { + "epoch": 0.31007657288378015, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 35554 + }, + { + "epoch": 0.31008529416894876, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 35555 + }, + { + "epoch": 0.3100940154541173, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 35556 + }, + { + "epoch": 0.3101027367392859, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 35557 + }, + { + "epoch": 0.3101114580244545, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 35558 + }, + { + "epoch": 0.31012017930962305, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 35559 + }, + { + "epoch": 0.31012890059479165, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 35560 + }, + { + "epoch": 0.31013762187996025, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 35561 + }, + { + "epoch": 0.3101463431651288, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 35562 + }, + { + "epoch": 0.3101550644502974, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 35563 + }, + { + "epoch": 0.310163785735466, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 35564 + }, + { + "epoch": 0.31017250702063454, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 35565 + }, + { + "epoch": 0.31018122830580314, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 35566 + }, + { + "epoch": 0.31018994959097174, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 35567 + }, + { + "epoch": 0.3101986708761403, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 35568 + }, + { + "epoch": 0.3102073921613089, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 35569 + }, + { + "epoch": 0.3102161134464775, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 35570 + }, + { + "epoch": 0.3102248347316461, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 35571 + }, + { + "epoch": 0.3102335560168146, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 35572 + }, + { + "epoch": 0.3102422773019832, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 35573 + }, + { + "epoch": 0.3102509985871518, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 35574 + }, + { + "epoch": 0.31025971987232037, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 35575 + }, + { + "epoch": 0.31026844115748897, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 35576 + }, + { + "epoch": 0.3102771624426576, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 35577 + }, + { + "epoch": 0.3102858837278261, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 35578 + }, + { + "epoch": 0.3102946050129947, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 35579 + }, + { + "epoch": 0.3103033262981633, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 35580 + }, + { + "epoch": 0.31031204758333186, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 35581 + }, + { + "epoch": 0.31032076886850046, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 35582 + }, + { + "epoch": 0.31032949015366906, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 35583 + }, + { + "epoch": 0.3103382114388376, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 35584 + }, + { + "epoch": 0.3103469327240062, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 35585 + }, + { + "epoch": 0.3103556540091748, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 35586 + }, + { + "epoch": 0.31036437529434335, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 35587 + }, + { + "epoch": 0.31037309657951195, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 35588 + }, + { + "epoch": 0.31038181786468055, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 35589 + }, + { + "epoch": 0.3103905391498491, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 35590 + }, + { + "epoch": 0.3103992604350177, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 35591 + }, + { + "epoch": 0.3104079817201863, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 35592 + }, + { + "epoch": 0.31041670300535484, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 35593 + }, + { + "epoch": 0.31042542429052344, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 35594 + }, + { + "epoch": 0.31043414557569204, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 35595 + }, + { + "epoch": 0.3104428668608606, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 35596 + }, + { + "epoch": 0.3104515881460292, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 35597 + }, + { + "epoch": 0.3104603094311978, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 35598 + }, + { + "epoch": 0.3104690307163664, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 35599 + }, + { + "epoch": 0.31047775200153493, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 35600 + }, + { + "epoch": 0.31048647328670353, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 35601 + }, + { + "epoch": 0.31049519457187214, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 35602 + }, + { + "epoch": 0.3105039158570407, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 35603 + }, + { + "epoch": 0.3105126371422093, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 35604 + }, + { + "epoch": 0.3105213584273779, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 35605 + }, + { + "epoch": 0.3105300797125464, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 35606 + }, + { + "epoch": 0.310538800997715, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 35607 + }, + { + "epoch": 0.3105475222828836, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 35608 + }, + { + "epoch": 0.31055624356805217, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 35609 + }, + { + "epoch": 0.31056496485322077, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 35610 + }, + { + "epoch": 0.31057368613838937, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 35611 + }, + { + "epoch": 0.3105824074235579, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 35612 + }, + { + "epoch": 0.3105911287087265, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 35613 + }, + { + "epoch": 0.3105998499938951, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 35614 + }, + { + "epoch": 0.31060857127906366, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 35615 + }, + { + "epoch": 0.31061729256423226, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 35616 + }, + { + "epoch": 0.31062601384940086, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 35617 + }, + { + "epoch": 0.3106347351345694, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 35618 + }, + { + "epoch": 0.310643456419738, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 35619 + }, + { + "epoch": 0.3106521777049066, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 35620 + }, + { + "epoch": 0.31066089899007515, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 35621 + }, + { + "epoch": 0.31066962027524375, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 35622 + }, + { + "epoch": 0.31067834156041235, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 35623 + }, + { + "epoch": 0.31068706284558095, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 35624 + }, + { + "epoch": 0.3106957841307495, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 35625 + }, + { + "epoch": 0.3107045054159181, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 35626 + }, + { + "epoch": 0.3107132267010867, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 35627 + }, + { + "epoch": 0.31072194798625524, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 35628 + }, + { + "epoch": 0.31073066927142384, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 35629 + }, + { + "epoch": 0.31073939055659244, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 35630 + }, + { + "epoch": 0.310748111841761, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 35631 + }, + { + "epoch": 0.3107568331269296, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 35632 + }, + { + "epoch": 0.3107655544120982, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 35633 + }, + { + "epoch": 0.31077427569726673, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 35634 + }, + { + "epoch": 0.31078299698243533, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 35635 + }, + { + "epoch": 0.31079171826760393, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 35636 + }, + { + "epoch": 0.3108004395527725, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 35637 + }, + { + "epoch": 0.3108091608379411, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 35638 + }, + { + "epoch": 0.3108178821231097, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 35639 + }, + { + "epoch": 0.3108266034082782, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 35640 + }, + { + "epoch": 0.3108353246934468, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 35641 + }, + { + "epoch": 0.3108440459786154, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 35642 + }, + { + "epoch": 0.31085276726378397, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 35643 + }, + { + "epoch": 0.31086148854895257, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 35644 + }, + { + "epoch": 0.31087020983412117, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 35645 + }, + { + "epoch": 0.3108789311192897, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 35646 + }, + { + "epoch": 0.3108876524044583, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 35647 + }, + { + "epoch": 0.3108963736896269, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 35648 + }, + { + "epoch": 0.31090509497479546, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 35649 + }, + { + "epoch": 0.31091381625996406, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 35650 + }, + { + "epoch": 0.31092253754513266, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 35651 + }, + { + "epoch": 0.31093125883030126, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 35652 + }, + { + "epoch": 0.3109399801154698, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 35653 + }, + { + "epoch": 0.3109487014006384, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 35654 + }, + { + "epoch": 0.310957422685807, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 35655 + }, + { + "epoch": 0.31096614397097555, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 35656 + }, + { + "epoch": 0.31097486525614415, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 35657 + }, + { + "epoch": 0.31098358654131275, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 35658 + }, + { + "epoch": 0.3109923078264813, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 35659 + }, + { + "epoch": 0.3110010291116499, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 35660 + }, + { + "epoch": 0.3110097503968185, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 35661 + }, + { + "epoch": 0.31101847168198704, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 35662 + }, + { + "epoch": 0.31102719296715564, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 35663 + }, + { + "epoch": 0.31103591425232424, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 35664 + }, + { + "epoch": 0.3110446355374928, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 35665 + }, + { + "epoch": 0.3110533568226614, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 35666 + }, + { + "epoch": 0.31106207810783, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 35667 + }, + { + "epoch": 0.31107079939299853, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 35668 + }, + { + "epoch": 0.31107952067816713, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 35669 + }, + { + "epoch": 0.31108824196333573, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 35670 + }, + { + "epoch": 0.3110969632485043, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 35671 + }, + { + "epoch": 0.3111056845336729, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 35672 + }, + { + "epoch": 0.3111144058188415, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 35673 + }, + { + "epoch": 0.31112312710401, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 35674 + }, + { + "epoch": 0.3111318483891786, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 35675 + }, + { + "epoch": 0.3111405696743472, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 35676 + }, + { + "epoch": 0.31114929095951577, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 35677 + }, + { + "epoch": 0.31115801224468437, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 35678 + }, + { + "epoch": 0.31116673352985297, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 35679 + }, + { + "epoch": 0.31117545481502157, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 35680 + }, + { + "epoch": 0.3111841761001901, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 35681 + }, + { + "epoch": 0.3111928973853587, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 35682 + }, + { + "epoch": 0.3112016186705273, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 35683 + }, + { + "epoch": 0.31121033995569586, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 35684 + }, + { + "epoch": 0.31121906124086446, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 35685 + }, + { + "epoch": 0.31122778252603306, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 35686 + }, + { + "epoch": 0.3112365038112016, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 35687 + }, + { + "epoch": 0.3112452250963702, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 35688 + }, + { + "epoch": 0.3112539463815388, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 35689 + }, + { + "epoch": 0.31126266766670735, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 35690 + }, + { + "epoch": 0.31127138895187595, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 35691 + }, + { + "epoch": 0.31128011023704455, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 35692 + }, + { + "epoch": 0.3112888315222131, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 35693 + }, + { + "epoch": 0.3112975528073817, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 35694 + }, + { + "epoch": 0.3113062740925503, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 35695 + }, + { + "epoch": 0.31131499537771884, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 35696 + }, + { + "epoch": 0.31132371666288744, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 35697 + }, + { + "epoch": 0.31133243794805604, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 35698 + }, + { + "epoch": 0.3113411592332246, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 35699 + }, + { + "epoch": 0.3113498805183932, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 35700 + }, + { + "epoch": 0.3113586018035618, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 35701 + }, + { + "epoch": 0.31136732308873033, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 35702 + }, + { + "epoch": 0.31137604437389893, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 35703 + }, + { + "epoch": 0.31138476565906753, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 35704 + }, + { + "epoch": 0.3113934869442361, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 35705 + }, + { + "epoch": 0.3114022082294047, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 35706 + }, + { + "epoch": 0.3114109295145733, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 35707 + }, + { + "epoch": 0.3114196507997419, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 35708 + }, + { + "epoch": 0.3114283720849104, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 35709 + }, + { + "epoch": 0.311437093370079, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 35710 + }, + { + "epoch": 0.3114458146552476, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 35711 + }, + { + "epoch": 0.31145453594041617, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 35712 + }, + { + "epoch": 0.31146325722558477, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 35713 + }, + { + "epoch": 0.31147197851075337, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 35714 + }, + { + "epoch": 0.3114806997959219, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 35715 + }, + { + "epoch": 0.3114894210810905, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 35716 + }, + { + "epoch": 0.3114981423662591, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 35717 + }, + { + "epoch": 0.31150686365142766, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 35718 + }, + { + "epoch": 0.31151558493659626, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 35719 + }, + { + "epoch": 0.31152430622176486, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 35720 + }, + { + "epoch": 0.3115330275069334, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 35721 + }, + { + "epoch": 0.311541748792102, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 35722 + }, + { + "epoch": 0.3115504700772706, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 35723 + }, + { + "epoch": 0.31155919136243915, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 35724 + }, + { + "epoch": 0.31156791264760775, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 35725 + }, + { + "epoch": 0.31157663393277635, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 35726 + }, + { + "epoch": 0.3115853552179449, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 35727 + }, + { + "epoch": 0.3115940765031135, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 35728 + }, + { + "epoch": 0.3116027977882821, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 35729 + }, + { + "epoch": 0.31161151907345064, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 35730 + }, + { + "epoch": 0.31162024035861924, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 35731 + }, + { + "epoch": 0.31162896164378784, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 35732 + }, + { + "epoch": 0.31163768292895644, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 35733 + }, + { + "epoch": 0.311646404214125, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 35734 + }, + { + "epoch": 0.3116551254992936, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 35735 + }, + { + "epoch": 0.3116638467844622, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 35736 + }, + { + "epoch": 0.31167256806963073, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 35737 + }, + { + "epoch": 0.31168128935479933, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 35738 + }, + { + "epoch": 0.31169001063996793, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 35739 + }, + { + "epoch": 0.3116987319251365, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 35740 + }, + { + "epoch": 0.3117074532103051, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 35741 + }, + { + "epoch": 0.3117161744954737, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 35742 + }, + { + "epoch": 0.3117248957806422, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 35743 + }, + { + "epoch": 0.3117336170658108, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 35744 + }, + { + "epoch": 0.3117423383509794, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 35745 + }, + { + "epoch": 0.31175105963614796, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 35746 + }, + { + "epoch": 0.31175978092131656, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 35747 + }, + { + "epoch": 0.31176850220648517, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 35748 + }, + { + "epoch": 0.3117772234916537, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 35749 + }, + { + "epoch": 0.3117859447768223, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 35750 + }, + { + "epoch": 0.3117946660619909, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 35751 + }, + { + "epoch": 0.31180338734715946, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 35752 + }, + { + "epoch": 0.31181210863232806, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 35753 + }, + { + "epoch": 0.31182082991749666, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 35754 + }, + { + "epoch": 0.3118295512026652, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 35755 + }, + { + "epoch": 0.3118382724878338, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 35756 + }, + { + "epoch": 0.3118469937730024, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 35757 + }, + { + "epoch": 0.31185571505817095, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 35758 + }, + { + "epoch": 0.31186443634333955, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 35759 + }, + { + "epoch": 0.31187315762850815, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 35760 + }, + { + "epoch": 0.31188187891367675, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 35761 + }, + { + "epoch": 0.3118906001988453, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 35762 + }, + { + "epoch": 0.3118993214840139, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 35763 + }, + { + "epoch": 0.3119080427691825, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 35764 + }, + { + "epoch": 0.31191676405435104, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 35765 + }, + { + "epoch": 0.31192548533951964, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 35766 + }, + { + "epoch": 0.31193420662468824, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 35767 + }, + { + "epoch": 0.3119429279098568, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 35768 + }, + { + "epoch": 0.3119516491950254, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 35769 + }, + { + "epoch": 0.311960370480194, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 35770 + }, + { + "epoch": 0.3119690917653625, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 35771 + }, + { + "epoch": 0.3119778130505311, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 35772 + }, + { + "epoch": 0.31198653433569973, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 35773 + }, + { + "epoch": 0.3119952556208683, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 35774 + }, + { + "epoch": 0.3120039769060369, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 35775 + }, + { + "epoch": 0.3120126981912055, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 35776 + }, + { + "epoch": 0.312021419476374, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 35777 + }, + { + "epoch": 0.3120301407615426, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 35778 + }, + { + "epoch": 0.3120388620467112, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 35779 + }, + { + "epoch": 0.31204758333187976, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 35780 + }, + { + "epoch": 0.31205630461704836, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 35781 + }, + { + "epoch": 0.31206502590221696, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 35782 + }, + { + "epoch": 0.3120737471873855, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 35783 + }, + { + "epoch": 0.3120824684725541, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 35784 + }, + { + "epoch": 0.3120911897577227, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 35785 + }, + { + "epoch": 0.31209991104289125, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 35786 + }, + { + "epoch": 0.31210863232805985, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 35787 + }, + { + "epoch": 0.31211735361322845, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 35788 + }, + { + "epoch": 0.31212607489839705, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 35789 + }, + { + "epoch": 0.3121347961835656, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 35790 + }, + { + "epoch": 0.3121435174687342, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 35791 + }, + { + "epoch": 0.3121522387539028, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 35792 + }, + { + "epoch": 0.31216096003907134, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 35793 + }, + { + "epoch": 0.31216968132423994, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 35794 + }, + { + "epoch": 0.31217840260940855, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 35795 + }, + { + "epoch": 0.3121871238945771, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 35796 + }, + { + "epoch": 0.3121958451797457, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 35797 + }, + { + "epoch": 0.3122045664649143, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 35798 + }, + { + "epoch": 0.31221328775008284, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 35799 + }, + { + "epoch": 0.31222200903525144, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 35800 + }, + { + "epoch": 0.31223073032042004, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 35801 + }, + { + "epoch": 0.3122394516055886, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 35802 + }, + { + "epoch": 0.3122481728907572, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 35803 + }, + { + "epoch": 0.3122568941759258, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 35804 + }, + { + "epoch": 0.3122656154610943, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 35805 + }, + { + "epoch": 0.3122743367462629, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 35806 + }, + { + "epoch": 0.3122830580314315, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 35807 + }, + { + "epoch": 0.31229177931660007, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 35808 + }, + { + "epoch": 0.31230050060176867, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 35809 + }, + { + "epoch": 0.31230922188693727, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 35810 + }, + { + "epoch": 0.3123179431721058, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 35811 + }, + { + "epoch": 0.3123266644572744, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 35812 + }, + { + "epoch": 0.312335385742443, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 35813 + }, + { + "epoch": 0.31234410702761156, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 35814 + }, + { + "epoch": 0.31235282831278016, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 35815 + }, + { + "epoch": 0.31236154959794876, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 35816 + }, + { + "epoch": 0.31237027088311736, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 35817 + }, + { + "epoch": 0.3123789921682859, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 35818 + }, + { + "epoch": 0.3123877134534545, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 35819 + }, + { + "epoch": 0.3123964347386231, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 35820 + }, + { + "epoch": 0.31240515602379165, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 35821 + }, + { + "epoch": 0.31241387730896025, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 35822 + }, + { + "epoch": 0.31242259859412885, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 35823 + }, + { + "epoch": 0.3124313198792974, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 35824 + }, + { + "epoch": 0.312440041164466, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 35825 + }, + { + "epoch": 0.3124487624496346, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 35826 + }, + { + "epoch": 0.31245748373480314, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 35827 + }, + { + "epoch": 0.31246620501997174, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 35828 + }, + { + "epoch": 0.31247492630514034, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 35829 + }, + { + "epoch": 0.3124836475903089, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 35830 + }, + { + "epoch": 0.3124923688754775, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 35831 + }, + { + "epoch": 0.3125010901606461, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 35832 + }, + { + "epoch": 0.31250981144581463, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 35833 + }, + { + "epoch": 0.31251853273098323, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 35834 + }, + { + "epoch": 0.31252725401615183, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 35835 + }, + { + "epoch": 0.3125359753013204, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 35836 + }, + { + "epoch": 0.312544696586489, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 35837 + }, + { + "epoch": 0.3125534178716576, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 35838 + }, + { + "epoch": 0.3125621391568261, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 35839 + }, + { + "epoch": 0.3125708604419947, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 35840 + }, + { + "epoch": 0.3125795817271633, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 35841 + }, + { + "epoch": 0.31258830301233187, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 35842 + }, + { + "epoch": 0.31259702429750047, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 35843 + }, + { + "epoch": 0.31260574558266907, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 35844 + }, + { + "epoch": 0.31261446686783767, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 35845 + }, + { + "epoch": 0.3126231881530062, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 35846 + }, + { + "epoch": 0.3126319094381748, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 35847 + }, + { + "epoch": 0.3126406307233434, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 35848 + }, + { + "epoch": 0.31264935200851196, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 35849 + }, + { + "epoch": 0.31265807329368056, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 35850 + }, + { + "epoch": 0.31266679457884916, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 35851 + }, + { + "epoch": 0.3126755158640177, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 35852 + }, + { + "epoch": 0.3126842371491863, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 35853 + }, + { + "epoch": 0.3126929584343549, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 35854 + }, + { + "epoch": 0.31270167971952345, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 35855 + }, + { + "epoch": 0.31271040100469205, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 35856 + }, + { + "epoch": 0.31271912228986065, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 35857 + }, + { + "epoch": 0.3127278435750292, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 35858 + }, + { + "epoch": 0.3127365648601978, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 35859 + }, + { + "epoch": 0.3127452861453664, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 35860 + }, + { + "epoch": 0.31275400743053494, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 35861 + }, + { + "epoch": 0.31276272871570354, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 35862 + }, + { + "epoch": 0.31277145000087214, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 35863 + }, + { + "epoch": 0.3127801712860407, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 35864 + }, + { + "epoch": 0.3127888925712093, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 35865 + }, + { + "epoch": 0.3127976138563779, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 35866 + }, + { + "epoch": 0.31280633514154643, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 35867 + }, + { + "epoch": 0.31281505642671503, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 35868 + }, + { + "epoch": 0.31282377771188363, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 35869 + }, + { + "epoch": 0.31283249899705223, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 35870 + }, + { + "epoch": 0.3128412202822208, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 35871 + }, + { + "epoch": 0.3128499415673894, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 35872 + }, + { + "epoch": 0.312858662852558, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 35873 + }, + { + "epoch": 0.3128673841377265, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 35874 + }, + { + "epoch": 0.3128761054228951, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 35875 + }, + { + "epoch": 0.3128848267080637, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 35876 + }, + { + "epoch": 0.31289354799323227, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 35877 + }, + { + "epoch": 0.31290226927840087, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 35878 + }, + { + "epoch": 0.31291099056356947, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 35879 + }, + { + "epoch": 0.312919711848738, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 35880 + }, + { + "epoch": 0.3129284331339066, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 35881 + }, + { + "epoch": 0.3129371544190752, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 35882 + }, + { + "epoch": 0.31294587570424376, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 35883 + }, + { + "epoch": 0.31295459698941236, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 35884 + }, + { + "epoch": 0.31296331827458096, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 35885 + }, + { + "epoch": 0.3129720395597495, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 35886 + }, + { + "epoch": 0.3129807608449181, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 35887 + }, + { + "epoch": 0.3129894821300867, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 35888 + }, + { + "epoch": 0.31299820341525525, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 35889 + }, + { + "epoch": 0.31300692470042385, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 35890 + }, + { + "epoch": 0.31301564598559245, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 35891 + }, + { + "epoch": 0.313024367270761, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 35892 + }, + { + "epoch": 0.3130330885559296, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.976, + "step": 35893 + }, + { + "epoch": 0.3130418098410982, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 35894 + }, + { + "epoch": 0.31305053112626674, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 35895 + }, + { + "epoch": 0.31305925241143534, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 35896 + }, + { + "epoch": 0.31306797369660394, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 35897 + }, + { + "epoch": 0.31307669498177254, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 35898 + }, + { + "epoch": 0.3130854162669411, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 35899 + }, + { + "epoch": 0.3130941375521097, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 35900 + }, + { + "epoch": 0.3131028588372783, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 35901 + }, + { + "epoch": 0.31311158012244683, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 35902 + }, + { + "epoch": 0.31312030140761543, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 35903 + }, + { + "epoch": 0.31312902269278403, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 35904 + }, + { + "epoch": 0.3131377439779526, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 35905 + }, + { + "epoch": 0.3131464652631212, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 35906 + }, + { + "epoch": 0.3131551865482898, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 35907 + }, + { + "epoch": 0.3131639078334583, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 35908 + }, + { + "epoch": 0.3131726291186269, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 35909 + }, + { + "epoch": 0.3131813504037955, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 35910 + }, + { + "epoch": 0.31319007168896407, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 35911 + }, + { + "epoch": 0.31319879297413267, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 35912 + }, + { + "epoch": 0.31320751425930127, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 35913 + }, + { + "epoch": 0.3132162355444698, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 35914 + }, + { + "epoch": 0.3132249568296384, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 35915 + }, + { + "epoch": 0.313233678114807, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 35916 + }, + { + "epoch": 0.31324239939997556, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 35917 + }, + { + "epoch": 0.31325112068514416, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 35918 + }, + { + "epoch": 0.31325984197031276, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 35919 + }, + { + "epoch": 0.3132685632554813, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 35920 + }, + { + "epoch": 0.3132772845406499, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 35921 + }, + { + "epoch": 0.3132860058258185, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 35922 + }, + { + "epoch": 0.31329472711098705, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 35923 + }, + { + "epoch": 0.31330344839615565, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 35924 + }, + { + "epoch": 0.31331216968132425, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 35925 + }, + { + "epoch": 0.31332089096649285, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 35926 + }, + { + "epoch": 0.3133296122516614, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 35927 + }, + { + "epoch": 0.31333833353683, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 35928 + }, + { + "epoch": 0.3133470548219986, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 35929 + }, + { + "epoch": 0.31335577610716714, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 35930 + }, + { + "epoch": 0.31336449739233574, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 35931 + }, + { + "epoch": 0.31337321867750434, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.985, + "step": 35932 + }, + { + "epoch": 0.3133819399626729, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 35933 + }, + { + "epoch": 0.3133906612478415, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 35934 + }, + { + "epoch": 0.3133993825330101, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 35935 + }, + { + "epoch": 0.31340810381817863, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 35936 + }, + { + "epoch": 0.31341682510334723, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 35937 + }, + { + "epoch": 0.31342554638851583, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 35938 + }, + { + "epoch": 0.3134342676736844, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 35939 + }, + { + "epoch": 0.313442988958853, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 35940 + }, + { + "epoch": 0.3134517102440216, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 35941 + }, + { + "epoch": 0.3134604315291901, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 35942 + }, + { + "epoch": 0.3134691528143587, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 35943 + }, + { + "epoch": 0.3134778740995273, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 35944 + }, + { + "epoch": 0.31348659538469587, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 35945 + }, + { + "epoch": 0.31349531666986447, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 35946 + }, + { + "epoch": 0.31350403795503307, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 35947 + }, + { + "epoch": 0.3135127592402016, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 35948 + }, + { + "epoch": 0.3135214805253702, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 35949 + }, + { + "epoch": 0.3135302018105388, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 35950 + }, + { + "epoch": 0.31353892309570736, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 35951 + }, + { + "epoch": 0.31354764438087596, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 35952 + }, + { + "epoch": 0.31355636566604456, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 35953 + }, + { + "epoch": 0.31356508695121316, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 35954 + }, + { + "epoch": 0.3135738082363817, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 35955 + }, + { + "epoch": 0.3135825295215503, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 35956 + }, + { + "epoch": 0.3135912508067189, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 35957 + }, + { + "epoch": 0.31359997209188745, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 35958 + }, + { + "epoch": 0.31360869337705605, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 35959 + }, + { + "epoch": 0.31361741466222465, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 35960 + }, + { + "epoch": 0.3136261359473932, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 35961 + }, + { + "epoch": 0.3136348572325618, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 35962 + }, + { + "epoch": 0.3136435785177304, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 35963 + }, + { + "epoch": 0.31365229980289894, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 35964 + }, + { + "epoch": 0.31366102108806754, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 35965 + }, + { + "epoch": 0.31366974237323614, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 35966 + }, + { + "epoch": 0.3136784636584047, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 35967 + }, + { + "epoch": 0.3136871849435733, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 35968 + }, + { + "epoch": 0.3136959062287419, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 35969 + }, + { + "epoch": 0.31370462751391043, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 35970 + }, + { + "epoch": 0.31371334879907903, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 35971 + }, + { + "epoch": 0.31372207008424763, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 35972 + }, + { + "epoch": 0.3137307913694162, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 35973 + }, + { + "epoch": 0.3137395126545848, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 35974 + }, + { + "epoch": 0.3137482339397534, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 35975 + }, + { + "epoch": 0.3137569552249219, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 35976 + }, + { + "epoch": 0.3137656765100905, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 35977 + }, + { + "epoch": 0.3137743977952591, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 35978 + }, + { + "epoch": 0.3137831190804277, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 35979 + }, + { + "epoch": 0.31379184036559626, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 35980 + }, + { + "epoch": 0.31380056165076486, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 35981 + }, + { + "epoch": 0.31380928293593346, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 35982 + }, + { + "epoch": 0.313818004221102, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 35983 + }, + { + "epoch": 0.3138267255062706, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 35984 + }, + { + "epoch": 0.3138354467914392, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 35985 + }, + { + "epoch": 0.31384416807660775, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 35986 + }, + { + "epoch": 0.31385288936177635, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 35987 + }, + { + "epoch": 0.31386161064694496, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 35988 + }, + { + "epoch": 0.3138703319321135, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 35989 + }, + { + "epoch": 0.3138790532172821, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 35990 + }, + { + "epoch": 0.3138877745024507, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 35991 + }, + { + "epoch": 0.31389649578761925, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 35992 + }, + { + "epoch": 0.31390521707278785, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 35993 + }, + { + "epoch": 0.31391393835795645, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 35994 + }, + { + "epoch": 0.313922659643125, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 35995 + }, + { + "epoch": 0.3139313809282936, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 35996 + }, + { + "epoch": 0.3139401022134622, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 35997 + }, + { + "epoch": 0.31394882349863074, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 35998 + }, + { + "epoch": 0.31395754478379934, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 35999 + }, + { + "epoch": 0.31396626606896794, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 36000 + }, + { + "epoch": 0.3139749873541365, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 36001 + }, + { + "epoch": 0.3139837086393051, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 36002 + }, + { + "epoch": 0.3139924299244737, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 36003 + }, + { + "epoch": 0.3140011512096422, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 36004 + }, + { + "epoch": 0.3140098724948108, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 36005 + }, + { + "epoch": 0.3140185937799794, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 36006 + }, + { + "epoch": 0.314027315065148, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 36007 + }, + { + "epoch": 0.31403603635031657, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 36008 + }, + { + "epoch": 0.31404475763548517, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 36009 + }, + { + "epoch": 0.3140534789206538, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 36010 + }, + { + "epoch": 0.3140622002058223, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 36011 + }, + { + "epoch": 0.3140709214909909, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 36012 + }, + { + "epoch": 0.3140796427761595, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 36013 + }, + { + "epoch": 0.31408836406132806, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 36014 + }, + { + "epoch": 0.31409708534649666, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 36015 + }, + { + "epoch": 0.31410580663166526, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 36016 + }, + { + "epoch": 0.3141145279168338, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 36017 + }, + { + "epoch": 0.3141232492020024, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 36018 + }, + { + "epoch": 0.314131970487171, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 36019 + }, + { + "epoch": 0.31414069177233955, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 36020 + }, + { + "epoch": 0.31414941305750815, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 36021 + }, + { + "epoch": 0.31415813434267675, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 36022 + }, + { + "epoch": 0.3141668556278453, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 36023 + }, + { + "epoch": 0.3141755769130139, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 36024 + }, + { + "epoch": 0.3141842981981825, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 36025 + }, + { + "epoch": 0.31419301948335104, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 36026 + }, + { + "epoch": 0.31420174076851964, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 36027 + }, + { + "epoch": 0.31421046205368824, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 36028 + }, + { + "epoch": 0.3142191833388568, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 36029 + }, + { + "epoch": 0.3142279046240254, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 36030 + }, + { + "epoch": 0.314236625909194, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 36031 + }, + { + "epoch": 0.31424534719436253, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 36032 + }, + { + "epoch": 0.31425406847953113, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 36033 + }, + { + "epoch": 0.31426278976469973, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 36034 + }, + { + "epoch": 0.31427151104986834, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 36035 + }, + { + "epoch": 0.3142802323350369, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9861, + "step": 36036 + }, + { + "epoch": 0.3142889536202055, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 36037 + }, + { + "epoch": 0.3142976749053741, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 36038 + }, + { + "epoch": 0.3143063961905426, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 36039 + }, + { + "epoch": 0.3143151174757112, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 36040 + }, + { + "epoch": 0.3143238387608798, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 36041 + }, + { + "epoch": 0.31433256004604837, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 36042 + }, + { + "epoch": 0.31434128133121697, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 36043 + }, + { + "epoch": 0.31435000261638557, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 36044 + }, + { + "epoch": 0.3143587239015541, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 36045 + }, + { + "epoch": 0.3143674451867227, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 36046 + }, + { + "epoch": 0.3143761664718913, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 36047 + }, + { + "epoch": 0.31438488775705986, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 36048 + }, + { + "epoch": 0.31439360904222846, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 36049 + }, + { + "epoch": 0.31440233032739706, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 36050 + }, + { + "epoch": 0.3144110516125656, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 36051 + }, + { + "epoch": 0.3144197728977342, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 36052 + }, + { + "epoch": 0.3144284941829028, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 36053 + }, + { + "epoch": 0.31443721546807135, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 36054 + }, + { + "epoch": 0.31444593675323995, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 36055 + }, + { + "epoch": 0.31445465803840855, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 36056 + }, + { + "epoch": 0.3144633793235771, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 36057 + }, + { + "epoch": 0.3144721006087457, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 36058 + }, + { + "epoch": 0.3144808218939143, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9596, + "step": 36059 + }, + { + "epoch": 0.31448954317908284, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 36060 + }, + { + "epoch": 0.31449826446425144, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 36061 + }, + { + "epoch": 0.31450698574942004, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 36062 + }, + { + "epoch": 0.31451570703458864, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 36063 + }, + { + "epoch": 0.3145244283197572, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 36064 + }, + { + "epoch": 0.3145331496049258, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 36065 + }, + { + "epoch": 0.3145418708900944, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 36066 + }, + { + "epoch": 0.31455059217526293, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 36067 + }, + { + "epoch": 0.31455931346043153, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 36068 + }, + { + "epoch": 0.31456803474560013, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 36069 + }, + { + "epoch": 0.3145767560307687, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 36070 + }, + { + "epoch": 0.3145854773159373, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 36071 + }, + { + "epoch": 0.3145941986011059, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 36072 + }, + { + "epoch": 0.3146029198862744, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 36073 + }, + { + "epoch": 0.314611641171443, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 36074 + }, + { + "epoch": 0.3146203624566116, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 36075 + }, + { + "epoch": 0.31462908374178017, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 36076 + }, + { + "epoch": 0.31463780502694877, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 36077 + }, + { + "epoch": 0.31464652631211737, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 36078 + }, + { + "epoch": 0.3146552475972859, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 36079 + }, + { + "epoch": 0.3146639688824545, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 36080 + }, + { + "epoch": 0.3146726901676231, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 36081 + }, + { + "epoch": 0.31468141145279166, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 36082 + }, + { + "epoch": 0.31469013273796026, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 36083 + }, + { + "epoch": 0.31469885402312886, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 36084 + }, + { + "epoch": 0.3147075753082974, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 36085 + }, + { + "epoch": 0.314716296593466, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 36086 + }, + { + "epoch": 0.3147250178786346, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 36087 + }, + { + "epoch": 0.3147337391638032, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 36088 + }, + { + "epoch": 0.31474246044897175, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 36089 + }, + { + "epoch": 0.31475118173414035, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 36090 + }, + { + "epoch": 0.31475990301930895, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 36091 + }, + { + "epoch": 0.3147686243044775, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 36092 + }, + { + "epoch": 0.3147773455896461, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 36093 + }, + { + "epoch": 0.3147860668748147, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 36094 + }, + { + "epoch": 0.31479478815998324, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 36095 + }, + { + "epoch": 0.31480350944515184, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 36096 + }, + { + "epoch": 0.31481223073032044, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 36097 + }, + { + "epoch": 0.314820952015489, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 36098 + }, + { + "epoch": 0.3148296733006576, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 36099 + }, + { + "epoch": 0.3148383945858262, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 36100 + }, + { + "epoch": 0.31484711587099473, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 36101 + }, + { + "epoch": 0.31485583715616333, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 36102 + }, + { + "epoch": 0.31486455844133193, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 36103 + }, + { + "epoch": 0.3148732797265005, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 36104 + }, + { + "epoch": 0.3148820010116691, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 36105 + }, + { + "epoch": 0.3148907222968377, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 36106 + }, + { + "epoch": 0.3148994435820062, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 36107 + }, + { + "epoch": 0.3149081648671748, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 36108 + }, + { + "epoch": 0.3149168861523434, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 36109 + }, + { + "epoch": 0.31492560743751197, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 36110 + }, + { + "epoch": 0.31493432872268057, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 36111 + }, + { + "epoch": 0.31494305000784917, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 36112 + }, + { + "epoch": 0.3149517712930177, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 36113 + }, + { + "epoch": 0.3149604925781863, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 36114 + }, + { + "epoch": 0.3149692138633549, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 36115 + }, + { + "epoch": 0.3149779351485235, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 36116 + }, + { + "epoch": 0.31498665643369206, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 36117 + }, + { + "epoch": 0.31499537771886066, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 36118 + }, + { + "epoch": 0.31500409900402926, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 36119 + }, + { + "epoch": 0.3150128202891978, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 36120 + }, + { + "epoch": 0.3150215415743664, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 36121 + }, + { + "epoch": 0.315030262859535, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 36122 + }, + { + "epoch": 0.31503898414470355, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 36123 + }, + { + "epoch": 0.31504770542987215, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 36124 + }, + { + "epoch": 0.31505642671504075, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 36125 + }, + { + "epoch": 0.3150651480002093, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 36126 + }, + { + "epoch": 0.3150738692853779, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 36127 + }, + { + "epoch": 0.3150825905705465, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 36128 + }, + { + "epoch": 0.31509131185571504, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 36129 + }, + { + "epoch": 0.31510003314088364, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 36130 + }, + { + "epoch": 0.31510875442605224, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 36131 + }, + { + "epoch": 0.3151174757112208, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 36132 + }, + { + "epoch": 0.3151261969963894, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 36133 + }, + { + "epoch": 0.315134918281558, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 36134 + }, + { + "epoch": 0.31514363956672653, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 36135 + }, + { + "epoch": 0.31515236085189513, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 36136 + }, + { + "epoch": 0.31516108213706373, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 36137 + }, + { + "epoch": 0.3151698034222323, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 36138 + }, + { + "epoch": 0.3151785247074009, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 36139 + }, + { + "epoch": 0.3151872459925695, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 36140 + }, + { + "epoch": 0.315195967277738, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 36141 + }, + { + "epoch": 0.3152046885629066, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 36142 + }, + { + "epoch": 0.3152134098480752, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 36143 + }, + { + "epoch": 0.3152221311332438, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 36144 + }, + { + "epoch": 0.31523085241841237, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 36145 + }, + { + "epoch": 0.31523957370358097, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 36146 + }, + { + "epoch": 0.31524829498874957, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.982, + "step": 36147 + }, + { + "epoch": 0.3152570162739181, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 36148 + }, + { + "epoch": 0.3152657375590867, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 36149 + }, + { + "epoch": 0.3152744588442553, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 36150 + }, + { + "epoch": 0.31528318012942386, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 36151 + }, + { + "epoch": 0.31529190141459246, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 36152 + }, + { + "epoch": 0.31530062269976106, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 36153 + }, + { + "epoch": 0.3153093439849296, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 36154 + }, + { + "epoch": 0.3153180652700982, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 36155 + }, + { + "epoch": 0.3153267865552668, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 36156 + }, + { + "epoch": 0.31533550784043535, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 36157 + }, + { + "epoch": 0.31534422912560395, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 36158 + }, + { + "epoch": 0.31535295041077255, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 36159 + }, + { + "epoch": 0.3153616716959411, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 36160 + }, + { + "epoch": 0.3153703929811097, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 36161 + }, + { + "epoch": 0.3153791142662783, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 36162 + }, + { + "epoch": 0.31538783555144684, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 36163 + }, + { + "epoch": 0.31539655683661544, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 36164 + }, + { + "epoch": 0.31540527812178404, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 36165 + }, + { + "epoch": 0.3154139994069526, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 36166 + }, + { + "epoch": 0.3154227206921212, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 36167 + }, + { + "epoch": 0.3154314419772898, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 36168 + }, + { + "epoch": 0.31544016326245833, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 36169 + }, + { + "epoch": 0.31544888454762693, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 36170 + }, + { + "epoch": 0.31545760583279553, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 36171 + }, + { + "epoch": 0.31546632711796413, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 36172 + }, + { + "epoch": 0.3154750484031327, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 36173 + }, + { + "epoch": 0.3154837696883013, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 36174 + }, + { + "epoch": 0.3154924909734699, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 36175 + }, + { + "epoch": 0.3155012122586384, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 36176 + }, + { + "epoch": 0.315509933543807, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 36177 + }, + { + "epoch": 0.3155186548289756, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 36178 + }, + { + "epoch": 0.31552737611414416, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 36179 + }, + { + "epoch": 0.31553609739931276, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 36180 + }, + { + "epoch": 0.31554481868448137, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 36181 + }, + { + "epoch": 0.3155535399696499, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 36182 + }, + { + "epoch": 0.3155622612548185, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 36183 + }, + { + "epoch": 0.3155709825399871, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 36184 + }, + { + "epoch": 0.31557970382515566, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 36185 + }, + { + "epoch": 0.31558842511032426, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 36186 + }, + { + "epoch": 0.31559714639549286, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 36187 + }, + { + "epoch": 0.3156058676806614, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 36188 + }, + { + "epoch": 0.31561458896583, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 36189 + }, + { + "epoch": 0.3156233102509986, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 36190 + }, + { + "epoch": 0.31563203153616715, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 36191 + }, + { + "epoch": 0.31564075282133575, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 36192 + }, + { + "epoch": 0.31564947410650435, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 36193 + }, + { + "epoch": 0.3156581953916729, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 36194 + }, + { + "epoch": 0.3156669166768415, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 36195 + }, + { + "epoch": 0.3156756379620101, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 36196 + }, + { + "epoch": 0.3156843592471787, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 36197 + }, + { + "epoch": 0.31569308053234724, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 36198 + }, + { + "epoch": 0.31570180181751584, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 36199 + }, + { + "epoch": 0.31571052310268444, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9853, + "step": 36200 + }, + { + "epoch": 0.315719244387853, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 36201 + }, + { + "epoch": 0.3157279656730216, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 36202 + }, + { + "epoch": 0.3157366869581902, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 36203 + }, + { + "epoch": 0.3157454082433587, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 36204 + }, + { + "epoch": 0.3157541295285273, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 36205 + }, + { + "epoch": 0.31576285081369593, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 36206 + }, + { + "epoch": 0.3157715720988645, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 36207 + }, + { + "epoch": 0.3157802933840331, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 36208 + }, + { + "epoch": 0.3157890146692017, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 36209 + }, + { + "epoch": 0.3157977359543702, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 36210 + }, + { + "epoch": 0.3158064572395388, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 36211 + }, + { + "epoch": 0.3158151785247074, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 36212 + }, + { + "epoch": 0.31582389980987596, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 36213 + }, + { + "epoch": 0.31583262109504456, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 36214 + }, + { + "epoch": 0.31584134238021316, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 36215 + }, + { + "epoch": 0.3158500636653817, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 36216 + }, + { + "epoch": 0.3158587849505503, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 36217 + }, + { + "epoch": 0.3158675062357189, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 36218 + }, + { + "epoch": 0.31587622752088745, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 36219 + }, + { + "epoch": 0.31588494880605605, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 36220 + }, + { + "epoch": 0.31589367009122465, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 36221 + }, + { + "epoch": 0.3159023913763932, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 36222 + }, + { + "epoch": 0.3159111126615618, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 36223 + }, + { + "epoch": 0.3159198339467304, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 36224 + }, + { + "epoch": 0.315928555231899, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 36225 + }, + { + "epoch": 0.31593727651706754, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 36226 + }, + { + "epoch": 0.31594599780223614, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 36227 + }, + { + "epoch": 0.31595471908740475, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 36228 + }, + { + "epoch": 0.3159634403725733, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 36229 + }, + { + "epoch": 0.3159721616577419, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 36230 + }, + { + "epoch": 0.3159808829429105, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 36231 + }, + { + "epoch": 0.31598960422807904, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 36232 + }, + { + "epoch": 0.31599832551324764, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 36233 + }, + { + "epoch": 0.31600704679841624, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 36234 + }, + { + "epoch": 0.3160157680835848, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 36235 + }, + { + "epoch": 0.3160244893687534, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 36236 + }, + { + "epoch": 0.316033210653922, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 36237 + }, + { + "epoch": 0.3160419319390905, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 36238 + }, + { + "epoch": 0.3160506532242591, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 36239 + }, + { + "epoch": 0.3160593745094277, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 36240 + }, + { + "epoch": 0.31606809579459627, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 36241 + }, + { + "epoch": 0.31607681707976487, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 36242 + }, + { + "epoch": 0.31608553836493347, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 36243 + }, + { + "epoch": 0.316094259650102, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 36244 + }, + { + "epoch": 0.3161029809352706, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 36245 + }, + { + "epoch": 0.3161117022204392, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 36246 + }, + { + "epoch": 0.31612042350560776, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 36247 + }, + { + "epoch": 0.31612914479077636, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 36248 + }, + { + "epoch": 0.31613786607594496, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 36249 + }, + { + "epoch": 0.3161465873611135, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 36250 + }, + { + "epoch": 0.3161553086462821, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 36251 + }, + { + "epoch": 0.3161640299314507, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 36252 + }, + { + "epoch": 0.3161727512166193, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 36253 + }, + { + "epoch": 0.31618147250178785, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 36254 + }, + { + "epoch": 0.31619019378695645, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 36255 + }, + { + "epoch": 0.31619891507212505, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 36256 + }, + { + "epoch": 0.3162076363572936, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 36257 + }, + { + "epoch": 0.3162163576424622, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 36258 + }, + { + "epoch": 0.3162250789276308, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 36259 + }, + { + "epoch": 0.31623380021279934, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 36260 + }, + { + "epoch": 0.31624252149796794, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 36261 + }, + { + "epoch": 0.31625124278313654, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 36262 + }, + { + "epoch": 0.3162599640683051, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 36263 + }, + { + "epoch": 0.3162686853534737, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 36264 + }, + { + "epoch": 0.3162774066386423, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 36265 + }, + { + "epoch": 0.31628612792381083, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 36266 + }, + { + "epoch": 0.31629484920897943, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 36267 + }, + { + "epoch": 0.31630357049414803, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 36268 + }, + { + "epoch": 0.3163122917793166, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 36269 + }, + { + "epoch": 0.3163210130644852, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 36270 + }, + { + "epoch": 0.3163297343496538, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 36271 + }, + { + "epoch": 0.3163384556348223, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 36272 + }, + { + "epoch": 0.3163471769199909, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 36273 + }, + { + "epoch": 0.3163558982051595, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 36274 + }, + { + "epoch": 0.31636461949032807, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 36275 + }, + { + "epoch": 0.31637334077549667, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 36276 + }, + { + "epoch": 0.31638206206066527, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 36277 + }, + { + "epoch": 0.3163907833458338, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 36278 + }, + { + "epoch": 0.3163995046310024, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 36279 + }, + { + "epoch": 0.316408225916171, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 36280 + }, + { + "epoch": 0.3164169472013396, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 36281 + }, + { + "epoch": 0.31642566848650816, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 36282 + }, + { + "epoch": 0.31643438977167676, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 36283 + }, + { + "epoch": 0.31644311105684536, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 36284 + }, + { + "epoch": 0.3164518323420139, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 36285 + }, + { + "epoch": 0.3164605536271825, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 36286 + }, + { + "epoch": 0.3164692749123511, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 36287 + }, + { + "epoch": 0.31647799619751965, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 36288 + }, + { + "epoch": 0.31648671748268825, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 36289 + }, + { + "epoch": 0.31649543876785685, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 36290 + }, + { + "epoch": 0.3165041600530254, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 36291 + }, + { + "epoch": 0.316512881338194, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 36292 + }, + { + "epoch": 0.3165216026233626, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 36293 + }, + { + "epoch": 0.31653032390853114, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 36294 + }, + { + "epoch": 0.31653904519369974, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 36295 + }, + { + "epoch": 0.31654776647886834, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 36296 + }, + { + "epoch": 0.3165564877640369, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 36297 + }, + { + "epoch": 0.3165652090492055, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 36298 + }, + { + "epoch": 0.3165739303343741, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 36299 + }, + { + "epoch": 0.31658265161954263, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 36300 + }, + { + "epoch": 0.31659137290471123, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 36301 + }, + { + "epoch": 0.31660009418987983, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 36302 + }, + { + "epoch": 0.3166088154750484, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 36303 + }, + { + "epoch": 0.316617536760217, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 36304 + }, + { + "epoch": 0.3166262580453856, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 36305 + }, + { + "epoch": 0.3166349793305542, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 36306 + }, + { + "epoch": 0.3166437006157227, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 36307 + }, + { + "epoch": 0.3166524219008913, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 36308 + }, + { + "epoch": 0.3166611431860599, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 36309 + }, + { + "epoch": 0.31666986447122847, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 36310 + }, + { + "epoch": 0.31667858575639707, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 36311 + }, + { + "epoch": 0.31668730704156567, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 36312 + }, + { + "epoch": 0.3166960283267342, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 36313 + }, + { + "epoch": 0.3167047496119028, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 36314 + }, + { + "epoch": 0.3167134708970714, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 36315 + }, + { + "epoch": 0.31672219218223996, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 36316 + }, + { + "epoch": 0.31673091346740856, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 36317 + }, + { + "epoch": 0.31673963475257716, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 36318 + }, + { + "epoch": 0.3167483560377457, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 36319 + }, + { + "epoch": 0.3167570773229143, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 36320 + }, + { + "epoch": 0.3167657986080829, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 36321 + }, + { + "epoch": 0.31677451989325145, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 36322 + }, + { + "epoch": 0.31678324117842005, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 36323 + }, + { + "epoch": 0.31679196246358865, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 36324 + }, + { + "epoch": 0.3168006837487572, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 36325 + }, + { + "epoch": 0.3168094050339258, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 36326 + }, + { + "epoch": 0.3168181263190944, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 36327 + }, + { + "epoch": 0.31682684760426294, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 36328 + }, + { + "epoch": 0.31683556888943154, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 36329 + }, + { + "epoch": 0.31684429017460014, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 36330 + }, + { + "epoch": 0.3168530114597687, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 36331 + }, + { + "epoch": 0.3168617327449373, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 36332 + }, + { + "epoch": 0.3168704540301059, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 36333 + }, + { + "epoch": 0.3168791753152745, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 36334 + }, + { + "epoch": 0.31688789660044303, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 36335 + }, + { + "epoch": 0.31689661788561163, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 36336 + }, + { + "epoch": 0.31690533917078023, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 36337 + }, + { + "epoch": 0.3169140604559488, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 36338 + }, + { + "epoch": 0.3169227817411174, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 36339 + }, + { + "epoch": 0.316931503026286, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 36340 + }, + { + "epoch": 0.3169402243114545, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 36341 + }, + { + "epoch": 0.3169489455966231, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 36342 + }, + { + "epoch": 0.3169576668817917, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 36343 + }, + { + "epoch": 0.31696638816696027, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 36344 + }, + { + "epoch": 0.31697510945212887, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 36345 + }, + { + "epoch": 0.31698383073729747, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 36346 + }, + { + "epoch": 0.316992552022466, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 36347 + }, + { + "epoch": 0.3170012733076346, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 36348 + }, + { + "epoch": 0.3170099945928032, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 36349 + }, + { + "epoch": 0.31701871587797176, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 36350 + }, + { + "epoch": 0.31702743716314036, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 36351 + }, + { + "epoch": 0.31703615844830896, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 36352 + }, + { + "epoch": 0.3170448797334775, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 36353 + }, + { + "epoch": 0.3170536010186461, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 36354 + }, + { + "epoch": 0.3170623223038147, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 36355 + }, + { + "epoch": 0.31707104358898325, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 36356 + }, + { + "epoch": 0.31707976487415185, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 36357 + }, + { + "epoch": 0.31708848615932045, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 36358 + }, + { + "epoch": 0.317097207444489, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 36359 + }, + { + "epoch": 0.3171059287296576, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 36360 + }, + { + "epoch": 0.3171146500148262, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 36361 + }, + { + "epoch": 0.3171233712999948, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 36362 + }, + { + "epoch": 0.31713209258516334, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 36363 + }, + { + "epoch": 0.31714081387033194, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 36364 + }, + { + "epoch": 0.31714953515550054, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 36365 + }, + { + "epoch": 0.3171582564406691, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 36366 + }, + { + "epoch": 0.3171669777258377, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 36367 + }, + { + "epoch": 0.3171756990110063, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 36368 + }, + { + "epoch": 0.31718442029617483, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 36369 + }, + { + "epoch": 0.31719314158134343, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 36370 + }, + { + "epoch": 0.31720186286651203, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 36371 + }, + { + "epoch": 0.3172105841516806, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 36372 + }, + { + "epoch": 0.3172193054368492, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 36373 + }, + { + "epoch": 0.3172280267220178, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 36374 + }, + { + "epoch": 0.3172367480071863, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 36375 + }, + { + "epoch": 0.3172454692923549, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 36376 + }, + { + "epoch": 0.3172541905775235, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 36377 + }, + { + "epoch": 0.31726291186269207, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 36378 + }, + { + "epoch": 0.31727163314786067, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 36379 + }, + { + "epoch": 0.31728035443302927, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 36380 + }, + { + "epoch": 0.3172890757181978, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 36381 + }, + { + "epoch": 0.3172977970033664, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 36382 + }, + { + "epoch": 0.317306518288535, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 36383 + }, + { + "epoch": 0.31731523957370356, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 36384 + }, + { + "epoch": 0.31732396085887216, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 36385 + }, + { + "epoch": 0.31733268214404076, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 36386 + }, + { + "epoch": 0.3173414034292093, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 36387 + }, + { + "epoch": 0.3173501247143779, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 36388 + }, + { + "epoch": 0.3173588459995465, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 36389 + }, + { + "epoch": 0.3173675672847151, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 36390 + }, + { + "epoch": 0.31737628856988365, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 36391 + }, + { + "epoch": 0.31738500985505225, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 36392 + }, + { + "epoch": 0.31739373114022085, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 36393 + }, + { + "epoch": 0.3174024524253894, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 36394 + }, + { + "epoch": 0.317411173710558, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 36395 + }, + { + "epoch": 0.3174198949957266, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 36396 + }, + { + "epoch": 0.31742861628089514, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 36397 + }, + { + "epoch": 0.31743733756606374, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 36398 + }, + { + "epoch": 0.31744605885123234, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 36399 + }, + { + "epoch": 0.3174547801364009, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 36400 + }, + { + "epoch": 0.3174635014215695, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 36401 + }, + { + "epoch": 0.3174722227067381, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 36402 + }, + { + "epoch": 0.31748094399190663, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 36403 + }, + { + "epoch": 0.31748966527707523, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 36404 + }, + { + "epoch": 0.31749838656224383, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 36405 + }, + { + "epoch": 0.3175071078474124, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 36406 + }, + { + "epoch": 0.317515829132581, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 36407 + }, + { + "epoch": 0.3175245504177496, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 36408 + }, + { + "epoch": 0.3175332717029181, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 36409 + }, + { + "epoch": 0.3175419929880867, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 36410 + }, + { + "epoch": 0.3175507142732553, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 36411 + }, + { + "epoch": 0.31755943555842386, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 36412 + }, + { + "epoch": 0.31756815684359246, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 36413 + }, + { + "epoch": 0.31757687812876106, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 36414 + }, + { + "epoch": 0.3175855994139296, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 36415 + }, + { + "epoch": 0.3175943206990982, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 36416 + }, + { + "epoch": 0.3176030419842668, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 36417 + }, + { + "epoch": 0.3176117632694354, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 36418 + }, + { + "epoch": 0.31762048455460395, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 36419 + }, + { + "epoch": 0.31762920583977255, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 36420 + }, + { + "epoch": 0.31763792712494116, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 36421 + }, + { + "epoch": 0.3176466484101097, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 36422 + }, + { + "epoch": 0.3176553696952783, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 36423 + }, + { + "epoch": 0.3176640909804469, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 36424 + }, + { + "epoch": 0.31767281226561545, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 36425 + }, + { + "epoch": 0.31768153355078405, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 36426 + }, + { + "epoch": 0.31769025483595265, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 36427 + }, + { + "epoch": 0.3176989761211212, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 36428 + }, + { + "epoch": 0.3177076974062898, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 36429 + }, + { + "epoch": 0.3177164186914584, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 36430 + }, + { + "epoch": 0.31772513997662694, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 36431 + }, + { + "epoch": 0.31773386126179554, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 36432 + }, + { + "epoch": 0.31774258254696414, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 36433 + }, + { + "epoch": 0.3177513038321327, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 36434 + }, + { + "epoch": 0.3177600251173013, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 36435 + }, + { + "epoch": 0.3177687464024699, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 36436 + }, + { + "epoch": 0.3177774676876384, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 36437 + }, + { + "epoch": 0.317786188972807, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 36438 + }, + { + "epoch": 0.3177949102579756, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 36439 + }, + { + "epoch": 0.31780363154314417, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 36440 + }, + { + "epoch": 0.31781235282831277, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 36441 + }, + { + "epoch": 0.31782107411348137, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 36442 + }, + { + "epoch": 0.31782979539865, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 36443 + }, + { + "epoch": 0.3178385166838185, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 36444 + }, + { + "epoch": 0.3178472379689871, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 36445 + }, + { + "epoch": 0.3178559592541557, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 36446 + }, + { + "epoch": 0.31786468053932426, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 36447 + }, + { + "epoch": 0.31787340182449286, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 36448 + }, + { + "epoch": 0.31788212310966146, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 36449 + }, + { + "epoch": 0.31789084439483, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 36450 + }, + { + "epoch": 0.3178995656799986, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 36451 + }, + { + "epoch": 0.3179082869651672, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 36452 + }, + { + "epoch": 0.31791700825033575, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 36453 + }, + { + "epoch": 0.31792572953550435, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 36454 + }, + { + "epoch": 0.31793445082067295, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 36455 + }, + { + "epoch": 0.3179431721058415, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 36456 + }, + { + "epoch": 0.3179518933910101, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 36457 + }, + { + "epoch": 0.3179606146761787, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 36458 + }, + { + "epoch": 0.31796933596134724, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 36459 + }, + { + "epoch": 0.31797805724651584, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 36460 + }, + { + "epoch": 0.31798677853168444, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 36461 + }, + { + "epoch": 0.317995499816853, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 36462 + }, + { + "epoch": 0.3180042211020216, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 36463 + }, + { + "epoch": 0.3180129423871902, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 36464 + }, + { + "epoch": 0.31802166367235873, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 36465 + }, + { + "epoch": 0.31803038495752733, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 36466 + }, + { + "epoch": 0.31803910624269593, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 36467 + }, + { + "epoch": 0.3180478275278645, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 36468 + }, + { + "epoch": 0.3180565488130331, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 36469 + }, + { + "epoch": 0.3180652700982017, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 36470 + }, + { + "epoch": 0.3180739913833703, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 36471 + }, + { + "epoch": 0.3180827126685388, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 36472 + }, + { + "epoch": 0.3180914339537074, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 36473 + }, + { + "epoch": 0.318100155238876, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 36474 + }, + { + "epoch": 0.31810887652404457, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9841, + "step": 36475 + }, + { + "epoch": 0.31811759780921317, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 36476 + }, + { + "epoch": 0.31812631909438177, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 36477 + }, + { + "epoch": 0.3181350403795503, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 36478 + }, + { + "epoch": 0.3181437616647189, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 36479 + }, + { + "epoch": 0.3181524829498875, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 36480 + }, + { + "epoch": 0.31816120423505606, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 36481 + }, + { + "epoch": 0.31816992552022466, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 36482 + }, + { + "epoch": 0.31817864680539326, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 36483 + }, + { + "epoch": 0.3181873680905618, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 36484 + }, + { + "epoch": 0.3181960893757304, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 36485 + }, + { + "epoch": 0.318204810660899, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 36486 + }, + { + "epoch": 0.31821353194606755, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 36487 + }, + { + "epoch": 0.31822225323123615, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 36488 + }, + { + "epoch": 0.31823097451640475, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 36489 + }, + { + "epoch": 0.3182396958015733, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 36490 + }, + { + "epoch": 0.3182484170867419, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 36491 + }, + { + "epoch": 0.3182571383719105, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 36492 + }, + { + "epoch": 0.31826585965707904, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 36493 + }, + { + "epoch": 0.31827458094224764, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 36494 + }, + { + "epoch": 0.31828330222741624, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 36495 + }, + { + "epoch": 0.3182920235125848, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 36496 + }, + { + "epoch": 0.3183007447977534, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 36497 + }, + { + "epoch": 0.318309466082922, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 36498 + }, + { + "epoch": 0.3183181873680906, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 36499 + }, + { + "epoch": 0.31832690865325913, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 36500 + }, + { + "epoch": 0.31833562993842773, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 36501 + }, + { + "epoch": 0.31834435122359633, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 36502 + }, + { + "epoch": 0.3183530725087649, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 36503 + }, + { + "epoch": 0.3183617937939335, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 36504 + }, + { + "epoch": 0.3183705150791021, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 36505 + }, + { + "epoch": 0.3183792363642706, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 36506 + }, + { + "epoch": 0.3183879576494392, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 36507 + }, + { + "epoch": 0.3183966789346078, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 36508 + }, + { + "epoch": 0.31840540021977637, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 36509 + }, + { + "epoch": 0.31841412150494497, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 36510 + }, + { + "epoch": 0.31842284279011357, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 36511 + }, + { + "epoch": 0.3184315640752821, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 36512 + }, + { + "epoch": 0.3184402853604507, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 36513 + }, + { + "epoch": 0.3184490066456193, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 36514 + }, + { + "epoch": 0.31845772793078786, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 36515 + }, + { + "epoch": 0.31846644921595646, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 36516 + }, + { + "epoch": 0.31847517050112506, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 36517 + }, + { + "epoch": 0.3184838917862936, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 36518 + }, + { + "epoch": 0.3184926130714622, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 36519 + }, + { + "epoch": 0.3185013343566308, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 36520 + }, + { + "epoch": 0.31851005564179935, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 36521 + }, + { + "epoch": 0.31851877692696795, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 36522 + }, + { + "epoch": 0.31852749821213655, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 36523 + }, + { + "epoch": 0.3185362194973051, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 36524 + }, + { + "epoch": 0.3185449407824737, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 36525 + }, + { + "epoch": 0.3185536620676423, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 36526 + }, + { + "epoch": 0.3185623833528109, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 36527 + }, + { + "epoch": 0.31857110463797944, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 36528 + }, + { + "epoch": 0.31857982592314804, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 36529 + }, + { + "epoch": 0.31858854720831664, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 36530 + }, + { + "epoch": 0.3185972684934852, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 36531 + }, + { + "epoch": 0.3186059897786538, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 36532 + }, + { + "epoch": 0.3186147110638224, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 36533 + }, + { + "epoch": 0.31862343234899093, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 36534 + }, + { + "epoch": 0.31863215363415953, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 36535 + }, + { + "epoch": 0.31864087491932813, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 36536 + }, + { + "epoch": 0.3186495962044967, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 36537 + }, + { + "epoch": 0.3186583174896653, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 36538 + }, + { + "epoch": 0.3186670387748339, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 36539 + }, + { + "epoch": 0.3186757600600024, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 36540 + }, + { + "epoch": 0.318684481345171, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 36541 + }, + { + "epoch": 0.3186932026303396, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 36542 + }, + { + "epoch": 0.31870192391550817, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 36543 + }, + { + "epoch": 0.31871064520067677, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 36544 + }, + { + "epoch": 0.31871936648584537, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 36545 + }, + { + "epoch": 0.3187280877710139, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 36546 + }, + { + "epoch": 0.3187368090561825, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 36547 + }, + { + "epoch": 0.3187455303413511, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 0.9809, + "step": 36548 + }, + { + "epoch": 0.31875425162651966, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 36549 + }, + { + "epoch": 0.31876297291168826, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 36550 + }, + { + "epoch": 0.31877169419685686, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 36551 + }, + { + "epoch": 0.31878041548202546, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 36552 + }, + { + "epoch": 0.318789136767194, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 36553 + }, + { + "epoch": 0.3187978580523626, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 36554 + }, + { + "epoch": 0.3188065793375312, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 36555 + }, + { + "epoch": 0.31881530062269975, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 36556 + }, + { + "epoch": 0.31882402190786835, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 36557 + }, + { + "epoch": 0.31883274319303695, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 36558 + }, + { + "epoch": 0.3188414644782055, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 36559 + }, + { + "epoch": 0.3188501857633741, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 36560 + }, + { + "epoch": 0.3188589070485427, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 36561 + }, + { + "epoch": 0.31886762833371124, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 36562 + }, + { + "epoch": 0.31887634961887984, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 36563 + }, + { + "epoch": 0.31888507090404844, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 36564 + }, + { + "epoch": 0.318893792189217, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 36565 + }, + { + "epoch": 0.3189025134743856, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 36566 + }, + { + "epoch": 0.3189112347595542, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 36567 + }, + { + "epoch": 0.31891995604472273, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 36568 + }, + { + "epoch": 0.31892867732989133, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 36569 + }, + { + "epoch": 0.31893739861505993, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 36570 + }, + { + "epoch": 0.3189461199002285, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 36571 + }, + { + "epoch": 0.3189548411853971, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 36572 + }, + { + "epoch": 0.3189635624705657, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 36573 + }, + { + "epoch": 0.3189722837557342, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 36574 + }, + { + "epoch": 0.3189810050409028, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 36575 + }, + { + "epoch": 0.3189897263260714, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 36576 + }, + { + "epoch": 0.31899844761123997, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 36577 + }, + { + "epoch": 0.31900716889640857, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 36578 + }, + { + "epoch": 0.31901589018157717, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 36579 + }, + { + "epoch": 0.31902461146674577, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 36580 + }, + { + "epoch": 0.3190333327519143, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 36581 + }, + { + "epoch": 0.3190420540370829, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 36582 + }, + { + "epoch": 0.3190507753222515, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 36583 + }, + { + "epoch": 0.31905949660742006, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 36584 + }, + { + "epoch": 0.31906821789258866, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 36585 + }, + { + "epoch": 0.31907693917775726, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 36586 + }, + { + "epoch": 0.3190856604629258, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 36587 + }, + { + "epoch": 0.3190943817480944, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 36588 + }, + { + "epoch": 0.319103103033263, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 36589 + }, + { + "epoch": 0.31911182431843155, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 36590 + }, + { + "epoch": 0.31912054560360015, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 36591 + }, + { + "epoch": 0.31912926688876875, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 36592 + }, + { + "epoch": 0.3191379881739373, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 36593 + }, + { + "epoch": 0.3191467094591059, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 36594 + }, + { + "epoch": 0.3191554307442745, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 36595 + }, + { + "epoch": 0.31916415202944304, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 36596 + }, + { + "epoch": 0.31917287331461164, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 36597 + }, + { + "epoch": 0.31918159459978024, + "grad_norm": 0.07275390625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 36598 + }, + { + "epoch": 0.3191903158849488, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 36599 + }, + { + "epoch": 0.3191990371701174, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 36600 + }, + { + "epoch": 0.319207758455286, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 36601 + }, + { + "epoch": 0.31921647974045453, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 36602 + }, + { + "epoch": 0.31922520102562313, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 36603 + }, + { + "epoch": 0.31923392231079173, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 36604 + }, + { + "epoch": 0.3192426435959603, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 36605 + }, + { + "epoch": 0.3192513648811289, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 36606 + }, + { + "epoch": 0.3192600861662975, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 36607 + }, + { + "epoch": 0.3192688074514661, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 36608 + }, + { + "epoch": 0.3192775287366346, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 36609 + }, + { + "epoch": 0.3192862500218032, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 36610 + }, + { + "epoch": 0.3192949713069718, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 36611 + }, + { + "epoch": 0.31930369259214036, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 36612 + }, + { + "epoch": 0.31931241387730896, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 36613 + }, + { + "epoch": 0.31932113516247757, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 36614 + }, + { + "epoch": 0.3193298564476461, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 36615 + }, + { + "epoch": 0.3193385777328147, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 36616 + }, + { + "epoch": 0.3193472990179833, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 36617 + }, + { + "epoch": 0.31935602030315186, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 36618 + }, + { + "epoch": 0.31936474158832046, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 36619 + }, + { + "epoch": 0.31937346287348906, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 36620 + }, + { + "epoch": 0.3193821841586576, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 36621 + }, + { + "epoch": 0.3193909054438262, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 36622 + }, + { + "epoch": 0.3193996267289948, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 36623 + }, + { + "epoch": 0.31940834801416335, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 36624 + }, + { + "epoch": 0.31941706929933195, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 36625 + }, + { + "epoch": 0.31942579058450055, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 36626 + }, + { + "epoch": 0.3194345118696691, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 36627 + }, + { + "epoch": 0.3194432331548377, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 36628 + }, + { + "epoch": 0.3194519544400063, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 36629 + }, + { + "epoch": 0.31946067572517484, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 36630 + }, + { + "epoch": 0.31946939701034344, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 36631 + }, + { + "epoch": 0.31947811829551204, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 36632 + }, + { + "epoch": 0.3194868395806806, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 36633 + }, + { + "epoch": 0.3194955608658492, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 36634 + }, + { + "epoch": 0.3195042821510178, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 36635 + }, + { + "epoch": 0.3195130034361864, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 36636 + }, + { + "epoch": 0.3195217247213549, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 36637 + }, + { + "epoch": 0.3195304460065235, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 36638 + }, + { + "epoch": 0.3195391672916921, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 36639 + }, + { + "epoch": 0.3195478885768607, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 36640 + }, + { + "epoch": 0.3195566098620293, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 36641 + }, + { + "epoch": 0.3195653311471979, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 36642 + }, + { + "epoch": 0.3195740524323664, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 36643 + }, + { + "epoch": 0.319582773717535, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 36644 + }, + { + "epoch": 0.3195914950027036, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 36645 + }, + { + "epoch": 0.31960021628787216, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 36646 + }, + { + "epoch": 0.31960893757304076, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 36647 + }, + { + "epoch": 0.31961765885820936, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 36648 + }, + { + "epoch": 0.3196263801433779, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 36649 + }, + { + "epoch": 0.3196351014285465, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 36650 + }, + { + "epoch": 0.3196438227137151, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 36651 + }, + { + "epoch": 0.31965254399888365, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 36652 + }, + { + "epoch": 0.31966126528405225, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 36653 + }, + { + "epoch": 0.31966998656922085, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 36654 + }, + { + "epoch": 0.3196787078543894, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 36655 + }, + { + "epoch": 0.319687429139558, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 36656 + }, + { + "epoch": 0.3196961504247266, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 36657 + }, + { + "epoch": 0.31970487170989514, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 36658 + }, + { + "epoch": 0.31971359299506374, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 36659 + }, + { + "epoch": 0.31972231428023234, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 36660 + }, + { + "epoch": 0.31973103556540095, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 36661 + }, + { + "epoch": 0.3197397568505695, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 36662 + }, + { + "epoch": 0.3197484781357381, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 36663 + }, + { + "epoch": 0.3197571994209067, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 36664 + }, + { + "epoch": 0.31976592070607524, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 36665 + }, + { + "epoch": 0.31977464199124384, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 36666 + }, + { + "epoch": 0.31978336327641244, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 36667 + }, + { + "epoch": 0.319792084561581, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 36668 + }, + { + "epoch": 0.3198008058467496, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 36669 + }, + { + "epoch": 0.3198095271319182, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 36670 + }, + { + "epoch": 0.3198182484170867, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 36671 + }, + { + "epoch": 0.3198269697022553, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 36672 + }, + { + "epoch": 0.3198356909874239, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 36673 + }, + { + "epoch": 0.31984441227259247, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 36674 + }, + { + "epoch": 0.31985313355776107, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 36675 + }, + { + "epoch": 0.31986185484292967, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 36676 + }, + { + "epoch": 0.3198705761280982, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 36677 + }, + { + "epoch": 0.3198792974132668, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 36678 + }, + { + "epoch": 0.3198880186984354, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 36679 + }, + { + "epoch": 0.31989673998360396, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 36680 + }, + { + "epoch": 0.31990546126877256, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 36681 + }, + { + "epoch": 0.31991418255394116, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 36682 + }, + { + "epoch": 0.3199229038391097, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 36683 + }, + { + "epoch": 0.3199316251242783, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 36684 + }, + { + "epoch": 0.3199403464094469, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 36685 + }, + { + "epoch": 0.31994906769461545, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 36686 + }, + { + "epoch": 0.31995778897978405, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 36687 + }, + { + "epoch": 0.31996651026495265, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 36688 + }, + { + "epoch": 0.31997523155012125, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 36689 + }, + { + "epoch": 0.3199839528352898, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 36690 + }, + { + "epoch": 0.3199926741204584, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 36691 + }, + { + "epoch": 0.320001395405627, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 36692 + }, + { + "epoch": 0.32001011669079554, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 36693 + }, + { + "epoch": 0.32001883797596414, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 36694 + }, + { + "epoch": 0.32002755926113274, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 36695 + }, + { + "epoch": 0.3200362805463013, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 36696 + }, + { + "epoch": 0.3200450018314699, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 36697 + }, + { + "epoch": 0.3200537231166385, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 36698 + }, + { + "epoch": 0.32006244440180703, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 36699 + }, + { + "epoch": 0.32007116568697563, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 36700 + }, + { + "epoch": 0.32007988697214423, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 36701 + }, + { + "epoch": 0.3200886082573128, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 36702 + }, + { + "epoch": 0.3200973295424814, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 36703 + }, + { + "epoch": 0.32010605082765, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 36704 + }, + { + "epoch": 0.3201147721128185, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 36705 + }, + { + "epoch": 0.3201234933979871, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 36706 + }, + { + "epoch": 0.3201322146831557, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 36707 + }, + { + "epoch": 0.32014093596832427, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 36708 + }, + { + "epoch": 0.32014965725349287, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 36709 + }, + { + "epoch": 0.32015837853866147, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 36710 + }, + { + "epoch": 0.32016709982383, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 36711 + }, + { + "epoch": 0.3201758211089986, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 36712 + }, + { + "epoch": 0.3201845423941672, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 36713 + }, + { + "epoch": 0.32019326367933576, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 36714 + }, + { + "epoch": 0.32020198496450436, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 36715 + }, + { + "epoch": 0.32021070624967296, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 36716 + }, + { + "epoch": 0.32021942753484156, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 36717 + }, + { + "epoch": 0.3202281488200101, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 36718 + }, + { + "epoch": 0.3202368701051787, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 36719 + }, + { + "epoch": 0.3202455913903473, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 36720 + }, + { + "epoch": 0.32025431267551585, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 36721 + }, + { + "epoch": 0.32026303396068445, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 36722 + }, + { + "epoch": 0.32027175524585305, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 36723 + }, + { + "epoch": 0.3202804765310216, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 36724 + }, + { + "epoch": 0.3202891978161902, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 36725 + }, + { + "epoch": 0.3202979191013588, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 36726 + }, + { + "epoch": 0.32030664038652734, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 36727 + }, + { + "epoch": 0.32031536167169594, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 36728 + }, + { + "epoch": 0.32032408295686454, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 36729 + }, + { + "epoch": 0.3203328042420331, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 36730 + }, + { + "epoch": 0.3203415255272017, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 36731 + }, + { + "epoch": 0.3203502468123703, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 36732 + }, + { + "epoch": 0.32035896809753883, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 36733 + }, + { + "epoch": 0.32036768938270743, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 36734 + }, + { + "epoch": 0.32037641066787603, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 36735 + }, + { + "epoch": 0.3203851319530446, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 36736 + }, + { + "epoch": 0.3203938532382132, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 36737 + }, + { + "epoch": 0.3204025745233818, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 36738 + }, + { + "epoch": 0.3204112958085503, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 36739 + }, + { + "epoch": 0.3204200170937189, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 36740 + }, + { + "epoch": 0.3204287383788875, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 36741 + }, + { + "epoch": 0.32043745966405607, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 36742 + }, + { + "epoch": 0.32044618094922467, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 36743 + }, + { + "epoch": 0.32045490223439327, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 36744 + }, + { + "epoch": 0.32046362351956187, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 36745 + }, + { + "epoch": 0.3204723448047304, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 36746 + }, + { + "epoch": 0.320481066089899, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 36747 + }, + { + "epoch": 0.3204897873750676, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 36748 + }, + { + "epoch": 0.32049850866023616, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 36749 + }, + { + "epoch": 0.32050722994540476, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 36750 + }, + { + "epoch": 0.32051595123057336, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 36751 + }, + { + "epoch": 0.3205246725157419, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 36752 + }, + { + "epoch": 0.3205333938009105, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 36753 + }, + { + "epoch": 0.3205421150860791, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 36754 + }, + { + "epoch": 0.32055083637124765, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 36755 + }, + { + "epoch": 0.32055955765641625, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 36756 + }, + { + "epoch": 0.32056827894158485, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 36757 + }, + { + "epoch": 0.3205770002267534, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 36758 + }, + { + "epoch": 0.320585721511922, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 36759 + }, + { + "epoch": 0.3205944427970906, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 36760 + }, + { + "epoch": 0.32060316408225914, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 36761 + }, + { + "epoch": 0.32061188536742774, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 36762 + }, + { + "epoch": 0.32062060665259634, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 36763 + }, + { + "epoch": 0.3206293279377649, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 36764 + }, + { + "epoch": 0.3206380492229335, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 36765 + }, + { + "epoch": 0.3206467705081021, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 36766 + }, + { + "epoch": 0.32065549179327063, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 36767 + }, + { + "epoch": 0.32066421307843923, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 36768 + }, + { + "epoch": 0.32067293436360783, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 36769 + }, + { + "epoch": 0.32068165564877643, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 36770 + }, + { + "epoch": 0.320690376933945, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 36771 + }, + { + "epoch": 0.3206990982191136, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 36772 + }, + { + "epoch": 0.3207078195042822, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 36773 + }, + { + "epoch": 0.3207165407894507, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 36774 + }, + { + "epoch": 0.3207252620746193, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 36775 + }, + { + "epoch": 0.3207339833597879, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 36776 + }, + { + "epoch": 0.32074270464495647, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 36777 + }, + { + "epoch": 0.32075142593012507, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 36778 + }, + { + "epoch": 0.32076014721529367, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 36779 + }, + { + "epoch": 0.3207688685004622, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 36780 + }, + { + "epoch": 0.3207775897856308, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 36781 + }, + { + "epoch": 0.3207863110707994, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 36782 + }, + { + "epoch": 0.32079503235596796, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 36783 + }, + { + "epoch": 0.32080375364113656, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 36784 + }, + { + "epoch": 0.32081247492630516, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 36785 + }, + { + "epoch": 0.3208211962114737, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 36786 + }, + { + "epoch": 0.3208299174966423, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 36787 + }, + { + "epoch": 0.3208386387818109, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 36788 + }, + { + "epoch": 0.32084736006697945, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 36789 + }, + { + "epoch": 0.32085608135214805, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 36790 + }, + { + "epoch": 0.32086480263731665, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 36791 + }, + { + "epoch": 0.3208735239224852, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 36792 + }, + { + "epoch": 0.3208822452076538, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 36793 + }, + { + "epoch": 0.3208909664928224, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 36794 + }, + { + "epoch": 0.32089968777799094, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 36795 + }, + { + "epoch": 0.32090840906315954, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 36796 + }, + { + "epoch": 0.32091713034832814, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 36797 + }, + { + "epoch": 0.32092585163349674, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 36798 + }, + { + "epoch": 0.3209345729186653, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 36799 + }, + { + "epoch": 0.3209432942038339, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 36800 + }, + { + "epoch": 0.3209520154890025, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 36801 + }, + { + "epoch": 0.32096073677417103, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 36802 + }, + { + "epoch": 0.32096945805933963, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 36803 + }, + { + "epoch": 0.32097817934450823, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 36804 + }, + { + "epoch": 0.3209869006296768, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 36805 + }, + { + "epoch": 0.3209956219148454, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 36806 + }, + { + "epoch": 0.321004343200014, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 36807 + }, + { + "epoch": 0.3210130644851825, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 36808 + }, + { + "epoch": 0.3210217857703511, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 36809 + }, + { + "epoch": 0.3210305070555197, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 36810 + }, + { + "epoch": 0.32103922834068827, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 36811 + }, + { + "epoch": 0.32104794962585687, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 36812 + }, + { + "epoch": 0.32105667091102547, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 36813 + }, + { + "epoch": 0.321065392196194, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 36814 + }, + { + "epoch": 0.3210741134813626, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 36815 + }, + { + "epoch": 0.3210828347665312, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 36816 + }, + { + "epoch": 0.32109155605169976, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 36817 + }, + { + "epoch": 0.32110027733686836, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 36818 + }, + { + "epoch": 0.32110899862203696, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 36819 + }, + { + "epoch": 0.3211177199072055, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 36820 + }, + { + "epoch": 0.3211264411923741, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 36821 + }, + { + "epoch": 0.3211351624775427, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 36822 + }, + { + "epoch": 0.32114388376271125, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 36823 + }, + { + "epoch": 0.32115260504787985, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 36824 + }, + { + "epoch": 0.32116132633304845, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 36825 + }, + { + "epoch": 0.32117004761821705, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 36826 + }, + { + "epoch": 0.3211787689033856, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 36827 + }, + { + "epoch": 0.3211874901885542, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 36828 + }, + { + "epoch": 0.3211962114737228, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 36829 + }, + { + "epoch": 0.32120493275889134, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 36830 + }, + { + "epoch": 0.32121365404405994, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 36831 + }, + { + "epoch": 0.32122237532922854, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 36832 + }, + { + "epoch": 0.3212310966143971, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 36833 + }, + { + "epoch": 0.3212398178995657, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 36834 + }, + { + "epoch": 0.3212485391847343, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 36835 + }, + { + "epoch": 0.32125726046990283, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 36836 + }, + { + "epoch": 0.32126598175507143, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 36837 + }, + { + "epoch": 0.32127470304024003, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 36838 + }, + { + "epoch": 0.3212834243254086, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 36839 + }, + { + "epoch": 0.3212921456105772, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 36840 + }, + { + "epoch": 0.3213008668957458, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 36841 + }, + { + "epoch": 0.3213095881809143, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 36842 + }, + { + "epoch": 0.3213183094660829, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 36843 + }, + { + "epoch": 0.3213270307512515, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 36844 + }, + { + "epoch": 0.32133575203642006, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 36845 + }, + { + "epoch": 0.32134447332158866, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 36846 + }, + { + "epoch": 0.32135319460675726, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 36847 + }, + { + "epoch": 0.3213619158919258, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 36848 + }, + { + "epoch": 0.3213706371770944, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 36849 + }, + { + "epoch": 0.321379358462263, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 36850 + }, + { + "epoch": 0.32138807974743155, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 36851 + }, + { + "epoch": 0.32139680103260015, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 36852 + }, + { + "epoch": 0.32140552231776875, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 36853 + }, + { + "epoch": 0.32141424360293736, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 36854 + }, + { + "epoch": 0.3214229648881059, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 36855 + }, + { + "epoch": 0.3214316861732745, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 36856 + }, + { + "epoch": 0.3214404074584431, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 36857 + }, + { + "epoch": 0.32144912874361165, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 36858 + }, + { + "epoch": 0.32145785002878025, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 36859 + }, + { + "epoch": 0.32146657131394885, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9861, + "step": 36860 + }, + { + "epoch": 0.3214752925991174, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 36861 + }, + { + "epoch": 0.321484013884286, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 36862 + }, + { + "epoch": 0.3214927351694546, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 36863 + }, + { + "epoch": 0.32150145645462314, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 36864 + }, + { + "epoch": 0.32151017773979174, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 36865 + }, + { + "epoch": 0.32151889902496034, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 36866 + }, + { + "epoch": 0.3215276203101289, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 36867 + }, + { + "epoch": 0.3215363415952975, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 36868 + }, + { + "epoch": 0.3215450628804661, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 36869 + }, + { + "epoch": 0.3215537841656346, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 36870 + }, + { + "epoch": 0.3215625054508032, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 36871 + }, + { + "epoch": 0.3215712267359718, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 36872 + }, + { + "epoch": 0.32157994802114037, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 36873 + }, + { + "epoch": 0.32158866930630897, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 36874 + }, + { + "epoch": 0.32159739059147757, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 36875 + }, + { + "epoch": 0.3216061118766461, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 36876 + }, + { + "epoch": 0.3216148331618147, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 36877 + }, + { + "epoch": 0.3216235544469833, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 36878 + }, + { + "epoch": 0.32163227573215186, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 36879 + }, + { + "epoch": 0.32164099701732046, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 36880 + }, + { + "epoch": 0.32164971830248906, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 36881 + }, + { + "epoch": 0.32165843958765766, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 36882 + }, + { + "epoch": 0.3216671608728262, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 36883 + }, + { + "epoch": 0.3216758821579948, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 36884 + }, + { + "epoch": 0.3216846034431634, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 36885 + }, + { + "epoch": 0.32169332472833195, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 36886 + }, + { + "epoch": 0.32170204601350055, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 36887 + }, + { + "epoch": 0.32171076729866915, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 36888 + }, + { + "epoch": 0.3217194885838377, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 36889 + }, + { + "epoch": 0.3217282098690063, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 36890 + }, + { + "epoch": 0.3217369311541749, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 36891 + }, + { + "epoch": 0.32174565243934344, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 36892 + }, + { + "epoch": 0.32175437372451204, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 36893 + }, + { + "epoch": 0.32176309500968064, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 36894 + }, + { + "epoch": 0.3217718162948492, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 36895 + }, + { + "epoch": 0.3217805375800178, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 36896 + }, + { + "epoch": 0.3217892588651864, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 36897 + }, + { + "epoch": 0.32179798015035493, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 36898 + }, + { + "epoch": 0.32180670143552353, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 36899 + }, + { + "epoch": 0.32181542272069213, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 36900 + }, + { + "epoch": 0.3218241440058607, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 36901 + }, + { + "epoch": 0.3218328652910293, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 36902 + }, + { + "epoch": 0.3218415865761979, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 36903 + }, + { + "epoch": 0.3218503078613664, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 36904 + }, + { + "epoch": 0.321859029146535, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 36905 + }, + { + "epoch": 0.3218677504317036, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 36906 + }, + { + "epoch": 0.3218764717168722, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 36907 + }, + { + "epoch": 0.32188519300204077, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 36908 + }, + { + "epoch": 0.32189391428720937, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 36909 + }, + { + "epoch": 0.32190263557237797, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 36910 + }, + { + "epoch": 0.3219113568575465, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 36911 + }, + { + "epoch": 0.3219200781427151, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 36912 + }, + { + "epoch": 0.3219287994278837, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 36913 + }, + { + "epoch": 0.32193752071305226, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 36914 + }, + { + "epoch": 0.32194624199822086, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 36915 + }, + { + "epoch": 0.32195496328338946, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 36916 + }, + { + "epoch": 0.321963684568558, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 36917 + }, + { + "epoch": 0.3219724058537266, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 36918 + }, + { + "epoch": 0.3219811271388952, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 36919 + }, + { + "epoch": 0.32198984842406375, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 36920 + }, + { + "epoch": 0.32199856970923235, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 36921 + }, + { + "epoch": 0.32200729099440095, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 36922 + }, + { + "epoch": 0.3220160122795695, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 36923 + }, + { + "epoch": 0.3220247335647381, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 36924 + }, + { + "epoch": 0.3220334548499067, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 36925 + }, + { + "epoch": 0.32204217613507524, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 36926 + }, + { + "epoch": 0.32205089742024384, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 36927 + }, + { + "epoch": 0.32205961870541244, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 36928 + }, + { + "epoch": 0.322068339990581, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 36929 + }, + { + "epoch": 0.3220770612757496, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 36930 + }, + { + "epoch": 0.3220857825609182, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 36931 + }, + { + "epoch": 0.32209450384608673, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 36932 + }, + { + "epoch": 0.32210322513125533, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 36933 + }, + { + "epoch": 0.32211194641642393, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 36934 + }, + { + "epoch": 0.32212066770159253, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 36935 + }, + { + "epoch": 0.3221293889867611, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 36936 + }, + { + "epoch": 0.3221381102719297, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 36937 + }, + { + "epoch": 0.3221468315570983, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 36938 + }, + { + "epoch": 0.3221555528422668, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 36939 + }, + { + "epoch": 0.3221642741274354, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 36940 + }, + { + "epoch": 0.322172995412604, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 36941 + }, + { + "epoch": 0.32218171669777257, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 36942 + }, + { + "epoch": 0.32219043798294117, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 36943 + }, + { + "epoch": 0.32219915926810977, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 36944 + }, + { + "epoch": 0.3222078805532783, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 36945 + }, + { + "epoch": 0.3222166018384469, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 36946 + }, + { + "epoch": 0.3222253231236155, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 36947 + }, + { + "epoch": 0.32223404440878406, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 36948 + }, + { + "epoch": 0.32224276569395266, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 36949 + }, + { + "epoch": 0.32225148697912126, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 36950 + }, + { + "epoch": 0.3222602082642898, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 36951 + }, + { + "epoch": 0.3222689295494584, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 36952 + }, + { + "epoch": 0.322277650834627, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 36953 + }, + { + "epoch": 0.32228637211979555, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 36954 + }, + { + "epoch": 0.32229509340496415, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 36955 + }, + { + "epoch": 0.32230381469013275, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 36956 + }, + { + "epoch": 0.3223125359753013, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 36957 + }, + { + "epoch": 0.3223212572604699, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 36958 + }, + { + "epoch": 0.3223299785456385, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 36959 + }, + { + "epoch": 0.32233869983080704, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 36960 + }, + { + "epoch": 0.32234742111597564, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 36961 + }, + { + "epoch": 0.32235614240114424, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 36962 + }, + { + "epoch": 0.32236486368631284, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 36963 + }, + { + "epoch": 0.3223735849714814, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 36964 + }, + { + "epoch": 0.32238230625665, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 36965 + }, + { + "epoch": 0.3223910275418186, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 36966 + }, + { + "epoch": 0.32239974882698713, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 36967 + }, + { + "epoch": 0.32240847011215573, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 36968 + }, + { + "epoch": 0.32241719139732433, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 36969 + }, + { + "epoch": 0.3224259126824929, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 36970 + }, + { + "epoch": 0.3224346339676615, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 36971 + }, + { + "epoch": 0.3224433552528301, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 36972 + }, + { + "epoch": 0.3224520765379986, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 36973 + }, + { + "epoch": 0.3224607978231672, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 36974 + }, + { + "epoch": 0.3224695191083358, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 36975 + }, + { + "epoch": 0.32247824039350437, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 36976 + }, + { + "epoch": 0.32248696167867297, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 36977 + }, + { + "epoch": 0.32249568296384157, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 36978 + }, + { + "epoch": 0.3225044042490101, + "grad_norm": 0.369140625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 36979 + }, + { + "epoch": 0.3225131255341787, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 36980 + }, + { + "epoch": 0.3225218468193473, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 36981 + }, + { + "epoch": 0.32253056810451586, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 36982 + }, + { + "epoch": 0.32253928938968446, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 36983 + }, + { + "epoch": 0.32254801067485306, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 36984 + }, + { + "epoch": 0.3225567319600216, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 36985 + }, + { + "epoch": 0.3225654532451902, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 36986 + }, + { + "epoch": 0.3225741745303588, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 36987 + }, + { + "epoch": 0.32258289581552735, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 36988 + }, + { + "epoch": 0.32259161710069595, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 36989 + }, + { + "epoch": 0.32260033838586455, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 36990 + }, + { + "epoch": 0.32260905967103315, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 36991 + }, + { + "epoch": 0.3226177809562017, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 36992 + }, + { + "epoch": 0.3226265022413703, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 36993 + }, + { + "epoch": 0.3226352235265389, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 36994 + }, + { + "epoch": 0.32264394481170744, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 36995 + }, + { + "epoch": 0.32265266609687604, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 36996 + }, + { + "epoch": 0.32266138738204464, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 36997 + }, + { + "epoch": 0.3226701086672132, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 36998 + }, + { + "epoch": 0.3226788299523818, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 36999 + }, + { + "epoch": 0.3226875512375504, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 37000 + }, + { + "epoch": 0.32269627252271893, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 37001 + }, + { + "epoch": 0.32270499380788753, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 37002 + }, + { + "epoch": 0.32271371509305613, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 37003 + }, + { + "epoch": 0.3227224363782247, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 37004 + }, + { + "epoch": 0.3227311576633933, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 37005 + }, + { + "epoch": 0.3227398789485619, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 37006 + }, + { + "epoch": 0.3227486002337304, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 37007 + }, + { + "epoch": 0.322757321518899, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 37008 + }, + { + "epoch": 0.3227660428040676, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 37009 + }, + { + "epoch": 0.32277476408923617, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 37010 + }, + { + "epoch": 0.32278348537440477, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 37011 + }, + { + "epoch": 0.32279220665957337, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 37012 + }, + { + "epoch": 0.3228009279447419, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 37013 + }, + { + "epoch": 0.3228096492299105, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 37014 + }, + { + "epoch": 0.3228183705150791, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 37015 + }, + { + "epoch": 0.3228270918002477, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 37016 + }, + { + "epoch": 0.32283581308541626, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 37017 + }, + { + "epoch": 0.32284453437058486, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 37018 + }, + { + "epoch": 0.32285325565575346, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 37019 + }, + { + "epoch": 0.322861976940922, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 37020 + }, + { + "epoch": 0.3228706982260906, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 37021 + }, + { + "epoch": 0.3228794195112592, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 37022 + }, + { + "epoch": 0.32288814079642775, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 37023 + }, + { + "epoch": 0.32289686208159635, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 37024 + }, + { + "epoch": 0.32290558336676495, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 37025 + }, + { + "epoch": 0.3229143046519335, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 37026 + }, + { + "epoch": 0.3229230259371021, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 37027 + }, + { + "epoch": 0.3229317472222707, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 37028 + }, + { + "epoch": 0.32294046850743924, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 37029 + }, + { + "epoch": 0.32294918979260784, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 37030 + }, + { + "epoch": 0.32295791107777644, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 37031 + }, + { + "epoch": 0.322966632362945, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 37032 + }, + { + "epoch": 0.3229753536481136, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 37033 + }, + { + "epoch": 0.3229840749332822, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 37034 + }, + { + "epoch": 0.32299279621845073, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 37035 + }, + { + "epoch": 0.32300151750361933, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 37036 + }, + { + "epoch": 0.32301023878878793, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 37037 + }, + { + "epoch": 0.3230189600739565, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 37038 + }, + { + "epoch": 0.3230276813591251, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 37039 + }, + { + "epoch": 0.3230364026442937, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 37040 + }, + { + "epoch": 0.3230451239294622, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 37041 + }, + { + "epoch": 0.3230538452146308, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 37042 + }, + { + "epoch": 0.3230625664997994, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 37043 + }, + { + "epoch": 0.323071287784968, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 37044 + }, + { + "epoch": 0.32308000907013656, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 37045 + }, + { + "epoch": 0.32308873035530516, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 37046 + }, + { + "epoch": 0.32309745164047377, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 37047 + }, + { + "epoch": 0.3231061729256423, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 37048 + }, + { + "epoch": 0.3231148942108109, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 37049 + }, + { + "epoch": 0.3231236154959795, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 37050 + }, + { + "epoch": 0.32313233678114806, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 37051 + }, + { + "epoch": 0.32314105806631666, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 37052 + }, + { + "epoch": 0.32314977935148526, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 37053 + }, + { + "epoch": 0.3231585006366538, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 37054 + }, + { + "epoch": 0.3231672219218224, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 37055 + }, + { + "epoch": 0.323175943206991, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 37056 + }, + { + "epoch": 0.32318466449215955, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 37057 + }, + { + "epoch": 0.32319338577732815, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 37058 + }, + { + "epoch": 0.32320210706249675, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 37059 + }, + { + "epoch": 0.3232108283476653, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 37060 + }, + { + "epoch": 0.3232195496328339, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 37061 + }, + { + "epoch": 0.3232282709180025, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 37062 + }, + { + "epoch": 0.32323699220317104, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 37063 + }, + { + "epoch": 0.32324571348833964, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 37064 + }, + { + "epoch": 0.32325443477350824, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 37065 + }, + { + "epoch": 0.3232631560586768, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 37066 + }, + { + "epoch": 0.3232718773438454, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 37067 + }, + { + "epoch": 0.323280598629014, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 37068 + }, + { + "epoch": 0.3232893199141825, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 37069 + }, + { + "epoch": 0.3232980411993511, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 37070 + }, + { + "epoch": 0.3233067624845197, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 37071 + }, + { + "epoch": 0.3233154837696883, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 37072 + }, + { + "epoch": 0.3233242050548569, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 37073 + }, + { + "epoch": 0.3233329263400255, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 37074 + }, + { + "epoch": 0.3233416476251941, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 37075 + }, + { + "epoch": 0.3233503689103626, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 37076 + }, + { + "epoch": 0.3233590901955312, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 37077 + }, + { + "epoch": 0.3233678114806998, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 37078 + }, + { + "epoch": 0.32337653276586836, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 37079 + }, + { + "epoch": 0.32338525405103696, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 37080 + }, + { + "epoch": 0.32339397533620556, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 37081 + }, + { + "epoch": 0.3234026966213741, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 37082 + }, + { + "epoch": 0.3234114179065427, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 37083 + }, + { + "epoch": 0.3234201391917113, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 37084 + }, + { + "epoch": 0.32342886047687985, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 37085 + }, + { + "epoch": 0.32343758176204845, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 37086 + }, + { + "epoch": 0.32344630304721705, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 37087 + }, + { + "epoch": 0.3234550243323856, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 37088 + }, + { + "epoch": 0.3234637456175542, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 37089 + }, + { + "epoch": 0.3234724669027228, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 37090 + }, + { + "epoch": 0.32348118818789134, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 37091 + }, + { + "epoch": 0.32348990947305994, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 37092 + }, + { + "epoch": 0.32349863075822854, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 37093 + }, + { + "epoch": 0.3235073520433971, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 37094 + }, + { + "epoch": 0.3235160733285657, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 37095 + }, + { + "epoch": 0.3235247946137343, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 37096 + }, + { + "epoch": 0.32353351589890283, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 37097 + }, + { + "epoch": 0.32354223718407144, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 37098 + }, + { + "epoch": 0.32355095846924004, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 37099 + }, + { + "epoch": 0.32355967975440864, + "grad_norm": 0.40234375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 37100 + }, + { + "epoch": 0.3235684010395772, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 37101 + }, + { + "epoch": 0.3235771223247458, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 37102 + }, + { + "epoch": 0.3235858436099144, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 37103 + }, + { + "epoch": 0.3235945648950829, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 37104 + }, + { + "epoch": 0.3236032861802515, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 37105 + }, + { + "epoch": 0.3236120074654201, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 37106 + }, + { + "epoch": 0.32362072875058867, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 37107 + }, + { + "epoch": 0.32362945003575727, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 37108 + }, + { + "epoch": 0.32363817132092587, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 37109 + }, + { + "epoch": 0.3236468926060944, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 37110 + }, + { + "epoch": 0.323655613891263, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 37111 + }, + { + "epoch": 0.3236643351764316, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 37112 + }, + { + "epoch": 0.32367305646160016, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 37113 + }, + { + "epoch": 0.32368177774676876, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 37114 + }, + { + "epoch": 0.32369049903193736, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 37115 + }, + { + "epoch": 0.3236992203171059, + "grad_norm": 0.07275390625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 37116 + }, + { + "epoch": 0.3237079416022745, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 37117 + }, + { + "epoch": 0.3237166628874431, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 37118 + }, + { + "epoch": 0.32372538417261165, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 37119 + }, + { + "epoch": 0.32373410545778025, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 37120 + }, + { + "epoch": 0.32374282674294885, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 37121 + }, + { + "epoch": 0.3237515480281174, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 37122 + }, + { + "epoch": 0.323760269313286, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 37123 + }, + { + "epoch": 0.3237689905984546, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 37124 + }, + { + "epoch": 0.3237777118836232, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 37125 + }, + { + "epoch": 0.32378643316879174, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 37126 + }, + { + "epoch": 0.32379515445396034, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 37127 + }, + { + "epoch": 0.32380387573912894, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 37128 + }, + { + "epoch": 0.3238125970242975, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 37129 + }, + { + "epoch": 0.3238213183094661, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 37130 + }, + { + "epoch": 0.3238300395946347, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9846, + "step": 37131 + }, + { + "epoch": 0.32383876087980323, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 37132 + }, + { + "epoch": 0.32384748216497183, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 37133 + }, + { + "epoch": 0.32385620345014043, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 37134 + }, + { + "epoch": 0.323864924735309, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 37135 + }, + { + "epoch": 0.3238736460204776, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 37136 + }, + { + "epoch": 0.3238823673056462, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 37137 + }, + { + "epoch": 0.3238910885908147, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 37138 + }, + { + "epoch": 0.3238998098759833, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 37139 + }, + { + "epoch": 0.3239085311611519, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 37140 + }, + { + "epoch": 0.32391725244632047, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 37141 + }, + { + "epoch": 0.32392597373148907, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 37142 + }, + { + "epoch": 0.32393469501665767, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 37143 + }, + { + "epoch": 0.3239434163018262, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 37144 + }, + { + "epoch": 0.3239521375869948, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 37145 + }, + { + "epoch": 0.3239608588721634, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 37146 + }, + { + "epoch": 0.32396958015733196, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9824, + "step": 37147 + }, + { + "epoch": 0.32397830144250056, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 37148 + }, + { + "epoch": 0.32398702272766916, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 37149 + }, + { + "epoch": 0.3239957440128377, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 37150 + }, + { + "epoch": 0.3240044652980063, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 37151 + }, + { + "epoch": 0.3240131865831749, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 37152 + }, + { + "epoch": 0.3240219078683435, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 37153 + }, + { + "epoch": 0.32403062915351205, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 37154 + }, + { + "epoch": 0.32403935043868065, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 37155 + }, + { + "epoch": 0.32404807172384925, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 37156 + }, + { + "epoch": 0.3240567930090178, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 37157 + }, + { + "epoch": 0.3240655142941864, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 37158 + }, + { + "epoch": 0.324074235579355, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 37159 + }, + { + "epoch": 0.32408295686452354, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 37160 + }, + { + "epoch": 0.32409167814969214, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 37161 + }, + { + "epoch": 0.32410039943486074, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 37162 + }, + { + "epoch": 0.3241091207200293, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 37163 + }, + { + "epoch": 0.3241178420051979, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 37164 + }, + { + "epoch": 0.3241265632903665, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 37165 + }, + { + "epoch": 0.32413528457553503, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 37166 + }, + { + "epoch": 0.32414400586070363, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 37167 + }, + { + "epoch": 0.32415272714587223, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 37168 + }, + { + "epoch": 0.3241614484310408, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 37169 + }, + { + "epoch": 0.3241701697162094, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 37170 + }, + { + "epoch": 0.324178891001378, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 37171 + }, + { + "epoch": 0.3241876122865465, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 37172 + }, + { + "epoch": 0.3241963335717151, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 37173 + }, + { + "epoch": 0.3242050548568837, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 37174 + }, + { + "epoch": 0.32421377614205227, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 37175 + }, + { + "epoch": 0.32422249742722087, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 37176 + }, + { + "epoch": 0.32423121871238947, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 37177 + }, + { + "epoch": 0.324239939997558, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 37178 + }, + { + "epoch": 0.3242486612827266, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 37179 + }, + { + "epoch": 0.3242573825678952, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 37180 + }, + { + "epoch": 0.3242661038530638, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 37181 + }, + { + "epoch": 0.32427482513823236, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 37182 + }, + { + "epoch": 0.32428354642340096, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 37183 + }, + { + "epoch": 0.32429226770856956, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 37184 + }, + { + "epoch": 0.3243009889937381, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 37185 + }, + { + "epoch": 0.3243097102789067, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 37186 + }, + { + "epoch": 0.3243184315640753, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 37187 + }, + { + "epoch": 0.32432715284924385, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 37188 + }, + { + "epoch": 0.32433587413441245, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 37189 + }, + { + "epoch": 0.32434459541958105, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 37190 + }, + { + "epoch": 0.3243533167047496, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 37191 + }, + { + "epoch": 0.3243620379899182, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 37192 + }, + { + "epoch": 0.3243707592750868, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 37193 + }, + { + "epoch": 0.32437948056025534, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 37194 + }, + { + "epoch": 0.32438820184542394, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 37195 + }, + { + "epoch": 0.32439692313059254, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 37196 + }, + { + "epoch": 0.3244056444157611, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 37197 + }, + { + "epoch": 0.3244143657009297, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 37198 + }, + { + "epoch": 0.3244230869860983, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 37199 + }, + { + "epoch": 0.32443180827126683, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 37200 + }, + { + "epoch": 0.32444052955643543, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 37201 + }, + { + "epoch": 0.32444925084160403, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 37202 + }, + { + "epoch": 0.3244579721267726, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 37203 + }, + { + "epoch": 0.3244666934119412, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 37204 + }, + { + "epoch": 0.3244754146971098, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 37205 + }, + { + "epoch": 0.3244841359822783, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 37206 + }, + { + "epoch": 0.3244928572674469, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 37207 + }, + { + "epoch": 0.3245015785526155, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 37208 + }, + { + "epoch": 0.3245102998377841, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 37209 + }, + { + "epoch": 0.32451902112295267, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 37210 + }, + { + "epoch": 0.32452774240812127, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 37211 + }, + { + "epoch": 0.32453646369328987, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 37212 + }, + { + "epoch": 0.3245451849784584, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 37213 + }, + { + "epoch": 0.324553906263627, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 37214 + }, + { + "epoch": 0.3245626275487956, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 37215 + }, + { + "epoch": 0.32457134883396416, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 37216 + }, + { + "epoch": 0.32458007011913276, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 37217 + }, + { + "epoch": 0.32458879140430136, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 37218 + }, + { + "epoch": 0.3245975126894699, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 37219 + }, + { + "epoch": 0.3246062339746385, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 37220 + }, + { + "epoch": 0.3246149552598071, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 37221 + }, + { + "epoch": 0.32462367654497565, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 37222 + }, + { + "epoch": 0.32463239783014425, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 37223 + }, + { + "epoch": 0.32464111911531285, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 37224 + }, + { + "epoch": 0.3246498404004814, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 37225 + }, + { + "epoch": 0.32465856168565, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 37226 + }, + { + "epoch": 0.3246672829708186, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 37227 + }, + { + "epoch": 0.32467600425598714, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 37228 + }, + { + "epoch": 0.32468472554115574, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 37229 + }, + { + "epoch": 0.32469344682632434, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 37230 + }, + { + "epoch": 0.3247021681114929, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 37231 + }, + { + "epoch": 0.3247108893966615, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 37232 + }, + { + "epoch": 0.3247196106818301, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 37233 + }, + { + "epoch": 0.3247283319669987, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 37234 + }, + { + "epoch": 0.32473705325216723, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 37235 + }, + { + "epoch": 0.32474577453733583, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 37236 + }, + { + "epoch": 0.32475449582250443, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 37237 + }, + { + "epoch": 0.324763217107673, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 37238 + }, + { + "epoch": 0.3247719383928416, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 37239 + }, + { + "epoch": 0.3247806596780102, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 37240 + }, + { + "epoch": 0.3247893809631787, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 37241 + }, + { + "epoch": 0.3247981022483473, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 37242 + }, + { + "epoch": 0.3248068235335159, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 37243 + }, + { + "epoch": 0.32481554481868447, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 37244 + }, + { + "epoch": 0.32482426610385307, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 37245 + }, + { + "epoch": 0.32483298738902167, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 37246 + }, + { + "epoch": 0.3248417086741902, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 37247 + }, + { + "epoch": 0.3248504299593588, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 37248 + }, + { + "epoch": 0.3248591512445274, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 37249 + }, + { + "epoch": 0.32486787252969596, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 37250 + }, + { + "epoch": 0.32487659381486456, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 37251 + }, + { + "epoch": 0.32488531510003316, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 37252 + }, + { + "epoch": 0.3248940363852017, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 37253 + }, + { + "epoch": 0.3249027576703703, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 37254 + }, + { + "epoch": 0.3249114789555389, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 37255 + }, + { + "epoch": 0.32492020024070745, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 37256 + }, + { + "epoch": 0.32492892152587605, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 37257 + }, + { + "epoch": 0.32493764281104465, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 37258 + }, + { + "epoch": 0.3249463640962132, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 37259 + }, + { + "epoch": 0.3249550853813818, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 37260 + }, + { + "epoch": 0.3249638066665504, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 37261 + }, + { + "epoch": 0.324972527951719, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 37262 + }, + { + "epoch": 0.32498124923688754, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 37263 + }, + { + "epoch": 0.32498997052205614, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 37264 + }, + { + "epoch": 0.32499869180722474, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 37265 + }, + { + "epoch": 0.3250074130923933, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 37266 + }, + { + "epoch": 0.3250161343775619, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 37267 + }, + { + "epoch": 0.3250248556627305, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 37268 + }, + { + "epoch": 0.32503357694789903, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 37269 + }, + { + "epoch": 0.32504229823306763, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 37270 + }, + { + "epoch": 0.32505101951823623, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 37271 + }, + { + "epoch": 0.3250597408034048, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 37272 + }, + { + "epoch": 0.3250684620885734, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 37273 + }, + { + "epoch": 0.325077183373742, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 37274 + }, + { + "epoch": 0.3250859046589105, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 37275 + }, + { + "epoch": 0.3250946259440791, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 37276 + }, + { + "epoch": 0.3251033472292477, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 37277 + }, + { + "epoch": 0.32511206851441626, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 37278 + }, + { + "epoch": 0.32512078979958486, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 37279 + }, + { + "epoch": 0.32512951108475346, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 37280 + }, + { + "epoch": 0.325138232369922, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 37281 + }, + { + "epoch": 0.3251469536550906, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 37282 + }, + { + "epoch": 0.3251556749402592, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 37283 + }, + { + "epoch": 0.32516439622542775, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 37284 + }, + { + "epoch": 0.32517311751059635, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 37285 + }, + { + "epoch": 0.32518183879576495, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 37286 + }, + { + "epoch": 0.3251905600809335, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 37287 + }, + { + "epoch": 0.3251992813661021, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 37288 + }, + { + "epoch": 0.3252080026512707, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 37289 + }, + { + "epoch": 0.3252167239364393, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 37290 + }, + { + "epoch": 0.32522544522160785, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 37291 + }, + { + "epoch": 0.32523416650677645, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 37292 + }, + { + "epoch": 0.32524288779194505, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 37293 + }, + { + "epoch": 0.3252516090771136, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 37294 + }, + { + "epoch": 0.3252603303622822, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 37295 + }, + { + "epoch": 0.3252690516474508, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 37296 + }, + { + "epoch": 0.32527777293261934, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 37297 + }, + { + "epoch": 0.32528649421778794, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 37298 + }, + { + "epoch": 0.32529521550295654, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 37299 + }, + { + "epoch": 0.3253039367881251, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 37300 + }, + { + "epoch": 0.3253126580732937, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 37301 + }, + { + "epoch": 0.3253213793584623, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 37302 + }, + { + "epoch": 0.3253301006436308, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 37303 + }, + { + "epoch": 0.3253388219287994, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 37304 + }, + { + "epoch": 0.325347543213968, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 37305 + }, + { + "epoch": 0.32535626449913657, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 37306 + }, + { + "epoch": 0.32536498578430517, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 37307 + }, + { + "epoch": 0.32537370706947377, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.976, + "step": 37308 + }, + { + "epoch": 0.3253824283546423, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 37309 + }, + { + "epoch": 0.3253911496398109, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 37310 + }, + { + "epoch": 0.3253998709249795, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 37311 + }, + { + "epoch": 0.32540859221014806, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 37312 + }, + { + "epoch": 0.32541731349531666, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 37313 + }, + { + "epoch": 0.32542603478048526, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 37314 + }, + { + "epoch": 0.3254347560656538, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 37315 + }, + { + "epoch": 0.3254434773508224, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 37316 + }, + { + "epoch": 0.325452198635991, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 37317 + }, + { + "epoch": 0.3254609199211596, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 37318 + }, + { + "epoch": 0.32546964120632815, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 37319 + }, + { + "epoch": 0.32547836249149675, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 37320 + }, + { + "epoch": 0.32548708377666535, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 37321 + }, + { + "epoch": 0.3254958050618339, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 37322 + }, + { + "epoch": 0.3255045263470025, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 37323 + }, + { + "epoch": 0.3255132476321711, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 37324 + }, + { + "epoch": 0.32552196891733964, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 37325 + }, + { + "epoch": 0.32553069020250824, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 37326 + }, + { + "epoch": 0.32553941148767684, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 37327 + }, + { + "epoch": 0.3255481327728454, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 37328 + }, + { + "epoch": 0.325556854058014, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.972, + "step": 37329 + }, + { + "epoch": 0.3255655753431826, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 37330 + }, + { + "epoch": 0.32557429662835113, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 37331 + }, + { + "epoch": 0.32558301791351973, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 37332 + }, + { + "epoch": 0.32559173919868833, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 37333 + }, + { + "epoch": 0.3256004604838569, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 37334 + }, + { + "epoch": 0.3256091817690255, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 37335 + }, + { + "epoch": 0.3256179030541941, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 37336 + }, + { + "epoch": 0.3256266243393626, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 37337 + }, + { + "epoch": 0.3256353456245312, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 37338 + }, + { + "epoch": 0.3256440669096998, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 37339 + }, + { + "epoch": 0.32565278819486837, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 37340 + }, + { + "epoch": 0.32566150948003697, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9752, + "step": 37341 + }, + { + "epoch": 0.32567023076520557, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 37342 + }, + { + "epoch": 0.32567895205037417, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 37343 + }, + { + "epoch": 0.3256876733355427, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 37344 + }, + { + "epoch": 0.3256963946207113, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 37345 + }, + { + "epoch": 0.3257051159058799, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 37346 + }, + { + "epoch": 0.32571383719104846, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 37347 + }, + { + "epoch": 0.32572255847621706, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 37348 + }, + { + "epoch": 0.32573127976138566, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 37349 + }, + { + "epoch": 0.3257400010465542, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 37350 + }, + { + "epoch": 0.3257487223317228, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 37351 + }, + { + "epoch": 0.3257574436168914, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 37352 + }, + { + "epoch": 0.32576616490205995, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 37353 + }, + { + "epoch": 0.32577488618722855, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 37354 + }, + { + "epoch": 0.32578360747239715, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 37355 + }, + { + "epoch": 0.3257923287575657, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 37356 + }, + { + "epoch": 0.3258010500427343, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 37357 + }, + { + "epoch": 0.3258097713279029, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 37358 + }, + { + "epoch": 0.32581849261307144, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 37359 + }, + { + "epoch": 0.32582721389824004, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 37360 + }, + { + "epoch": 0.32583593518340864, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 37361 + }, + { + "epoch": 0.3258446564685772, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 37362 + }, + { + "epoch": 0.3258533777537458, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 37363 + }, + { + "epoch": 0.3258620990389144, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 37364 + }, + { + "epoch": 0.32587082032408293, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 37365 + }, + { + "epoch": 0.32587954160925153, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 37366 + }, + { + "epoch": 0.32588826289442013, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 37367 + }, + { + "epoch": 0.3258969841795887, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 37368 + }, + { + "epoch": 0.3259057054647573, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 37369 + }, + { + "epoch": 0.3259144267499259, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 37370 + }, + { + "epoch": 0.3259231480350945, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 37371 + }, + { + "epoch": 0.325931869320263, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 37372 + }, + { + "epoch": 0.3259405906054316, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 37373 + }, + { + "epoch": 0.3259493118906002, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 37374 + }, + { + "epoch": 0.32595803317576877, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 37375 + }, + { + "epoch": 0.32596675446093737, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 37376 + }, + { + "epoch": 0.32597547574610597, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 37377 + }, + { + "epoch": 0.3259841970312745, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 37378 + }, + { + "epoch": 0.3259929183164431, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 37379 + }, + { + "epoch": 0.3260016396016117, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 37380 + }, + { + "epoch": 0.32601036088678026, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 37381 + }, + { + "epoch": 0.32601908217194886, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 37382 + }, + { + "epoch": 0.32602780345711746, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 37383 + }, + { + "epoch": 0.326036524742286, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 37384 + }, + { + "epoch": 0.3260452460274546, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 37385 + }, + { + "epoch": 0.3260539673126232, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 37386 + }, + { + "epoch": 0.32606268859779175, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 37387 + }, + { + "epoch": 0.32607140988296035, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 37388 + }, + { + "epoch": 0.32608013116812895, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 37389 + }, + { + "epoch": 0.3260888524532975, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 37390 + }, + { + "epoch": 0.3260975737384661, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 37391 + }, + { + "epoch": 0.3261062950236347, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 37392 + }, + { + "epoch": 0.32611501630880324, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 37393 + }, + { + "epoch": 0.32612373759397184, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 37394 + }, + { + "epoch": 0.32613245887914044, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 37395 + }, + { + "epoch": 0.326141180164309, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 37396 + }, + { + "epoch": 0.3261499014494776, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 37397 + }, + { + "epoch": 0.3261586227346462, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 37398 + }, + { + "epoch": 0.3261673440198148, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 37399 + }, + { + "epoch": 0.32617606530498333, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 37400 + }, + { + "epoch": 0.32618478659015193, + "grad_norm": 0.341796875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 37401 + }, + { + "epoch": 0.32619350787532053, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 37402 + }, + { + "epoch": 0.3262022291604891, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 37403 + }, + { + "epoch": 0.3262109504456577, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 37404 + }, + { + "epoch": 0.3262196717308263, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 37405 + }, + { + "epoch": 0.3262283930159948, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 37406 + }, + { + "epoch": 0.3262371143011634, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 37407 + }, + { + "epoch": 0.326245835586332, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 37408 + }, + { + "epoch": 0.32625455687150057, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 37409 + }, + { + "epoch": 0.32626327815666917, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 37410 + }, + { + "epoch": 0.32627199944183777, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 37411 + }, + { + "epoch": 0.3262807207270063, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 37412 + }, + { + "epoch": 0.3262894420121749, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 37413 + }, + { + "epoch": 0.3262981632973435, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 37414 + }, + { + "epoch": 0.32630688458251206, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 37415 + }, + { + "epoch": 0.32631560586768066, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 37416 + }, + { + "epoch": 0.32632432715284926, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 37417 + }, + { + "epoch": 0.3263330484380178, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 37418 + }, + { + "epoch": 0.3263417697231864, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 37419 + }, + { + "epoch": 0.326350491008355, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 37420 + }, + { + "epoch": 0.32635921229352355, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 37421 + }, + { + "epoch": 0.32636793357869215, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 37422 + }, + { + "epoch": 0.32637665486386075, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 37423 + }, + { + "epoch": 0.3263853761490293, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 37424 + }, + { + "epoch": 0.3263940974341979, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 37425 + }, + { + "epoch": 0.3264028187193665, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 37426 + }, + { + "epoch": 0.3264115400045351, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 37427 + }, + { + "epoch": 0.32642026128970364, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 37428 + }, + { + "epoch": 0.32642898257487224, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 37429 + }, + { + "epoch": 0.32643770386004084, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 37430 + }, + { + "epoch": 0.3264464251452094, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 37431 + }, + { + "epoch": 0.326455146430378, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 37432 + }, + { + "epoch": 0.3264638677155466, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 37433 + }, + { + "epoch": 0.32647258900071513, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 37434 + }, + { + "epoch": 0.32648131028588373, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9848, + "step": 37435 + }, + { + "epoch": 0.32649003157105233, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 37436 + }, + { + "epoch": 0.3264987528562209, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 37437 + }, + { + "epoch": 0.3265074741413895, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 37438 + }, + { + "epoch": 0.3265161954265581, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 37439 + }, + { + "epoch": 0.3265249167117266, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 37440 + }, + { + "epoch": 0.3265336379968952, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 37441 + }, + { + "epoch": 0.3265423592820638, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 37442 + }, + { + "epoch": 0.32655108056723237, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 37443 + }, + { + "epoch": 0.32655980185240097, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 37444 + }, + { + "epoch": 0.32656852313756957, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 37445 + }, + { + "epoch": 0.3265772444227381, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 37446 + }, + { + "epoch": 0.3265859657079067, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 37447 + }, + { + "epoch": 0.3265946869930753, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 37448 + }, + { + "epoch": 0.32660340827824386, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 37449 + }, + { + "epoch": 0.32661212956341246, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 37450 + }, + { + "epoch": 0.32662085084858106, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 37451 + }, + { + "epoch": 0.3266295721337496, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 37452 + }, + { + "epoch": 0.3266382934189182, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 37453 + }, + { + "epoch": 0.3266470147040868, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 37454 + }, + { + "epoch": 0.3266557359892554, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 37455 + }, + { + "epoch": 0.32666445727442395, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 37456 + }, + { + "epoch": 0.32667317855959255, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 37457 + }, + { + "epoch": 0.32668189984476115, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 37458 + }, + { + "epoch": 0.3266906211299297, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9843, + "step": 37459 + }, + { + "epoch": 0.3266993424150983, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 37460 + }, + { + "epoch": 0.3267080637002669, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 37461 + }, + { + "epoch": 0.32671678498543544, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 37462 + }, + { + "epoch": 0.32672550627060404, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 37463 + }, + { + "epoch": 0.32673422755577264, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 37464 + }, + { + "epoch": 0.3267429488409412, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 37465 + }, + { + "epoch": 0.3267516701261098, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 37466 + }, + { + "epoch": 0.3267603914112784, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 37467 + }, + { + "epoch": 0.32676911269644693, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 37468 + }, + { + "epoch": 0.32677783398161553, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 37469 + }, + { + "epoch": 0.32678655526678413, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 37470 + }, + { + "epoch": 0.3267952765519527, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 37471 + }, + { + "epoch": 0.3268039978371213, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 37472 + }, + { + "epoch": 0.3268127191222899, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 37473 + }, + { + "epoch": 0.3268214404074584, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 37474 + }, + { + "epoch": 0.326830161692627, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 37475 + }, + { + "epoch": 0.3268388829777956, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 37476 + }, + { + "epoch": 0.32684760426296416, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 37477 + }, + { + "epoch": 0.32685632554813276, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 37478 + }, + { + "epoch": 0.32686504683330136, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 37479 + }, + { + "epoch": 0.32687376811846997, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 37480 + }, + { + "epoch": 0.3268824894036385, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 37481 + }, + { + "epoch": 0.3268912106888071, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 37482 + }, + { + "epoch": 0.3268999319739757, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 37483 + }, + { + "epoch": 0.32690865325914426, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 37484 + }, + { + "epoch": 0.32691737454431286, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 37485 + }, + { + "epoch": 0.32692609582948146, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 37486 + }, + { + "epoch": 0.32693481711465, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 37487 + }, + { + "epoch": 0.3269435383998186, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 37488 + }, + { + "epoch": 0.3269522596849872, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 37489 + }, + { + "epoch": 0.32696098097015575, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 37490 + }, + { + "epoch": 0.32696970225532435, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 37491 + }, + { + "epoch": 0.32697842354049295, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 37492 + }, + { + "epoch": 0.3269871448256615, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 37493 + }, + { + "epoch": 0.3269958661108301, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 37494 + }, + { + "epoch": 0.3270045873959987, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 37495 + }, + { + "epoch": 0.32701330868116724, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 37496 + }, + { + "epoch": 0.32702202996633584, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 37497 + }, + { + "epoch": 0.32703075125150444, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 37498 + }, + { + "epoch": 0.327039472536673, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 37499 + }, + { + "epoch": 0.3270481938218416, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 37500 + }, + { + "epoch": 0.3270569151070102, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 37501 + }, + { + "epoch": 0.3270656363921787, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 37502 + }, + { + "epoch": 0.3270743576773473, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 37503 + }, + { + "epoch": 0.3270830789625159, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 37504 + }, + { + "epoch": 0.32709180024768447, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 37505 + }, + { + "epoch": 0.3271005215328531, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 37506 + }, + { + "epoch": 0.3271092428180217, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 37507 + }, + { + "epoch": 0.3271179641031903, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 37508 + }, + { + "epoch": 0.3271266853883588, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 37509 + }, + { + "epoch": 0.3271354066735274, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 37510 + }, + { + "epoch": 0.327144127958696, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 37511 + }, + { + "epoch": 0.32715284924386456, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 37512 + }, + { + "epoch": 0.32716157052903316, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 37513 + }, + { + "epoch": 0.32717029181420176, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 37514 + }, + { + "epoch": 0.3271790130993703, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 37515 + }, + { + "epoch": 0.3271877343845389, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 37516 + }, + { + "epoch": 0.3271964556697075, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 37517 + }, + { + "epoch": 0.32720517695487605, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 37518 + }, + { + "epoch": 0.32721389824004465, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 37519 + }, + { + "epoch": 0.32722261952521325, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 37520 + }, + { + "epoch": 0.3272313408103818, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 37521 + }, + { + "epoch": 0.3272400620955504, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9807, + "step": 37522 + }, + { + "epoch": 0.327248783380719, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 37523 + }, + { + "epoch": 0.32725750466588754, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 37524 + }, + { + "epoch": 0.32726622595105614, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 37525 + }, + { + "epoch": 0.32727494723622474, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 37526 + }, + { + "epoch": 0.3272836685213933, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 37527 + }, + { + "epoch": 0.3272923898065619, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 37528 + }, + { + "epoch": 0.3273011110917305, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 37529 + }, + { + "epoch": 0.32730983237689903, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 37530 + }, + { + "epoch": 0.32731855366206764, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9842, + "step": 37531 + }, + { + "epoch": 0.32732727494723624, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 37532 + }, + { + "epoch": 0.3273359962324048, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 37533 + }, + { + "epoch": 0.3273447175175734, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 37534 + }, + { + "epoch": 0.327353438802742, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9807, + "step": 37535 + }, + { + "epoch": 0.3273621600879106, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 37536 + }, + { + "epoch": 0.3273708813730791, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 37537 + }, + { + "epoch": 0.3273796026582477, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 37538 + }, + { + "epoch": 0.3273883239434163, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 37539 + }, + { + "epoch": 0.32739704522858487, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 37540 + }, + { + "epoch": 0.32740576651375347, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 37541 + }, + { + "epoch": 0.32741448779892207, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 37542 + }, + { + "epoch": 0.3274232090840906, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 37543 + }, + { + "epoch": 0.3274319303692592, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 37544 + }, + { + "epoch": 0.3274406516544278, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 37545 + }, + { + "epoch": 0.32744937293959636, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 37546 + }, + { + "epoch": 0.32745809422476496, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 37547 + }, + { + "epoch": 0.32746681550993356, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 37548 + }, + { + "epoch": 0.3274755367951021, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 37549 + }, + { + "epoch": 0.3274842580802707, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 37550 + }, + { + "epoch": 0.3274929793654393, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 37551 + }, + { + "epoch": 0.32750170065060785, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 37552 + }, + { + "epoch": 0.32751042193577645, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 37553 + }, + { + "epoch": 0.32751914322094505, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 37554 + }, + { + "epoch": 0.3275278645061136, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 37555 + }, + { + "epoch": 0.3275365857912822, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 37556 + }, + { + "epoch": 0.3275453070764508, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 37557 + }, + { + "epoch": 0.32755402836161934, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 37558 + }, + { + "epoch": 0.32756274964678794, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 37559 + }, + { + "epoch": 0.32757147093195654, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 37560 + }, + { + "epoch": 0.3275801922171251, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 37561 + }, + { + "epoch": 0.3275889135022937, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 37562 + }, + { + "epoch": 0.3275976347874623, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 37563 + }, + { + "epoch": 0.3276063560726309, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 37564 + }, + { + "epoch": 0.32761507735779943, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9825, + "step": 37565 + }, + { + "epoch": 0.32762379864296803, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 37566 + }, + { + "epoch": 0.32763251992813663, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 37567 + }, + { + "epoch": 0.3276412412133052, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 37568 + }, + { + "epoch": 0.3276499624984738, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 37569 + }, + { + "epoch": 0.3276586837836424, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 37570 + }, + { + "epoch": 0.3276674050688109, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 37571 + }, + { + "epoch": 0.3276761263539795, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 37572 + }, + { + "epoch": 0.3276848476391481, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 37573 + }, + { + "epoch": 0.32769356892431667, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 37574 + }, + { + "epoch": 0.32770229020948527, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 37575 + }, + { + "epoch": 0.32771101149465387, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 37576 + }, + { + "epoch": 0.3277197327798224, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 37577 + }, + { + "epoch": 0.327728454064991, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 37578 + }, + { + "epoch": 0.3277371753501596, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 37579 + }, + { + "epoch": 0.32774589663532816, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 37580 + }, + { + "epoch": 0.32775461792049676, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 37581 + }, + { + "epoch": 0.32776333920566536, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 37582 + }, + { + "epoch": 0.3277720604908339, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 37583 + }, + { + "epoch": 0.3277807817760025, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 37584 + }, + { + "epoch": 0.3277895030611711, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 37585 + }, + { + "epoch": 0.32779822434633965, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 37586 + }, + { + "epoch": 0.32780694563150825, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 37587 + }, + { + "epoch": 0.32781566691667685, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 37588 + }, + { + "epoch": 0.32782438820184545, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 37589 + }, + { + "epoch": 0.327833109487014, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 37590 + }, + { + "epoch": 0.3278418307721826, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 37591 + }, + { + "epoch": 0.3278505520573512, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 37592 + }, + { + "epoch": 0.32785927334251974, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 37593 + }, + { + "epoch": 0.32786799462768834, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 37594 + }, + { + "epoch": 0.32787671591285694, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 37595 + }, + { + "epoch": 0.3278854371980255, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 37596 + }, + { + "epoch": 0.3278941584831941, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 37597 + }, + { + "epoch": 0.3279028797683627, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 37598 + }, + { + "epoch": 0.32791160105353123, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 37599 + }, + { + "epoch": 0.32792032233869983, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 37600 + }, + { + "epoch": 0.32792904362386843, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 37601 + }, + { + "epoch": 0.327937764909037, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 37602 + }, + { + "epoch": 0.3279464861942056, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 37603 + }, + { + "epoch": 0.3279552074793742, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 37604 + }, + { + "epoch": 0.3279639287645427, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 37605 + }, + { + "epoch": 0.3279726500497113, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 37606 + }, + { + "epoch": 0.3279813713348799, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 37607 + }, + { + "epoch": 0.32799009262004847, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 37608 + }, + { + "epoch": 0.32799881390521707, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 37609 + }, + { + "epoch": 0.32800753519038567, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 37610 + }, + { + "epoch": 0.3280162564755542, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 37611 + }, + { + "epoch": 0.3280249777607228, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 37612 + }, + { + "epoch": 0.3280336990458914, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 37613 + }, + { + "epoch": 0.32804242033105996, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 37614 + }, + { + "epoch": 0.32805114161622856, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 37615 + }, + { + "epoch": 0.32805986290139716, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 37616 + }, + { + "epoch": 0.32806858418656576, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 37617 + }, + { + "epoch": 0.3280773054717343, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 37618 + }, + { + "epoch": 0.3280860267569029, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 37619 + }, + { + "epoch": 0.3280947480420715, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 37620 + }, + { + "epoch": 0.32810346932724005, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 37621 + }, + { + "epoch": 0.32811219061240865, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 37622 + }, + { + "epoch": 0.32812091189757725, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 37623 + }, + { + "epoch": 0.3281296331827458, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 37624 + }, + { + "epoch": 0.3281383544679144, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 37625 + }, + { + "epoch": 0.328147075753083, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 37626 + }, + { + "epoch": 0.32815579703825154, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 37627 + }, + { + "epoch": 0.32816451832342014, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 37628 + }, + { + "epoch": 0.32817323960858874, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 37629 + }, + { + "epoch": 0.3281819608937573, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 37630 + }, + { + "epoch": 0.3281906821789259, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 37631 + }, + { + "epoch": 0.3281994034640945, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 37632 + }, + { + "epoch": 0.32820812474926303, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 37633 + }, + { + "epoch": 0.32821684603443163, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 37634 + }, + { + "epoch": 0.32822556731960023, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 37635 + }, + { + "epoch": 0.3282342886047688, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 37636 + }, + { + "epoch": 0.3282430098899374, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 37637 + }, + { + "epoch": 0.328251731175106, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 37638 + }, + { + "epoch": 0.3282604524602745, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 37639 + }, + { + "epoch": 0.3282691737454431, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 37640 + }, + { + "epoch": 0.3282778950306117, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 37641 + }, + { + "epoch": 0.32828661631578027, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 37642 + }, + { + "epoch": 0.32829533760094887, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 37643 + }, + { + "epoch": 0.32830405888611747, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 37644 + }, + { + "epoch": 0.32831278017128607, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 37645 + }, + { + "epoch": 0.3283215014564546, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 37646 + }, + { + "epoch": 0.3283302227416232, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 37647 + }, + { + "epoch": 0.3283389440267918, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 37648 + }, + { + "epoch": 0.32834766531196036, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 37649 + }, + { + "epoch": 0.32835638659712896, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 37650 + }, + { + "epoch": 0.32836510788229756, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 37651 + }, + { + "epoch": 0.3283738291674661, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 37652 + }, + { + "epoch": 0.3283825504526347, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 37653 + }, + { + "epoch": 0.3283912717378033, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 37654 + }, + { + "epoch": 0.32839999302297185, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 37655 + }, + { + "epoch": 0.32840871430814045, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 37656 + }, + { + "epoch": 0.32841743559330905, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 37657 + }, + { + "epoch": 0.3284261568784776, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 37658 + }, + { + "epoch": 0.3284348781636462, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 37659 + }, + { + "epoch": 0.3284435994488148, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 37660 + }, + { + "epoch": 0.32845232073398334, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 37661 + }, + { + "epoch": 0.32846104201915194, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 37662 + }, + { + "epoch": 0.32846976330432054, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 37663 + }, + { + "epoch": 0.3284784845894891, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 37664 + }, + { + "epoch": 0.3284872058746577, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 37665 + }, + { + "epoch": 0.3284959271598263, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 37666 + }, + { + "epoch": 0.32850464844499483, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 37667 + }, + { + "epoch": 0.32851336973016343, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 37668 + }, + { + "epoch": 0.32852209101533203, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 37669 + }, + { + "epoch": 0.3285308123005006, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 37670 + }, + { + "epoch": 0.3285395335856692, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 37671 + }, + { + "epoch": 0.3285482548708378, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 37672 + }, + { + "epoch": 0.3285569761560064, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 37673 + }, + { + "epoch": 0.3285656974411749, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 37674 + }, + { + "epoch": 0.3285744187263435, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 37675 + }, + { + "epoch": 0.3285831400115121, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 37676 + }, + { + "epoch": 0.32859186129668067, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 37677 + }, + { + "epoch": 0.32860058258184927, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 37678 + }, + { + "epoch": 0.32860930386701787, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 37679 + }, + { + "epoch": 0.3286180251521864, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 37680 + }, + { + "epoch": 0.328626746437355, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 37681 + }, + { + "epoch": 0.3286354677225236, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 37682 + }, + { + "epoch": 0.32864418900769216, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 37683 + }, + { + "epoch": 0.32865291029286076, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 37684 + }, + { + "epoch": 0.32866163157802936, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 37685 + }, + { + "epoch": 0.3286703528631979, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 37686 + }, + { + "epoch": 0.3286790741483665, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 37687 + }, + { + "epoch": 0.3286877954335351, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 37688 + }, + { + "epoch": 0.32869651671870365, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 37689 + }, + { + "epoch": 0.32870523800387225, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 37690 + }, + { + "epoch": 0.32871395928904085, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 37691 + }, + { + "epoch": 0.3287226805742094, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 37692 + }, + { + "epoch": 0.328731401859378, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 37693 + }, + { + "epoch": 0.3287401231445466, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 37694 + }, + { + "epoch": 0.32874884442971514, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 37695 + }, + { + "epoch": 0.32875756571488374, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 37696 + }, + { + "epoch": 0.32876628700005234, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 37697 + }, + { + "epoch": 0.32877500828522094, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 37698 + }, + { + "epoch": 0.3287837295703895, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 37699 + }, + { + "epoch": 0.3287924508555581, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 37700 + }, + { + "epoch": 0.3288011721407267, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 37701 + }, + { + "epoch": 0.3288098934258952, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 37702 + }, + { + "epoch": 0.32881861471106383, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 37703 + }, + { + "epoch": 0.32882733599623243, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 37704 + }, + { + "epoch": 0.328836057281401, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 37705 + }, + { + "epoch": 0.3288447785665696, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 37706 + }, + { + "epoch": 0.3288534998517382, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 37707 + }, + { + "epoch": 0.3288622211369067, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 37708 + }, + { + "epoch": 0.3288709424220753, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 37709 + }, + { + "epoch": 0.3288796637072439, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 37710 + }, + { + "epoch": 0.32888838499241246, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 37711 + }, + { + "epoch": 0.32889710627758106, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 37712 + }, + { + "epoch": 0.32890582756274966, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 37713 + }, + { + "epoch": 0.3289145488479182, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 37714 + }, + { + "epoch": 0.3289232701330868, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 37715 + }, + { + "epoch": 0.3289319914182554, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 37716 + }, + { + "epoch": 0.32894071270342395, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 37717 + }, + { + "epoch": 0.32894943398859255, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 37718 + }, + { + "epoch": 0.32895815527376115, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 37719 + }, + { + "epoch": 0.3289668765589297, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 37720 + }, + { + "epoch": 0.3289755978440983, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 37721 + }, + { + "epoch": 0.3289843191292669, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 37722 + }, + { + "epoch": 0.32899304041443544, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 37723 + }, + { + "epoch": 0.32900176169960405, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 37724 + }, + { + "epoch": 0.32901048298477265, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 37725 + }, + { + "epoch": 0.32901920426994125, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 37726 + }, + { + "epoch": 0.3290279255551098, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 37727 + }, + { + "epoch": 0.3290366468402784, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 37728 + }, + { + "epoch": 0.329045368125447, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 37729 + }, + { + "epoch": 0.32905408941061554, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 37730 + }, + { + "epoch": 0.32906281069578414, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 37731 + }, + { + "epoch": 0.32907153198095274, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 37732 + }, + { + "epoch": 0.3290802532661213, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 37733 + }, + { + "epoch": 0.3290889745512899, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 37734 + }, + { + "epoch": 0.3290976958364585, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 37735 + }, + { + "epoch": 0.329106417121627, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 37736 + }, + { + "epoch": 0.3291151384067956, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 37737 + }, + { + "epoch": 0.3291238596919642, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 37738 + }, + { + "epoch": 0.32913258097713277, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 37739 + }, + { + "epoch": 0.32914130226230137, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 37740 + }, + { + "epoch": 0.32915002354746997, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 37741 + }, + { + "epoch": 0.3291587448326385, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 37742 + }, + { + "epoch": 0.3291674661178071, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 37743 + }, + { + "epoch": 0.3291761874029757, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 37744 + }, + { + "epoch": 0.32918490868814426, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 37745 + }, + { + "epoch": 0.32919362997331286, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 37746 + }, + { + "epoch": 0.32920235125848146, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 37747 + }, + { + "epoch": 0.32921107254365, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 37748 + }, + { + "epoch": 0.3292197938288186, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 37749 + }, + { + "epoch": 0.3292285151139872, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 37750 + }, + { + "epoch": 0.32923723639915575, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 37751 + }, + { + "epoch": 0.32924595768432435, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 37752 + }, + { + "epoch": 0.32925467896949295, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 37753 + }, + { + "epoch": 0.32926340025466155, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 37754 + }, + { + "epoch": 0.3292721215398301, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 37755 + }, + { + "epoch": 0.3292808428249987, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 37756 + }, + { + "epoch": 0.3292895641101673, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 37757 + }, + { + "epoch": 0.32929828539533584, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 37758 + }, + { + "epoch": 0.32930700668050444, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 37759 + }, + { + "epoch": 0.32931572796567304, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 37760 + }, + { + "epoch": 0.3293244492508416, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 37761 + }, + { + "epoch": 0.3293331705360102, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 37762 + }, + { + "epoch": 0.3293418918211788, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 37763 + }, + { + "epoch": 0.32935061310634733, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 37764 + }, + { + "epoch": 0.32935933439151593, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 37765 + }, + { + "epoch": 0.32936805567668453, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 37766 + }, + { + "epoch": 0.3293767769618531, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 37767 + }, + { + "epoch": 0.3293854982470217, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 37768 + }, + { + "epoch": 0.3293942195321903, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 37769 + }, + { + "epoch": 0.3294029408173588, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 37770 + }, + { + "epoch": 0.3294116621025274, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 37771 + }, + { + "epoch": 0.329420383387696, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 37772 + }, + { + "epoch": 0.32942910467286457, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 37773 + }, + { + "epoch": 0.32943782595803317, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 37774 + }, + { + "epoch": 0.32944654724320177, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 37775 + }, + { + "epoch": 0.3294552685283703, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 37776 + }, + { + "epoch": 0.3294639898135389, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 37777 + }, + { + "epoch": 0.3294727110987075, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 37778 + }, + { + "epoch": 0.32948143238387606, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 37779 + }, + { + "epoch": 0.32949015366904466, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 37780 + }, + { + "epoch": 0.32949887495421326, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 37781 + }, + { + "epoch": 0.32950759623938186, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 37782 + }, + { + "epoch": 0.3295163175245504, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 37783 + }, + { + "epoch": 0.329525038809719, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 37784 + }, + { + "epoch": 0.3295337600948876, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 37785 + }, + { + "epoch": 0.32954248138005615, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 37786 + }, + { + "epoch": 0.32955120266522475, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 37787 + }, + { + "epoch": 0.32955992395039335, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 37788 + }, + { + "epoch": 0.3295686452355619, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 37789 + }, + { + "epoch": 0.3295773665207305, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 37790 + }, + { + "epoch": 0.3295860878058991, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 37791 + }, + { + "epoch": 0.32959480909106764, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 37792 + }, + { + "epoch": 0.32960353037623624, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 37793 + }, + { + "epoch": 0.32961225166140484, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 37794 + }, + { + "epoch": 0.3296209729465734, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 37795 + }, + { + "epoch": 0.329629694231742, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 37796 + }, + { + "epoch": 0.3296384155169106, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 37797 + }, + { + "epoch": 0.32964713680207913, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 37798 + }, + { + "epoch": 0.32965585808724773, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 37799 + }, + { + "epoch": 0.32966457937241633, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 37800 + }, + { + "epoch": 0.3296733006575849, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 37801 + }, + { + "epoch": 0.3296820219427535, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 37802 + }, + { + "epoch": 0.3296907432279221, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 37803 + }, + { + "epoch": 0.3296994645130906, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 37804 + }, + { + "epoch": 0.3297081857982592, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 37805 + }, + { + "epoch": 0.3297169070834278, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 37806 + }, + { + "epoch": 0.3297256283685964, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 37807 + }, + { + "epoch": 0.32973434965376497, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 37808 + }, + { + "epoch": 0.32974307093893357, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 37809 + }, + { + "epoch": 0.32975179222410217, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 37810 + }, + { + "epoch": 0.3297605135092707, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 37811 + }, + { + "epoch": 0.3297692347944393, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 37812 + }, + { + "epoch": 0.3297779560796079, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 37813 + }, + { + "epoch": 0.32978667736477646, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 37814 + }, + { + "epoch": 0.32979539864994506, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 37815 + }, + { + "epoch": 0.32980411993511366, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 37816 + }, + { + "epoch": 0.3298128412202822, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 37817 + }, + { + "epoch": 0.3298215625054508, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 37818 + }, + { + "epoch": 0.3298302837906194, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 37819 + }, + { + "epoch": 0.32983900507578795, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 37820 + }, + { + "epoch": 0.32984772636095655, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 37821 + }, + { + "epoch": 0.32985644764612515, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 37822 + }, + { + "epoch": 0.3298651689312937, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 37823 + }, + { + "epoch": 0.3298738902164623, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 37824 + }, + { + "epoch": 0.3298826115016309, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 37825 + }, + { + "epoch": 0.32989133278679944, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 37826 + }, + { + "epoch": 0.32990005407196804, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 37827 + }, + { + "epoch": 0.32990877535713664, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 37828 + }, + { + "epoch": 0.3299174966423052, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 37829 + }, + { + "epoch": 0.3299262179274738, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 37830 + }, + { + "epoch": 0.3299349392126424, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 37831 + }, + { + "epoch": 0.32994366049781093, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 37832 + }, + { + "epoch": 0.32995238178297953, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 37833 + }, + { + "epoch": 0.32996110306814813, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 37834 + }, + { + "epoch": 0.32996982435331673, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 37835 + }, + { + "epoch": 0.3299785456384853, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 37836 + }, + { + "epoch": 0.3299872669236539, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 37837 + }, + { + "epoch": 0.3299959882088225, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 37838 + }, + { + "epoch": 0.330004709493991, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 37839 + }, + { + "epoch": 0.3300134307791596, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 37840 + }, + { + "epoch": 0.3300221520643282, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 37841 + }, + { + "epoch": 0.33003087334949677, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 37842 + }, + { + "epoch": 0.33003959463466537, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 37843 + }, + { + "epoch": 0.33004831591983397, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 37844 + }, + { + "epoch": 0.3300570372050025, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 37845 + }, + { + "epoch": 0.3300657584901711, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 37846 + }, + { + "epoch": 0.3300744797753397, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 37847 + }, + { + "epoch": 0.33008320106050826, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 37848 + }, + { + "epoch": 0.33009192234567686, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 37849 + }, + { + "epoch": 0.33010064363084546, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 37850 + }, + { + "epoch": 0.330109364916014, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 37851 + }, + { + "epoch": 0.3301180862011826, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 37852 + }, + { + "epoch": 0.3301268074863512, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 37853 + }, + { + "epoch": 0.33013552877151975, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 37854 + }, + { + "epoch": 0.33014425005668835, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 37855 + }, + { + "epoch": 0.33015297134185695, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 37856 + }, + { + "epoch": 0.3301616926270255, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 37857 + }, + { + "epoch": 0.3301704139121941, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 37858 + }, + { + "epoch": 0.3301791351973627, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 37859 + }, + { + "epoch": 0.33018785648253124, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 37860 + }, + { + "epoch": 0.33019657776769984, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 37861 + }, + { + "epoch": 0.33020529905286844, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 37862 + }, + { + "epoch": 0.33021402033803704, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 37863 + }, + { + "epoch": 0.3302227416232056, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 37864 + }, + { + "epoch": 0.3302314629083742, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 37865 + }, + { + "epoch": 0.3302401841935428, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 37866 + }, + { + "epoch": 0.33024890547871133, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 37867 + }, + { + "epoch": 0.33025762676387993, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 37868 + }, + { + "epoch": 0.33026634804904853, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 37869 + }, + { + "epoch": 0.3302750693342171, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 37870 + }, + { + "epoch": 0.3302837906193857, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 37871 + }, + { + "epoch": 0.3302925119045543, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 37872 + }, + { + "epoch": 0.3303012331897228, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 37873 + }, + { + "epoch": 0.3303099544748914, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 37874 + }, + { + "epoch": 0.33031867576006, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 37875 + }, + { + "epoch": 0.33032739704522857, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 37876 + }, + { + "epoch": 0.33033611833039717, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 37877 + }, + { + "epoch": 0.33034483961556577, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 37878 + }, + { + "epoch": 0.3303535609007343, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 37879 + }, + { + "epoch": 0.3303622821859029, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 37880 + }, + { + "epoch": 0.3303710034710715, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 37881 + }, + { + "epoch": 0.33037972475624006, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 37882 + }, + { + "epoch": 0.33038844604140866, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 37883 + }, + { + "epoch": 0.33039716732657726, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 37884 + }, + { + "epoch": 0.3304058886117458, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 37885 + }, + { + "epoch": 0.3304146098969144, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 37886 + }, + { + "epoch": 0.330423331182083, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 37887 + }, + { + "epoch": 0.33043205246725155, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 37888 + }, + { + "epoch": 0.33044077375242015, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 37889 + }, + { + "epoch": 0.33044949503758875, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 37890 + }, + { + "epoch": 0.33045821632275735, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 37891 + }, + { + "epoch": 0.3304669376079259, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 37892 + }, + { + "epoch": 0.3304756588930945, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 37893 + }, + { + "epoch": 0.3304843801782631, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 37894 + }, + { + "epoch": 0.33049310146343164, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 37895 + }, + { + "epoch": 0.33050182274860024, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 37896 + }, + { + "epoch": 0.33051054403376884, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 37897 + }, + { + "epoch": 0.3305192653189374, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 37898 + }, + { + "epoch": 0.330527986604106, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 37899 + }, + { + "epoch": 0.3305367078892746, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 37900 + }, + { + "epoch": 0.33054542917444313, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 37901 + }, + { + "epoch": 0.33055415045961173, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 37902 + }, + { + "epoch": 0.33056287174478033, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 37903 + }, + { + "epoch": 0.3305715930299489, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 37904 + }, + { + "epoch": 0.3305803143151175, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 37905 + }, + { + "epoch": 0.3305890356002861, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 37906 + }, + { + "epoch": 0.3305977568854546, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 37907 + }, + { + "epoch": 0.3306064781706232, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 37908 + }, + { + "epoch": 0.3306151994557918, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 37909 + }, + { + "epoch": 0.33062392074096036, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 37910 + }, + { + "epoch": 0.33063264202612896, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 37911 + }, + { + "epoch": 0.33064136331129756, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 37912 + }, + { + "epoch": 0.3306500845964661, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 37913 + }, + { + "epoch": 0.3306588058816347, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 37914 + }, + { + "epoch": 0.3306675271668033, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 37915 + }, + { + "epoch": 0.3306762484519719, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 37916 + }, + { + "epoch": 0.33068496973714046, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 37917 + }, + { + "epoch": 0.33069369102230906, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 37918 + }, + { + "epoch": 0.33070241230747766, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 37919 + }, + { + "epoch": 0.3307111335926462, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 37920 + }, + { + "epoch": 0.3307198548778148, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 37921 + }, + { + "epoch": 0.3307285761629834, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 37922 + }, + { + "epoch": 0.33073729744815195, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 37923 + }, + { + "epoch": 0.33074601873332055, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 37924 + }, + { + "epoch": 0.33075474001848915, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 37925 + }, + { + "epoch": 0.3307634613036577, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 37926 + }, + { + "epoch": 0.3307721825888263, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 37927 + }, + { + "epoch": 0.3307809038739949, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 37928 + }, + { + "epoch": 0.33078962515916344, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 37929 + }, + { + "epoch": 0.33079834644433204, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 37930 + }, + { + "epoch": 0.33080706772950064, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9781, + "step": 37931 + }, + { + "epoch": 0.3308157890146692, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 37932 + }, + { + "epoch": 0.3308245102998378, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 37933 + }, + { + "epoch": 0.3308332315850064, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 37934 + }, + { + "epoch": 0.3308419528701749, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 37935 + }, + { + "epoch": 0.3308506741553435, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 37936 + }, + { + "epoch": 0.3308593954405121, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 37937 + }, + { + "epoch": 0.33086811672568067, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 37938 + }, + { + "epoch": 0.3308768380108493, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 37939 + }, + { + "epoch": 0.3308855592960179, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 37940 + }, + { + "epoch": 0.3308942805811864, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 37941 + }, + { + "epoch": 0.330903001866355, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 37942 + }, + { + "epoch": 0.3309117231515236, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 37943 + }, + { + "epoch": 0.3309204444366922, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 37944 + }, + { + "epoch": 0.33092916572186076, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 37945 + }, + { + "epoch": 0.33093788700702936, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9844, + "step": 37946 + }, + { + "epoch": 0.33094660829219796, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 37947 + }, + { + "epoch": 0.3309553295773665, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 37948 + }, + { + "epoch": 0.3309640508625351, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 37949 + }, + { + "epoch": 0.3309727721477037, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 37950 + }, + { + "epoch": 0.33098149343287225, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 37951 + }, + { + "epoch": 0.33099021471804085, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 37952 + }, + { + "epoch": 0.33099893600320945, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 37953 + }, + { + "epoch": 0.331007657288378, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 37954 + }, + { + "epoch": 0.3310163785735466, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 37955 + }, + { + "epoch": 0.3310250998587152, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 37956 + }, + { + "epoch": 0.33103382114388374, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9825, + "step": 37957 + }, + { + "epoch": 0.33104254242905234, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 37958 + }, + { + "epoch": 0.33105126371422094, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 37959 + }, + { + "epoch": 0.3310599849993895, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 37960 + }, + { + "epoch": 0.3310687062845581, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 37961 + }, + { + "epoch": 0.3310774275697267, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 37962 + }, + { + "epoch": 0.33108614885489523, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 37963 + }, + { + "epoch": 0.33109487014006383, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 37964 + }, + { + "epoch": 0.33110359142523244, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 37965 + }, + { + "epoch": 0.331112312710401, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 37966 + }, + { + "epoch": 0.3311210339955696, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 37967 + }, + { + "epoch": 0.3311297552807382, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 37968 + }, + { + "epoch": 0.3311384765659067, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9863, + "step": 37969 + }, + { + "epoch": 0.3311471978510753, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 37970 + }, + { + "epoch": 0.3311559191362439, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 37971 + }, + { + "epoch": 0.3311646404214125, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 37972 + }, + { + "epoch": 0.33117336170658107, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 37973 + }, + { + "epoch": 0.33118208299174967, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 37974 + }, + { + "epoch": 0.33119080427691827, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 37975 + }, + { + "epoch": 0.3311995255620868, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 37976 + }, + { + "epoch": 0.3312082468472554, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 37977 + }, + { + "epoch": 0.331216968132424, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 37978 + }, + { + "epoch": 0.33122568941759256, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 37979 + }, + { + "epoch": 0.33123441070276116, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 37980 + }, + { + "epoch": 0.33124313198792976, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 37981 + }, + { + "epoch": 0.3312518532730983, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 37982 + }, + { + "epoch": 0.3312605745582669, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 37983 + }, + { + "epoch": 0.3312692958434355, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 37984 + }, + { + "epoch": 0.33127801712860405, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 37985 + }, + { + "epoch": 0.33128673841377265, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 37986 + }, + { + "epoch": 0.33129545969894125, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 37987 + }, + { + "epoch": 0.3313041809841098, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 37988 + }, + { + "epoch": 0.3313129022692784, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 37989 + }, + { + "epoch": 0.331321623554447, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 37990 + }, + { + "epoch": 0.33133034483961554, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 37991 + }, + { + "epoch": 0.33133906612478414, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 37992 + }, + { + "epoch": 0.33134778740995274, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 37993 + }, + { + "epoch": 0.3313565086951213, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 37994 + }, + { + "epoch": 0.3313652299802899, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 37995 + }, + { + "epoch": 0.3313739512654585, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 37996 + }, + { + "epoch": 0.33138267255062703, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 37997 + }, + { + "epoch": 0.33139139383579563, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 37998 + }, + { + "epoch": 0.33140011512096423, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 37999 + }, + { + "epoch": 0.33140883640613283, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 38000 + }, + { + "epoch": 0.3314175576913014, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 38001 + }, + { + "epoch": 0.33142627897647, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 38002 + }, + { + "epoch": 0.3314350002616386, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 38003 + }, + { + "epoch": 0.3314437215468071, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 38004 + }, + { + "epoch": 0.3314524428319757, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 38005 + }, + { + "epoch": 0.3314611641171443, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 38006 + }, + { + "epoch": 0.33146988540231287, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 38007 + }, + { + "epoch": 0.33147860668748147, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 38008 + }, + { + "epoch": 0.33148732797265007, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 38009 + }, + { + "epoch": 0.3314960492578186, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 38010 + }, + { + "epoch": 0.3315047705429872, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 38011 + }, + { + "epoch": 0.3315134918281558, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 38012 + }, + { + "epoch": 0.33152221311332436, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 38013 + }, + { + "epoch": 0.33153093439849296, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 38014 + }, + { + "epoch": 0.33153965568366156, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9826, + "step": 38015 + }, + { + "epoch": 0.3315483769688301, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 38016 + }, + { + "epoch": 0.3315570982539987, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 38017 + }, + { + "epoch": 0.3315658195391673, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 38018 + }, + { + "epoch": 0.33157454082433585, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 38019 + }, + { + "epoch": 0.33158326210950445, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 38020 + }, + { + "epoch": 0.33159198339467305, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 38021 + }, + { + "epoch": 0.3316007046798416, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 38022 + }, + { + "epoch": 0.3316094259650102, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 38023 + }, + { + "epoch": 0.3316181472501788, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 38024 + }, + { + "epoch": 0.33162686853534734, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 38025 + }, + { + "epoch": 0.33163558982051594, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 38026 + }, + { + "epoch": 0.33164431110568454, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 38027 + }, + { + "epoch": 0.33165303239085314, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 38028 + }, + { + "epoch": 0.3316617536760217, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 38029 + }, + { + "epoch": 0.3316704749611903, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 38030 + }, + { + "epoch": 0.3316791962463589, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9801, + "step": 38031 + }, + { + "epoch": 0.33168791753152743, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 38032 + }, + { + "epoch": 0.33169663881669603, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 38033 + }, + { + "epoch": 0.33170536010186463, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 38034 + }, + { + "epoch": 0.3317140813870332, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 38035 + }, + { + "epoch": 0.3317228026722018, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 38036 + }, + { + "epoch": 0.3317315239573704, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 38037 + }, + { + "epoch": 0.3317402452425389, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 38038 + }, + { + "epoch": 0.3317489665277075, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 38039 + }, + { + "epoch": 0.3317576878128761, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 38040 + }, + { + "epoch": 0.33176640909804467, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 38041 + }, + { + "epoch": 0.33177513038321327, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 38042 + }, + { + "epoch": 0.33178385166838187, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 38043 + }, + { + "epoch": 0.3317925729535504, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 38044 + }, + { + "epoch": 0.331801294238719, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 38045 + }, + { + "epoch": 0.3318100155238876, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 38046 + }, + { + "epoch": 0.33181873680905616, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 38047 + }, + { + "epoch": 0.33182745809422476, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 38048 + }, + { + "epoch": 0.33183617937939336, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 38049 + }, + { + "epoch": 0.3318449006645619, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 38050 + }, + { + "epoch": 0.3318536219497305, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 38051 + }, + { + "epoch": 0.3318623432348991, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 38052 + }, + { + "epoch": 0.3318710645200677, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 38053 + }, + { + "epoch": 0.33187978580523625, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 38054 + }, + { + "epoch": 0.33188850709040485, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 38055 + }, + { + "epoch": 0.33189722837557345, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 38056 + }, + { + "epoch": 0.331905949660742, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 38057 + }, + { + "epoch": 0.3319146709459106, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 38058 + }, + { + "epoch": 0.3319233922310792, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 38059 + }, + { + "epoch": 0.33193211351624774, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 38060 + }, + { + "epoch": 0.33194083480141634, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 38061 + }, + { + "epoch": 0.33194955608658494, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 38062 + }, + { + "epoch": 0.3319582773717535, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 38063 + }, + { + "epoch": 0.3319669986569221, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 38064 + }, + { + "epoch": 0.3319757199420907, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 38065 + }, + { + "epoch": 0.33198444122725923, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 38066 + }, + { + "epoch": 0.33199316251242783, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 38067 + }, + { + "epoch": 0.33200188379759643, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 38068 + }, + { + "epoch": 0.332010605082765, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 38069 + }, + { + "epoch": 0.3320193263679336, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 38070 + }, + { + "epoch": 0.3320280476531022, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 38071 + }, + { + "epoch": 0.3320367689382707, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 38072 + }, + { + "epoch": 0.3320454902234393, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 38073 + }, + { + "epoch": 0.3320542115086079, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 0.9719, + "step": 38074 + }, + { + "epoch": 0.33206293279377647, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 38075 + }, + { + "epoch": 0.33207165407894507, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 38076 + }, + { + "epoch": 0.33208037536411367, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 38077 + }, + { + "epoch": 0.3320890966492822, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 38078 + }, + { + "epoch": 0.3320978179344508, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 38079 + }, + { + "epoch": 0.3321065392196194, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 38080 + }, + { + "epoch": 0.332115260504788, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 38081 + }, + { + "epoch": 0.33212398178995656, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 38082 + }, + { + "epoch": 0.33213270307512516, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 38083 + }, + { + "epoch": 0.33214142436029376, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 38084 + }, + { + "epoch": 0.3321501456454623, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 38085 + }, + { + "epoch": 0.3321588669306309, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 38086 + }, + { + "epoch": 0.3321675882157995, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 38087 + }, + { + "epoch": 0.33217630950096805, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 38088 + }, + { + "epoch": 0.33218503078613665, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 38089 + }, + { + "epoch": 0.33219375207130525, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 38090 + }, + { + "epoch": 0.3322024733564738, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 38091 + }, + { + "epoch": 0.3322111946416424, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 38092 + }, + { + "epoch": 0.332219915926811, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 38093 + }, + { + "epoch": 0.33222863721197954, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 38094 + }, + { + "epoch": 0.33223735849714814, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9767, + "step": 38095 + }, + { + "epoch": 0.33224607978231674, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 38096 + }, + { + "epoch": 0.3322548010674853, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 38097 + }, + { + "epoch": 0.3322635223526539, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 38098 + }, + { + "epoch": 0.3322722436378225, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 38099 + }, + { + "epoch": 0.33228096492299103, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 38100 + }, + { + "epoch": 0.33228968620815963, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 38101 + }, + { + "epoch": 0.33229840749332823, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 38102 + }, + { + "epoch": 0.3323071287784968, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 38103 + }, + { + "epoch": 0.3323158500636654, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 38104 + }, + { + "epoch": 0.332324571348834, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 38105 + }, + { + "epoch": 0.3323332926340025, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 38106 + }, + { + "epoch": 0.3323420139191711, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 38107 + }, + { + "epoch": 0.3323507352043397, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 38108 + }, + { + "epoch": 0.3323594564895083, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 38109 + }, + { + "epoch": 0.33236817777467687, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 38110 + }, + { + "epoch": 0.33237689905984547, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 38111 + }, + { + "epoch": 0.33238562034501407, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 38112 + }, + { + "epoch": 0.3323943416301826, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 38113 + }, + { + "epoch": 0.3324030629153512, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 38114 + }, + { + "epoch": 0.3324117842005198, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 38115 + }, + { + "epoch": 0.33242050548568836, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 38116 + }, + { + "epoch": 0.33242922677085696, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 38117 + }, + { + "epoch": 0.33243794805602556, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 38118 + }, + { + "epoch": 0.3324466693411941, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 38119 + }, + { + "epoch": 0.3324553906263627, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 38120 + }, + { + "epoch": 0.3324641119115313, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 38121 + }, + { + "epoch": 0.33247283319669985, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 38122 + }, + { + "epoch": 0.33248155448186845, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 38123 + }, + { + "epoch": 0.33249027576703705, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 38124 + }, + { + "epoch": 0.3324989970522056, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 38125 + }, + { + "epoch": 0.3325077183373742, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 38126 + }, + { + "epoch": 0.3325164396225428, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 38127 + }, + { + "epoch": 0.33252516090771134, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 38128 + }, + { + "epoch": 0.33253388219287994, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 38129 + }, + { + "epoch": 0.33254260347804854, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 38130 + }, + { + "epoch": 0.3325513247632171, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 38131 + }, + { + "epoch": 0.3325600460483857, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 38132 + }, + { + "epoch": 0.3325687673335543, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 38133 + }, + { + "epoch": 0.3325774886187228, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 38134 + }, + { + "epoch": 0.3325862099038914, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 38135 + }, + { + "epoch": 0.33259493118906003, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 38136 + }, + { + "epoch": 0.33260365247422863, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 38137 + }, + { + "epoch": 0.3326123737593972, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 38138 + }, + { + "epoch": 0.3326210950445658, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 38139 + }, + { + "epoch": 0.3326298163297344, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 38140 + }, + { + "epoch": 0.3326385376149029, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 38141 + }, + { + "epoch": 0.3326472589000715, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 38142 + }, + { + "epoch": 0.3326559801852401, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 38143 + }, + { + "epoch": 0.33266470147040866, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 38144 + }, + { + "epoch": 0.33267342275557726, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 38145 + }, + { + "epoch": 0.33268214404074586, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 38146 + }, + { + "epoch": 0.3326908653259144, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 38147 + }, + { + "epoch": 0.332699586611083, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 38148 + }, + { + "epoch": 0.3327083078962516, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 38149 + }, + { + "epoch": 0.33271702918142015, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 38150 + }, + { + "epoch": 0.33272575046658875, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 38151 + }, + { + "epoch": 0.33273447175175735, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 38152 + }, + { + "epoch": 0.3327431930369259, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 38153 + }, + { + "epoch": 0.3327519143220945, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 38154 + }, + { + "epoch": 0.3327606356072631, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 38155 + }, + { + "epoch": 0.33276935689243164, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 38156 + }, + { + "epoch": 0.33277807817760025, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 38157 + }, + { + "epoch": 0.33278679946276885, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 38158 + }, + { + "epoch": 0.3327955207479374, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 38159 + }, + { + "epoch": 0.332804242033106, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 38160 + }, + { + "epoch": 0.3328129633182746, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 38161 + }, + { + "epoch": 0.3328216846034432, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 38162 + }, + { + "epoch": 0.33283040588861174, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 38163 + }, + { + "epoch": 0.33283912717378034, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 38164 + }, + { + "epoch": 0.33284784845894894, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 38165 + }, + { + "epoch": 0.3328565697441175, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 38166 + }, + { + "epoch": 0.3328652910292861, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 38167 + }, + { + "epoch": 0.3328740123144547, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 38168 + }, + { + "epoch": 0.3328827335996232, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 38169 + }, + { + "epoch": 0.3328914548847918, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 38170 + }, + { + "epoch": 0.3329001761699604, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 38171 + }, + { + "epoch": 0.33290889745512897, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 38172 + }, + { + "epoch": 0.33291761874029757, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 38173 + }, + { + "epoch": 0.33292634002546617, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 38174 + }, + { + "epoch": 0.3329350613106347, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 38175 + }, + { + "epoch": 0.3329437825958033, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 38176 + }, + { + "epoch": 0.3329525038809719, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.986, + "step": 38177 + }, + { + "epoch": 0.33296122516614046, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 38178 + }, + { + "epoch": 0.33296994645130906, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 38179 + }, + { + "epoch": 0.33297866773647766, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 38180 + }, + { + "epoch": 0.3329873890216462, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 38181 + }, + { + "epoch": 0.3329961103068148, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 38182 + }, + { + "epoch": 0.3330048315919834, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 38183 + }, + { + "epoch": 0.33301355287715195, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 38184 + }, + { + "epoch": 0.33302227416232055, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 38185 + }, + { + "epoch": 0.33303099544748915, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 38186 + }, + { + "epoch": 0.3330397167326577, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 38187 + }, + { + "epoch": 0.3330484380178263, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 38188 + }, + { + "epoch": 0.3330571593029949, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 38189 + }, + { + "epoch": 0.3330658805881635, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 38190 + }, + { + "epoch": 0.33307460187333204, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 38191 + }, + { + "epoch": 0.33308332315850064, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 38192 + }, + { + "epoch": 0.33309204444366924, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 38193 + }, + { + "epoch": 0.3331007657288378, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 38194 + }, + { + "epoch": 0.3331094870140064, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 38195 + }, + { + "epoch": 0.333118208299175, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 38196 + }, + { + "epoch": 0.33312692958434353, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 38197 + }, + { + "epoch": 0.33313565086951213, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 38198 + }, + { + "epoch": 0.33314437215468073, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 38199 + }, + { + "epoch": 0.3331530934398493, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 38200 + }, + { + "epoch": 0.3331618147250179, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 38201 + }, + { + "epoch": 0.3331705360101865, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 38202 + }, + { + "epoch": 0.333179257295355, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 38203 + }, + { + "epoch": 0.3331879785805236, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 38204 + }, + { + "epoch": 0.3331966998656922, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 38205 + }, + { + "epoch": 0.33320542115086077, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 38206 + }, + { + "epoch": 0.33321414243602937, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 38207 + }, + { + "epoch": 0.33322286372119797, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 38208 + }, + { + "epoch": 0.3332315850063665, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 38209 + }, + { + "epoch": 0.3332403062915351, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 38210 + }, + { + "epoch": 0.3332490275767037, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.036, + "step": 38211 + }, + { + "epoch": 0.33325774886187226, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 38212 + }, + { + "epoch": 0.33326647014704086, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 38213 + }, + { + "epoch": 0.33327519143220946, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 38214 + }, + { + "epoch": 0.333283912717378, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 38215 + }, + { + "epoch": 0.3332926340025466, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 38216 + }, + { + "epoch": 0.3333013552877152, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 38217 + }, + { + "epoch": 0.3333100765728838, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 38218 + }, + { + "epoch": 0.33331879785805235, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 38219 + }, + { + "epoch": 0.33332751914322095, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 38220 + }, + { + "epoch": 0.33333624042838955, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 38221 + }, + { + "epoch": 0.3333449617135581, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 38222 + }, + { + "epoch": 0.3333536829987267, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 38223 + }, + { + "epoch": 0.3333624042838953, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 38224 + }, + { + "epoch": 0.33337112556906384, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 38225 + }, + { + "epoch": 0.33337984685423244, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 38226 + }, + { + "epoch": 0.33338856813940104, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 38227 + }, + { + "epoch": 0.3333972894245696, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 38228 + }, + { + "epoch": 0.3334060107097382, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 38229 + }, + { + "epoch": 0.3334147319949068, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 38230 + }, + { + "epoch": 0.33342345328007533, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 38231 + }, + { + "epoch": 0.33343217456524393, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 38232 + }, + { + "epoch": 0.33344089585041253, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 38233 + }, + { + "epoch": 0.3334496171355811, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9827, + "step": 38234 + }, + { + "epoch": 0.3334583384207497, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 38235 + }, + { + "epoch": 0.3334670597059183, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 38236 + }, + { + "epoch": 0.3334757809910868, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 38237 + }, + { + "epoch": 0.3334845022762554, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 38238 + }, + { + "epoch": 0.333493223561424, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 38239 + }, + { + "epoch": 0.33350194484659257, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 38240 + }, + { + "epoch": 0.33351066613176117, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 38241 + }, + { + "epoch": 0.33351938741692977, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 38242 + }, + { + "epoch": 0.3335281087020983, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 38243 + }, + { + "epoch": 0.3335368299872669, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 38244 + }, + { + "epoch": 0.3335455512724355, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 38245 + }, + { + "epoch": 0.3335542725576041, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 38246 + }, + { + "epoch": 0.33356299384277266, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 38247 + }, + { + "epoch": 0.33357171512794126, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 38248 + }, + { + "epoch": 0.33358043641310986, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 38249 + }, + { + "epoch": 0.3335891576982784, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 38250 + }, + { + "epoch": 0.333597878983447, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 38251 + }, + { + "epoch": 0.3336066002686156, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 38252 + }, + { + "epoch": 0.33361532155378415, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 38253 + }, + { + "epoch": 0.33362404283895275, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 38254 + }, + { + "epoch": 0.33363276412412135, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 38255 + }, + { + "epoch": 0.3336414854092899, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 38256 + }, + { + "epoch": 0.3336502066944585, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 38257 + }, + { + "epoch": 0.3336589279796271, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 38258 + }, + { + "epoch": 0.33366764926479564, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 38259 + }, + { + "epoch": 0.33367637054996424, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 38260 + }, + { + "epoch": 0.33368509183513284, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 38261 + }, + { + "epoch": 0.3336938131203014, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 38262 + }, + { + "epoch": 0.33370253440547, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 38263 + }, + { + "epoch": 0.3337112556906386, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 38264 + }, + { + "epoch": 0.33371997697580713, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 38265 + }, + { + "epoch": 0.33372869826097573, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 38266 + }, + { + "epoch": 0.33373741954614433, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 38267 + }, + { + "epoch": 0.3337461408313129, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 38268 + }, + { + "epoch": 0.3337548621164815, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 38269 + }, + { + "epoch": 0.3337635834016501, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 38270 + }, + { + "epoch": 0.3337723046868187, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 38271 + }, + { + "epoch": 0.3337810259719872, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 38272 + }, + { + "epoch": 0.3337897472571558, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 38273 + }, + { + "epoch": 0.3337984685423244, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 38274 + }, + { + "epoch": 0.33380718982749297, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 38275 + }, + { + "epoch": 0.33381591111266157, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 38276 + }, + { + "epoch": 0.33382463239783017, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 38277 + }, + { + "epoch": 0.3338333536829987, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9857, + "step": 38278 + }, + { + "epoch": 0.3338420749681673, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 38279 + }, + { + "epoch": 0.3338507962533359, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 38280 + }, + { + "epoch": 0.33385951753850446, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 38281 + }, + { + "epoch": 0.33386823882367306, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9812, + "step": 38282 + }, + { + "epoch": 0.33387696010884166, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 38283 + }, + { + "epoch": 0.3338856813940102, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 38284 + }, + { + "epoch": 0.3338944026791788, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 38285 + }, + { + "epoch": 0.3339031239643474, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 38286 + }, + { + "epoch": 0.33391184524951595, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 38287 + }, + { + "epoch": 0.33392056653468455, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 38288 + }, + { + "epoch": 0.33392928781985315, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 38289 + }, + { + "epoch": 0.3339380091050217, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 38290 + }, + { + "epoch": 0.3339467303901903, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 38291 + }, + { + "epoch": 0.3339554516753589, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 38292 + }, + { + "epoch": 0.33396417296052744, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 38293 + }, + { + "epoch": 0.33397289424569604, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 38294 + }, + { + "epoch": 0.33398161553086464, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 38295 + }, + { + "epoch": 0.3339903368160332, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 38296 + }, + { + "epoch": 0.3339990581012018, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 38297 + }, + { + "epoch": 0.3340077793863704, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 38298 + }, + { + "epoch": 0.334016500671539, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 38299 + }, + { + "epoch": 0.33402522195670753, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 38300 + }, + { + "epoch": 0.33403394324187613, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 38301 + }, + { + "epoch": 0.33404266452704473, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 38302 + }, + { + "epoch": 0.3340513858122133, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 38303 + }, + { + "epoch": 0.3340601070973819, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 38304 + }, + { + "epoch": 0.3340688283825505, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 38305 + }, + { + "epoch": 0.334077549667719, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 38306 + }, + { + "epoch": 0.3340862709528876, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 38307 + }, + { + "epoch": 0.3340949922380562, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 38308 + }, + { + "epoch": 0.33410371352322477, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 38309 + }, + { + "epoch": 0.33411243480839337, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 38310 + }, + { + "epoch": 0.33412115609356197, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 38311 + }, + { + "epoch": 0.3341298773787305, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 38312 + }, + { + "epoch": 0.3341385986638991, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 38313 + }, + { + "epoch": 0.3341473199490677, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 38314 + }, + { + "epoch": 0.33415604123423626, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 38315 + }, + { + "epoch": 0.33416476251940486, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 38316 + }, + { + "epoch": 0.33417348380457346, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 38317 + }, + { + "epoch": 0.334182205089742, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 38318 + }, + { + "epoch": 0.3341909263749106, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 38319 + }, + { + "epoch": 0.3341996476600792, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 38320 + }, + { + "epoch": 0.33420836894524775, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 38321 + }, + { + "epoch": 0.33421709023041635, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 38322 + }, + { + "epoch": 0.33422581151558495, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 38323 + }, + { + "epoch": 0.3342345328007535, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 38324 + }, + { + "epoch": 0.3342432540859221, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 38325 + }, + { + "epoch": 0.3342519753710907, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 38326 + }, + { + "epoch": 0.3342606966562593, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 38327 + }, + { + "epoch": 0.33426941794142784, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 38328 + }, + { + "epoch": 0.33427813922659644, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 38329 + }, + { + "epoch": 0.33428686051176504, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 38330 + }, + { + "epoch": 0.3342955817969336, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 38331 + }, + { + "epoch": 0.3343043030821022, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 38332 + }, + { + "epoch": 0.3343130243672708, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 38333 + }, + { + "epoch": 0.33432174565243933, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 38334 + }, + { + "epoch": 0.33433046693760793, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 38335 + }, + { + "epoch": 0.33433918822277653, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 38336 + }, + { + "epoch": 0.3343479095079451, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 38337 + }, + { + "epoch": 0.3343566307931137, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 38338 + }, + { + "epoch": 0.3343653520782823, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 38339 + }, + { + "epoch": 0.3343740733634508, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 38340 + }, + { + "epoch": 0.3343827946486194, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 38341 + }, + { + "epoch": 0.334391515933788, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 38342 + }, + { + "epoch": 0.33440023721895656, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 38343 + }, + { + "epoch": 0.33440895850412516, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 38344 + }, + { + "epoch": 0.33441767978929376, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 38345 + }, + { + "epoch": 0.3344264010744623, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 38346 + }, + { + "epoch": 0.3344351223596309, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 38347 + }, + { + "epoch": 0.3344438436447995, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 38348 + }, + { + "epoch": 0.33445256492996805, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 38349 + }, + { + "epoch": 0.33446128621513666, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 38350 + }, + { + "epoch": 0.33447000750030526, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 38351 + }, + { + "epoch": 0.3344787287854738, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 38352 + }, + { + "epoch": 0.3344874500706424, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 38353 + }, + { + "epoch": 0.334496171355811, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 38354 + }, + { + "epoch": 0.3345048926409796, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 38355 + }, + { + "epoch": 0.33451361392614815, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 38356 + }, + { + "epoch": 0.33452233521131675, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 38357 + }, + { + "epoch": 0.33453105649648535, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 38358 + }, + { + "epoch": 0.3345397777816539, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 38359 + }, + { + "epoch": 0.3345484990668225, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 38360 + }, + { + "epoch": 0.3345572203519911, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 38361 + }, + { + "epoch": 0.33456594163715964, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 38362 + }, + { + "epoch": 0.33457466292232824, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 38363 + }, + { + "epoch": 0.33458338420749684, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 38364 + }, + { + "epoch": 0.3345921054926654, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 38365 + }, + { + "epoch": 0.334600826777834, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 38366 + }, + { + "epoch": 0.3346095480630026, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 38367 + }, + { + "epoch": 0.3346182693481711, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 38368 + }, + { + "epoch": 0.3346269906333397, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 38369 + }, + { + "epoch": 0.3346357119185083, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 38370 + }, + { + "epoch": 0.33464443320367687, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9819, + "step": 38371 + }, + { + "epoch": 0.3346531544888455, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 38372 + }, + { + "epoch": 0.3346618757740141, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 38373 + }, + { + "epoch": 0.3346705970591826, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 38374 + }, + { + "epoch": 0.3346793183443512, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 38375 + }, + { + "epoch": 0.3346880396295198, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 38376 + }, + { + "epoch": 0.33469676091468836, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 38377 + }, + { + "epoch": 0.33470548219985696, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 38378 + }, + { + "epoch": 0.33471420348502556, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 38379 + }, + { + "epoch": 0.33472292477019416, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 38380 + }, + { + "epoch": 0.3347316460553627, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 38381 + }, + { + "epoch": 0.3347403673405313, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 38382 + }, + { + "epoch": 0.3347490886256999, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 38383 + }, + { + "epoch": 0.33475780991086845, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 38384 + }, + { + "epoch": 0.33476653119603705, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 38385 + }, + { + "epoch": 0.33477525248120565, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 38386 + }, + { + "epoch": 0.3347839737663742, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 38387 + }, + { + "epoch": 0.3347926950515428, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 38388 + }, + { + "epoch": 0.3348014163367114, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 38389 + }, + { + "epoch": 0.33481013762187994, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 38390 + }, + { + "epoch": 0.33481885890704854, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 38391 + }, + { + "epoch": 0.33482758019221714, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 38392 + }, + { + "epoch": 0.3348363014773857, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 38393 + }, + { + "epoch": 0.3348450227625543, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 38394 + }, + { + "epoch": 0.3348537440477229, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 38395 + }, + { + "epoch": 0.33486246533289143, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 38396 + }, + { + "epoch": 0.33487118661806003, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 38397 + }, + { + "epoch": 0.33487990790322864, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 38398 + }, + { + "epoch": 0.3348886291883972, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 38399 + }, + { + "epoch": 0.3348973504735658, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 38400 + }, + { + "epoch": 0.3349060717587344, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 38401 + }, + { + "epoch": 0.3349147930439029, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 38402 + }, + { + "epoch": 0.3349235143290715, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 38403 + }, + { + "epoch": 0.3349322356142401, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 38404 + }, + { + "epoch": 0.33494095689940867, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 38405 + }, + { + "epoch": 0.33494967818457727, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 38406 + }, + { + "epoch": 0.33495839946974587, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 38407 + }, + { + "epoch": 0.33496712075491447, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 38408 + }, + { + "epoch": 0.334975842040083, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 38409 + }, + { + "epoch": 0.3349845633252516, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 38410 + }, + { + "epoch": 0.3349932846104202, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 38411 + }, + { + "epoch": 0.33500200589558876, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 38412 + }, + { + "epoch": 0.33501072718075736, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 38413 + }, + { + "epoch": 0.33501944846592596, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 38414 + }, + { + "epoch": 0.3350281697510945, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 38415 + }, + { + "epoch": 0.3350368910362631, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 38416 + }, + { + "epoch": 0.3350456123214317, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 38417 + }, + { + "epoch": 0.33505433360660025, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 38418 + }, + { + "epoch": 0.33506305489176885, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 38419 + }, + { + "epoch": 0.33507177617693745, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 38420 + }, + { + "epoch": 0.335080497462106, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 38421 + }, + { + "epoch": 0.3350892187472746, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 38422 + }, + { + "epoch": 0.3350979400324432, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 38423 + }, + { + "epoch": 0.33510666131761174, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 38424 + }, + { + "epoch": 0.33511538260278034, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 38425 + }, + { + "epoch": 0.33512410388794894, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 38426 + }, + { + "epoch": 0.3351328251731175, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 38427 + }, + { + "epoch": 0.3351415464582861, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 38428 + }, + { + "epoch": 0.3351502677434547, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 38429 + }, + { + "epoch": 0.33515898902862323, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 38430 + }, + { + "epoch": 0.33516771031379183, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 38431 + }, + { + "epoch": 0.33517643159896043, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 38432 + }, + { + "epoch": 0.335185152884129, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 38433 + }, + { + "epoch": 0.3351938741692976, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 38434 + }, + { + "epoch": 0.3352025954544662, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 38435 + }, + { + "epoch": 0.3352113167396348, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 38436 + }, + { + "epoch": 0.3352200380248033, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 38437 + }, + { + "epoch": 0.3352287593099719, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 38438 + }, + { + "epoch": 0.3352374805951405, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 38439 + }, + { + "epoch": 0.33524620188030907, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 38440 + }, + { + "epoch": 0.33525492316547767, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 38441 + }, + { + "epoch": 0.33526364445064627, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 38442 + }, + { + "epoch": 0.3352723657358148, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 38443 + }, + { + "epoch": 0.3352810870209834, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 38444 + }, + { + "epoch": 0.335289808306152, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 38445 + }, + { + "epoch": 0.33529852959132056, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 38446 + }, + { + "epoch": 0.33530725087648916, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 38447 + }, + { + "epoch": 0.33531597216165776, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 38448 + }, + { + "epoch": 0.3353246934468263, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 38449 + }, + { + "epoch": 0.3353334147319949, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 38450 + }, + { + "epoch": 0.3353421360171635, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 38451 + }, + { + "epoch": 0.33535085730233205, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 38452 + }, + { + "epoch": 0.33535957858750065, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 38453 + }, + { + "epoch": 0.33536829987266925, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 38454 + }, + { + "epoch": 0.3353770211578378, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 38455 + }, + { + "epoch": 0.3353857424430064, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 38456 + }, + { + "epoch": 0.335394463728175, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 38457 + }, + { + "epoch": 0.33540318501334354, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 38458 + }, + { + "epoch": 0.33541190629851214, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 38459 + }, + { + "epoch": 0.33542062758368074, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 38460 + }, + { + "epoch": 0.3354293488688493, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 38461 + }, + { + "epoch": 0.3354380701540179, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 38462 + }, + { + "epoch": 0.3354467914391865, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 38463 + }, + { + "epoch": 0.3354555127243551, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 38464 + }, + { + "epoch": 0.33546423400952363, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 38465 + }, + { + "epoch": 0.33547295529469223, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 38466 + }, + { + "epoch": 0.33548167657986083, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 38467 + }, + { + "epoch": 0.3354903978650294, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 38468 + }, + { + "epoch": 0.335499119150198, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 38469 + }, + { + "epoch": 0.3355078404353666, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 38470 + }, + { + "epoch": 0.3355165617205351, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 38471 + }, + { + "epoch": 0.3355252830057037, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 38472 + }, + { + "epoch": 0.3355340042908723, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 38473 + }, + { + "epoch": 0.33554272557604087, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 38474 + }, + { + "epoch": 0.33555144686120947, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 38475 + }, + { + "epoch": 0.33556016814637807, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 38476 + }, + { + "epoch": 0.3355688894315466, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 38477 + }, + { + "epoch": 0.3355776107167152, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 38478 + }, + { + "epoch": 0.3355863320018838, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 38479 + }, + { + "epoch": 0.33559505328705236, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 38480 + }, + { + "epoch": 0.33560377457222096, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 38481 + }, + { + "epoch": 0.33561249585738956, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 38482 + }, + { + "epoch": 0.3356212171425581, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 38483 + }, + { + "epoch": 0.3356299384277267, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 38484 + }, + { + "epoch": 0.3356386597128953, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 38485 + }, + { + "epoch": 0.33564738099806385, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 38486 + }, + { + "epoch": 0.33565610228323245, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 38487 + }, + { + "epoch": 0.33566482356840105, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 38488 + }, + { + "epoch": 0.33567354485356965, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 38489 + }, + { + "epoch": 0.3356822661387382, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 38490 + }, + { + "epoch": 0.3356909874239068, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 38491 + }, + { + "epoch": 0.3356997087090754, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9864, + "step": 38492 + }, + { + "epoch": 0.33570842999424394, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 38493 + }, + { + "epoch": 0.33571715127941254, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 38494 + }, + { + "epoch": 0.33572587256458114, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 38495 + }, + { + "epoch": 0.3357345938497497, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 38496 + }, + { + "epoch": 0.3357433151349183, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 38497 + }, + { + "epoch": 0.3357520364200869, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 38498 + }, + { + "epoch": 0.33576075770525543, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 38499 + }, + { + "epoch": 0.33576947899042403, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 38500 + }, + { + "epoch": 0.33577820027559263, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 38501 + }, + { + "epoch": 0.3357869215607612, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 38502 + }, + { + "epoch": 0.3357956428459298, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 38503 + }, + { + "epoch": 0.3358043641310984, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 38504 + }, + { + "epoch": 0.3358130854162669, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 38505 + }, + { + "epoch": 0.3358218067014355, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 38506 + }, + { + "epoch": 0.3358305279866041, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 38507 + }, + { + "epoch": 0.33583924927177267, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 38508 + }, + { + "epoch": 0.33584797055694127, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 38509 + }, + { + "epoch": 0.33585669184210987, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 38510 + }, + { + "epoch": 0.3358654131272784, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 38511 + }, + { + "epoch": 0.335874134412447, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 38512 + }, + { + "epoch": 0.3358828556976156, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 38513 + }, + { + "epoch": 0.33589157698278416, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 38514 + }, + { + "epoch": 0.33590029826795276, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 38515 + }, + { + "epoch": 0.33590901955312136, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 38516 + }, + { + "epoch": 0.33591774083828996, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 38517 + }, + { + "epoch": 0.3359264621234585, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 38518 + }, + { + "epoch": 0.3359351834086271, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 38519 + }, + { + "epoch": 0.3359439046937957, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 38520 + }, + { + "epoch": 0.33595262597896425, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 38521 + }, + { + "epoch": 0.33596134726413285, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 38522 + }, + { + "epoch": 0.33597006854930145, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 38523 + }, + { + "epoch": 0.33597878983447, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 38524 + }, + { + "epoch": 0.3359875111196386, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 38525 + }, + { + "epoch": 0.3359962324048072, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 38526 + }, + { + "epoch": 0.33600495368997574, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 38527 + }, + { + "epoch": 0.33601367497514434, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 38528 + }, + { + "epoch": 0.33602239626031294, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 38529 + }, + { + "epoch": 0.3360311175454815, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 38530 + }, + { + "epoch": 0.3360398388306501, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 38531 + }, + { + "epoch": 0.3360485601158187, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 38532 + }, + { + "epoch": 0.33605728140098723, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 38533 + }, + { + "epoch": 0.33606600268615583, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 38534 + }, + { + "epoch": 0.33607472397132443, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9773, + "step": 38535 + }, + { + "epoch": 0.336083445256493, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 38536 + }, + { + "epoch": 0.3360921665416616, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 38537 + }, + { + "epoch": 0.3361008878268302, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 38538 + }, + { + "epoch": 0.3361096091119987, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 38539 + }, + { + "epoch": 0.3361183303971673, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 38540 + }, + { + "epoch": 0.3361270516823359, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 38541 + }, + { + "epoch": 0.33613577296750446, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 38542 + }, + { + "epoch": 0.33614449425267307, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 38543 + }, + { + "epoch": 0.33615321553784167, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 38544 + }, + { + "epoch": 0.33616193682301027, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 38545 + }, + { + "epoch": 0.3361706581081788, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 38546 + }, + { + "epoch": 0.3361793793933474, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 38547 + }, + { + "epoch": 0.336188100678516, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 38548 + }, + { + "epoch": 0.33619682196368456, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 38549 + }, + { + "epoch": 0.33620554324885316, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 38550 + }, + { + "epoch": 0.33621426453402176, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 38551 + }, + { + "epoch": 0.3362229858191903, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 38552 + }, + { + "epoch": 0.3362317071043589, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 38553 + }, + { + "epoch": 0.3362404283895275, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 38554 + }, + { + "epoch": 0.33624914967469605, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 38555 + }, + { + "epoch": 0.33625787095986465, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 38556 + }, + { + "epoch": 0.33626659224503325, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 38557 + }, + { + "epoch": 0.3362753135302018, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 38558 + }, + { + "epoch": 0.3362840348153704, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 38559 + }, + { + "epoch": 0.336292756100539, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 38560 + }, + { + "epoch": 0.33630147738570754, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 38561 + }, + { + "epoch": 0.33631019867087614, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 38562 + }, + { + "epoch": 0.33631891995604474, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 38563 + }, + { + "epoch": 0.3363276412412133, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 38564 + }, + { + "epoch": 0.3363363625263819, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 38565 + }, + { + "epoch": 0.3363450838115505, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 38566 + }, + { + "epoch": 0.336353805096719, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 38567 + }, + { + "epoch": 0.3363625263818876, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 38568 + }, + { + "epoch": 0.33637124766705623, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 38569 + }, + { + "epoch": 0.3363799689522248, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 38570 + }, + { + "epoch": 0.3363886902373934, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 38571 + }, + { + "epoch": 0.336397411522562, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 38572 + }, + { + "epoch": 0.3364061328077306, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 38573 + }, + { + "epoch": 0.3364148540928991, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 38574 + }, + { + "epoch": 0.3364235753780677, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 38575 + }, + { + "epoch": 0.3364322966632363, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 38576 + }, + { + "epoch": 0.33644101794840486, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 38577 + }, + { + "epoch": 0.33644973923357346, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 38578 + }, + { + "epoch": 0.33645846051874206, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 38579 + }, + { + "epoch": 0.3364671818039106, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 38580 + }, + { + "epoch": 0.3364759030890792, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 38581 + }, + { + "epoch": 0.3364846243742478, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 38582 + }, + { + "epoch": 0.33649334565941635, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 38583 + }, + { + "epoch": 0.33650206694458495, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 38584 + }, + { + "epoch": 0.33651078822975355, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 38585 + }, + { + "epoch": 0.3365195095149221, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 38586 + }, + { + "epoch": 0.3365282308000907, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 38587 + }, + { + "epoch": 0.3365369520852593, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 38588 + }, + { + "epoch": 0.33654567337042784, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 38589 + }, + { + "epoch": 0.33655439465559644, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 38590 + }, + { + "epoch": 0.33656311594076505, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 38591 + }, + { + "epoch": 0.3365718372259336, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 38592 + }, + { + "epoch": 0.3365805585111022, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 38593 + }, + { + "epoch": 0.3365892797962708, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 38594 + }, + { + "epoch": 0.33659800108143934, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 38595 + }, + { + "epoch": 0.33660672236660794, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 38596 + }, + { + "epoch": 0.33661544365177654, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 38597 + }, + { + "epoch": 0.3366241649369451, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 38598 + }, + { + "epoch": 0.3366328862221137, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 38599 + }, + { + "epoch": 0.3366416075072823, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 38600 + }, + { + "epoch": 0.3366503287924509, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 38601 + }, + { + "epoch": 0.3366590500776194, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 38602 + }, + { + "epoch": 0.336667771362788, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 38603 + }, + { + "epoch": 0.3366764926479566, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 38604 + }, + { + "epoch": 0.33668521393312517, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 38605 + }, + { + "epoch": 0.33669393521829377, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 38606 + }, + { + "epoch": 0.33670265650346237, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 38607 + }, + { + "epoch": 0.3367113777886309, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 38608 + }, + { + "epoch": 0.3367200990737995, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 38609 + }, + { + "epoch": 0.3367288203589681, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 38610 + }, + { + "epoch": 0.33673754164413666, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 38611 + }, + { + "epoch": 0.33674626292930526, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 38612 + }, + { + "epoch": 0.33675498421447386, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 38613 + }, + { + "epoch": 0.3367637054996424, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 38614 + }, + { + "epoch": 0.336772426784811, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 38615 + }, + { + "epoch": 0.3367811480699796, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 38616 + }, + { + "epoch": 0.33678986935514815, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 38617 + }, + { + "epoch": 0.33679859064031675, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 38618 + }, + { + "epoch": 0.33680731192548535, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 38619 + }, + { + "epoch": 0.3368160332106539, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 38620 + }, + { + "epoch": 0.3368247544958225, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9817, + "step": 38621 + }, + { + "epoch": 0.3368334757809911, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 38622 + }, + { + "epoch": 0.33684219706615964, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 38623 + }, + { + "epoch": 0.33685091835132824, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 38624 + }, + { + "epoch": 0.33685963963649684, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 38625 + }, + { + "epoch": 0.33686836092166544, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 38626 + }, + { + "epoch": 0.336877082206834, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 38627 + }, + { + "epoch": 0.3368858034920026, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 38628 + }, + { + "epoch": 0.3368945247771712, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 38629 + }, + { + "epoch": 0.33690324606233973, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 38630 + }, + { + "epoch": 0.33691196734750833, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 38631 + }, + { + "epoch": 0.33692068863267693, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 38632 + }, + { + "epoch": 0.3369294099178455, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 38633 + }, + { + "epoch": 0.3369381312030141, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 38634 + }, + { + "epoch": 0.3369468524881827, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 38635 + }, + { + "epoch": 0.3369555737733512, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 38636 + }, + { + "epoch": 0.3369642950585198, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 38637 + }, + { + "epoch": 0.3369730163436884, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 38638 + }, + { + "epoch": 0.33698173762885697, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 38639 + }, + { + "epoch": 0.33699045891402557, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 38640 + }, + { + "epoch": 0.33699918019919417, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 38641 + }, + { + "epoch": 0.3370079014843627, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 38642 + }, + { + "epoch": 0.3370166227695313, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 38643 + }, + { + "epoch": 0.3370253440546999, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 38644 + }, + { + "epoch": 0.33703406533986846, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 38645 + }, + { + "epoch": 0.33704278662503706, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 38646 + }, + { + "epoch": 0.33705150791020566, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 38647 + }, + { + "epoch": 0.3370602291953742, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 38648 + }, + { + "epoch": 0.3370689504805428, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 38649 + }, + { + "epoch": 0.3370776717657114, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 38650 + }, + { + "epoch": 0.33708639305087995, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 38651 + }, + { + "epoch": 0.33709511433604855, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 38652 + }, + { + "epoch": 0.33710383562121715, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 38653 + }, + { + "epoch": 0.33711255690638575, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 38654 + }, + { + "epoch": 0.3371212781915543, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 38655 + }, + { + "epoch": 0.3371299994767229, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 38656 + }, + { + "epoch": 0.3371387207618915, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 38657 + }, + { + "epoch": 0.33714744204706004, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 38658 + }, + { + "epoch": 0.33715616333222864, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 38659 + }, + { + "epoch": 0.33716488461739724, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 38660 + }, + { + "epoch": 0.3371736059025658, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 38661 + }, + { + "epoch": 0.3371823271877344, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 38662 + }, + { + "epoch": 0.337191048472903, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 38663 + }, + { + "epoch": 0.33719976975807153, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9846, + "step": 38664 + }, + { + "epoch": 0.33720849104324013, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 38665 + }, + { + "epoch": 0.33721721232840873, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 38666 + }, + { + "epoch": 0.3372259336135773, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 38667 + }, + { + "epoch": 0.3372346548987459, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 38668 + }, + { + "epoch": 0.3372433761839145, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.981, + "step": 38669 + }, + { + "epoch": 0.337252097469083, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 38670 + }, + { + "epoch": 0.3372608187542516, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 38671 + }, + { + "epoch": 0.3372695400394202, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 38672 + }, + { + "epoch": 0.33727826132458877, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 38673 + }, + { + "epoch": 0.33728698260975737, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 38674 + }, + { + "epoch": 0.33729570389492597, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 38675 + }, + { + "epoch": 0.3373044251800945, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 38676 + }, + { + "epoch": 0.3373131464652631, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 38677 + }, + { + "epoch": 0.3373218677504317, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 38678 + }, + { + "epoch": 0.33733058903560026, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 38679 + }, + { + "epoch": 0.33733931032076886, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 38680 + }, + { + "epoch": 0.33734803160593746, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 38681 + }, + { + "epoch": 0.33735675289110606, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 38682 + }, + { + "epoch": 0.3373654741762746, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 38683 + }, + { + "epoch": 0.3373741954614432, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 38684 + }, + { + "epoch": 0.3373829167466118, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 38685 + }, + { + "epoch": 0.33739163803178035, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 38686 + }, + { + "epoch": 0.33740035931694895, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 38687 + }, + { + "epoch": 0.33740908060211755, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 38688 + }, + { + "epoch": 0.3374178018872861, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 38689 + }, + { + "epoch": 0.3374265231724547, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 38690 + }, + { + "epoch": 0.3374352444576233, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 38691 + }, + { + "epoch": 0.33744396574279184, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 38692 + }, + { + "epoch": 0.33745268702796044, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 38693 + }, + { + "epoch": 0.33746140831312904, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 38694 + }, + { + "epoch": 0.3374701295982976, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 38695 + }, + { + "epoch": 0.3374788508834662, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 38696 + }, + { + "epoch": 0.3374875721686348, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 38697 + }, + { + "epoch": 0.33749629345380333, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 38698 + }, + { + "epoch": 0.33750501473897193, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 38699 + }, + { + "epoch": 0.33751373602414053, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 38700 + }, + { + "epoch": 0.3375224573093091, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 38701 + }, + { + "epoch": 0.3375311785944777, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 38702 + }, + { + "epoch": 0.3375398998796463, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 38703 + }, + { + "epoch": 0.3375486211648148, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 38704 + }, + { + "epoch": 0.3375573424499834, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 38705 + }, + { + "epoch": 0.337566063735152, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 38706 + }, + { + "epoch": 0.33757478502032057, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 38707 + }, + { + "epoch": 0.33758350630548917, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 38708 + }, + { + "epoch": 0.33759222759065777, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 38709 + }, + { + "epoch": 0.33760094887582637, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 38710 + }, + { + "epoch": 0.3376096701609949, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 38711 + }, + { + "epoch": 0.3376183914461635, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 38712 + }, + { + "epoch": 0.3376271127313321, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 38713 + }, + { + "epoch": 0.33763583401650066, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 38714 + }, + { + "epoch": 0.33764455530166926, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 38715 + }, + { + "epoch": 0.33765327658683786, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 38716 + }, + { + "epoch": 0.3376619978720064, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 38717 + }, + { + "epoch": 0.337670719157175, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 38718 + }, + { + "epoch": 0.3376794404423436, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 38719 + }, + { + "epoch": 0.33768816172751215, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 38720 + }, + { + "epoch": 0.33769688301268075, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 38721 + }, + { + "epoch": 0.33770560429784935, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 38722 + }, + { + "epoch": 0.3377143255830179, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 38723 + }, + { + "epoch": 0.3377230468681865, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 38724 + }, + { + "epoch": 0.3377317681533551, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 38725 + }, + { + "epoch": 0.33774048943852364, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 38726 + }, + { + "epoch": 0.33774921072369224, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 38727 + }, + { + "epoch": 0.33775793200886084, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 38728 + }, + { + "epoch": 0.3377666532940294, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 38729 + }, + { + "epoch": 0.337775374579198, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 38730 + }, + { + "epoch": 0.3377840958643666, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 38731 + }, + { + "epoch": 0.33779281714953513, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 38732 + }, + { + "epoch": 0.33780153843470373, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 38733 + }, + { + "epoch": 0.33781025971987233, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 38734 + }, + { + "epoch": 0.33781898100504093, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 38735 + }, + { + "epoch": 0.3378277022902095, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 38736 + }, + { + "epoch": 0.3378364235753781, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 38737 + }, + { + "epoch": 0.3378451448605467, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 38738 + }, + { + "epoch": 0.3378538661457152, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 38739 + }, + { + "epoch": 0.3378625874308838, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 38740 + }, + { + "epoch": 0.3378713087160524, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 38741 + }, + { + "epoch": 0.33788003000122097, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 38742 + }, + { + "epoch": 0.33788875128638957, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 38743 + }, + { + "epoch": 0.33789747257155817, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 38744 + }, + { + "epoch": 0.3379061938567267, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 38745 + }, + { + "epoch": 0.3379149151418953, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 38746 + }, + { + "epoch": 0.3379236364270639, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 38747 + }, + { + "epoch": 0.33793235771223246, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 38748 + }, + { + "epoch": 0.33794107899740106, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 38749 + }, + { + "epoch": 0.33794980028256966, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 38750 + }, + { + "epoch": 0.3379585215677382, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 38751 + }, + { + "epoch": 0.3379672428529068, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 38752 + }, + { + "epoch": 0.3379759641380754, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 38753 + }, + { + "epoch": 0.33798468542324395, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 38754 + }, + { + "epoch": 0.33799340670841255, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 38755 + }, + { + "epoch": 0.33800212799358115, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 38756 + }, + { + "epoch": 0.3380108492787497, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 38757 + }, + { + "epoch": 0.3380195705639183, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 38758 + }, + { + "epoch": 0.3380282918490869, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 38759 + }, + { + "epoch": 0.33803701313425544, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 38760 + }, + { + "epoch": 0.33804573441942404, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 38761 + }, + { + "epoch": 0.33805445570459264, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 0.9842, + "step": 38762 + }, + { + "epoch": 0.33806317698976124, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 38763 + }, + { + "epoch": 0.3380718982749298, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 38764 + }, + { + "epoch": 0.3380806195600984, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 38765 + }, + { + "epoch": 0.338089340845267, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 38766 + }, + { + "epoch": 0.33809806213043553, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 38767 + }, + { + "epoch": 0.33810678341560413, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 38768 + }, + { + "epoch": 0.33811550470077273, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 38769 + }, + { + "epoch": 0.3381242259859413, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 38770 + }, + { + "epoch": 0.3381329472711099, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 38771 + }, + { + "epoch": 0.3381416685562785, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 38772 + }, + { + "epoch": 0.338150389841447, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 38773 + }, + { + "epoch": 0.3381591111266156, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 38774 + }, + { + "epoch": 0.3381678324117842, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 38775 + }, + { + "epoch": 0.33817655369695276, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 38776 + }, + { + "epoch": 0.33818527498212136, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 38777 + }, + { + "epoch": 0.33819399626728996, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 38778 + }, + { + "epoch": 0.3382027175524585, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 38779 + }, + { + "epoch": 0.3382114388376271, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 38780 + }, + { + "epoch": 0.3382201601227957, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 38781 + }, + { + "epoch": 0.33822888140796425, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 38782 + }, + { + "epoch": 0.33823760269313285, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 38783 + }, + { + "epoch": 0.33824632397830146, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 38784 + }, + { + "epoch": 0.33825504526347, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 38785 + }, + { + "epoch": 0.3382637665486386, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 38786 + }, + { + "epoch": 0.3382724878338072, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 38787 + }, + { + "epoch": 0.33828120911897575, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 38788 + }, + { + "epoch": 0.33828993040414435, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 38789 + }, + { + "epoch": 0.33829865168931295, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 38790 + }, + { + "epoch": 0.33830737297448155, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 38791 + }, + { + "epoch": 0.3383160942596501, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 38792 + }, + { + "epoch": 0.3383248155448187, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 38793 + }, + { + "epoch": 0.3383335368299873, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 38794 + }, + { + "epoch": 0.33834225811515584, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 38795 + }, + { + "epoch": 0.33835097940032444, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 38796 + }, + { + "epoch": 0.33835970068549304, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 38797 + }, + { + "epoch": 0.3383684219706616, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 38798 + }, + { + "epoch": 0.3383771432558302, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 38799 + }, + { + "epoch": 0.3383858645409988, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 38800 + }, + { + "epoch": 0.3383945858261673, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 38801 + }, + { + "epoch": 0.3384033071113359, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 38802 + }, + { + "epoch": 0.3384120283965045, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 38803 + }, + { + "epoch": 0.33842074968167307, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 38804 + }, + { + "epoch": 0.3384294709668417, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 38805 + }, + { + "epoch": 0.3384381922520103, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 38806 + }, + { + "epoch": 0.3384469135371788, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 38807 + }, + { + "epoch": 0.3384556348223474, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 38808 + }, + { + "epoch": 0.338464356107516, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 38809 + }, + { + "epoch": 0.33847307739268456, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 38810 + }, + { + "epoch": 0.33848179867785316, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 38811 + }, + { + "epoch": 0.33849051996302176, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 38812 + }, + { + "epoch": 0.3384992412481903, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9851, + "step": 38813 + }, + { + "epoch": 0.3385079625333589, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 38814 + }, + { + "epoch": 0.3385166838185275, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 38815 + }, + { + "epoch": 0.33852540510369605, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 38816 + }, + { + "epoch": 0.33853412638886465, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 38817 + }, + { + "epoch": 0.33854284767403325, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 38818 + }, + { + "epoch": 0.33855156895920185, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9848, + "step": 38819 + }, + { + "epoch": 0.3385602902443704, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 38820 + }, + { + "epoch": 0.338569011529539, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 38821 + }, + { + "epoch": 0.3385777328147076, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 38822 + }, + { + "epoch": 0.33858645409987614, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 38823 + }, + { + "epoch": 0.33859517538504474, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 38824 + }, + { + "epoch": 0.33860389667021334, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 38825 + }, + { + "epoch": 0.3386126179553819, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 38826 + }, + { + "epoch": 0.3386213392405505, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 38827 + }, + { + "epoch": 0.3386300605257191, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 38828 + }, + { + "epoch": 0.33863878181088763, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 38829 + }, + { + "epoch": 0.33864750309605623, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 38830 + }, + { + "epoch": 0.33865622438122484, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 38831 + }, + { + "epoch": 0.3386649456663934, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 38832 + }, + { + "epoch": 0.338673666951562, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 38833 + }, + { + "epoch": 0.3386823882367306, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 38834 + }, + { + "epoch": 0.3386911095218991, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 38835 + }, + { + "epoch": 0.3386998308070677, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 38836 + }, + { + "epoch": 0.3387085520922363, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 38837 + }, + { + "epoch": 0.33871727337740487, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 38838 + }, + { + "epoch": 0.33872599466257347, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 38839 + }, + { + "epoch": 0.33873471594774207, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 38840 + }, + { + "epoch": 0.3387434372329106, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 38841 + }, + { + "epoch": 0.3387521585180792, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 38842 + }, + { + "epoch": 0.3387608798032478, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 38843 + }, + { + "epoch": 0.3387696010884164, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 38844 + }, + { + "epoch": 0.33877832237358496, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 38845 + }, + { + "epoch": 0.33878704365875356, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 38846 + }, + { + "epoch": 0.33879576494392216, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 38847 + }, + { + "epoch": 0.3388044862290907, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 38848 + }, + { + "epoch": 0.3388132075142593, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 38849 + }, + { + "epoch": 0.3388219287994279, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 38850 + }, + { + "epoch": 0.33883065008459645, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 38851 + }, + { + "epoch": 0.33883937136976505, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 38852 + }, + { + "epoch": 0.33884809265493365, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 38853 + }, + { + "epoch": 0.3388568139401022, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 38854 + }, + { + "epoch": 0.3388655352252708, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 38855 + }, + { + "epoch": 0.3388742565104394, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 38856 + }, + { + "epoch": 0.33888297779560794, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 38857 + }, + { + "epoch": 0.33889169908077654, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 38858 + }, + { + "epoch": 0.33890042036594514, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 38859 + }, + { + "epoch": 0.3389091416511137, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 38860 + }, + { + "epoch": 0.3389178629362823, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 38861 + }, + { + "epoch": 0.3389265842214509, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 38862 + }, + { + "epoch": 0.33893530550661943, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 38863 + }, + { + "epoch": 0.33894402679178803, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 38864 + }, + { + "epoch": 0.33895274807695663, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 38865 + }, + { + "epoch": 0.3389614693621252, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 38866 + }, + { + "epoch": 0.3389701906472938, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 38867 + }, + { + "epoch": 0.3389789119324624, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 38868 + }, + { + "epoch": 0.3389876332176309, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 38869 + }, + { + "epoch": 0.3389963545027995, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 38870 + }, + { + "epoch": 0.3390050757879681, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 38871 + }, + { + "epoch": 0.3390137970731367, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 38872 + }, + { + "epoch": 0.33902251835830527, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 38873 + }, + { + "epoch": 0.33903123964347387, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 38874 + }, + { + "epoch": 0.33903996092864247, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 38875 + }, + { + "epoch": 0.339048682213811, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9766, + "step": 38876 + }, + { + "epoch": 0.3390574034989796, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 38877 + }, + { + "epoch": 0.3390661247841482, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 38878 + }, + { + "epoch": 0.33907484606931676, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 38879 + }, + { + "epoch": 0.33908356735448536, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 38880 + }, + { + "epoch": 0.33909228863965396, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 38881 + }, + { + "epoch": 0.3391010099248225, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 38882 + }, + { + "epoch": 0.3391097312099911, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 38883 + }, + { + "epoch": 0.3391184524951597, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 38884 + }, + { + "epoch": 0.33912717378032825, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 38885 + }, + { + "epoch": 0.33913589506549685, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 38886 + }, + { + "epoch": 0.33914461635066545, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 38887 + }, + { + "epoch": 0.339153337635834, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 38888 + }, + { + "epoch": 0.3391620589210026, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 38889 + }, + { + "epoch": 0.3391707802061712, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 38890 + }, + { + "epoch": 0.33917950149133974, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 38891 + }, + { + "epoch": 0.33918822277650834, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 38892 + }, + { + "epoch": 0.33919694406167694, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 38893 + }, + { + "epoch": 0.3392056653468455, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 38894 + }, + { + "epoch": 0.3392143866320141, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 38895 + }, + { + "epoch": 0.3392231079171827, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 38896 + }, + { + "epoch": 0.33923182920235123, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 38897 + }, + { + "epoch": 0.33924055048751983, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 38898 + }, + { + "epoch": 0.33924927177268843, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 38899 + }, + { + "epoch": 0.33925799305785703, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 38900 + }, + { + "epoch": 0.3392667143430256, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 38901 + }, + { + "epoch": 0.3392754356281942, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 38902 + }, + { + "epoch": 0.3392841569133628, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 38903 + }, + { + "epoch": 0.3392928781985313, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 38904 + }, + { + "epoch": 0.3393015994836999, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 38905 + }, + { + "epoch": 0.3393103207688685, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 38906 + }, + { + "epoch": 0.33931904205403707, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 38907 + }, + { + "epoch": 0.33932776333920567, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 38908 + }, + { + "epoch": 0.33933648462437427, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 38909 + }, + { + "epoch": 0.3393452059095428, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 38910 + }, + { + "epoch": 0.3393539271947114, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 38911 + }, + { + "epoch": 0.33936264847988, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 38912 + }, + { + "epoch": 0.33937136976504856, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 38913 + }, + { + "epoch": 0.33938009105021716, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 38914 + }, + { + "epoch": 0.33938881233538576, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 38915 + }, + { + "epoch": 0.3393975336205543, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 38916 + }, + { + "epoch": 0.3394062549057229, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 38917 + }, + { + "epoch": 0.3394149761908915, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 38918 + }, + { + "epoch": 0.33942369747606005, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 38919 + }, + { + "epoch": 0.33943241876122865, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 38920 + }, + { + "epoch": 0.33944114004639725, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 38921 + }, + { + "epoch": 0.3394498613315658, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 38922 + }, + { + "epoch": 0.3394585826167344, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 38923 + }, + { + "epoch": 0.339467303901903, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 38924 + }, + { + "epoch": 0.33947602518707154, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 38925 + }, + { + "epoch": 0.33948474647224014, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 38926 + }, + { + "epoch": 0.33949346775740874, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 38927 + }, + { + "epoch": 0.33950218904257734, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 38928 + }, + { + "epoch": 0.3395109103277459, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 38929 + }, + { + "epoch": 0.3395196316129145, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 38930 + }, + { + "epoch": 0.3395283528980831, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 38931 + }, + { + "epoch": 0.33953707418325163, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 38932 + }, + { + "epoch": 0.33954579546842023, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 38933 + }, + { + "epoch": 0.33955451675358883, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 38934 + }, + { + "epoch": 0.3395632380387574, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 38935 + }, + { + "epoch": 0.339571959323926, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 38936 + }, + { + "epoch": 0.3395806806090946, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 38937 + }, + { + "epoch": 0.3395894018942631, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 38938 + }, + { + "epoch": 0.3395981231794317, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 38939 + }, + { + "epoch": 0.3396068444646003, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 38940 + }, + { + "epoch": 0.33961556574976887, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 38941 + }, + { + "epoch": 0.33962428703493747, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 38942 + }, + { + "epoch": 0.33963300832010607, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 38943 + }, + { + "epoch": 0.3396417296052746, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 38944 + }, + { + "epoch": 0.3396504508904432, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 38945 + }, + { + "epoch": 0.3396591721756118, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 38946 + }, + { + "epoch": 0.33966789346078036, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 38947 + }, + { + "epoch": 0.33967661474594896, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 38948 + }, + { + "epoch": 0.33968533603111756, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 38949 + }, + { + "epoch": 0.3396940573162861, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 38950 + }, + { + "epoch": 0.3397027786014547, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 38951 + }, + { + "epoch": 0.3397114998866233, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 38952 + }, + { + "epoch": 0.3397202211717919, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 38953 + }, + { + "epoch": 0.33972894245696045, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 38954 + }, + { + "epoch": 0.33973766374212905, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 38955 + }, + { + "epoch": 0.33974638502729765, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 38956 + }, + { + "epoch": 0.3397551063124662, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 38957 + }, + { + "epoch": 0.3397638275976348, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 38958 + }, + { + "epoch": 0.3397725488828034, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 38959 + }, + { + "epoch": 0.33978127016797194, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 38960 + }, + { + "epoch": 0.33978999145314054, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 38961 + }, + { + "epoch": 0.33979871273830914, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 38962 + }, + { + "epoch": 0.3398074340234777, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 38963 + }, + { + "epoch": 0.3398161553086463, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 38964 + }, + { + "epoch": 0.3398248765938149, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 38965 + }, + { + "epoch": 0.33983359787898343, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 38966 + }, + { + "epoch": 0.33984231916415203, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 38967 + }, + { + "epoch": 0.33985104044932063, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 38968 + }, + { + "epoch": 0.3398597617344892, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 38969 + }, + { + "epoch": 0.3398684830196578, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 38970 + }, + { + "epoch": 0.3398772043048264, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 38971 + }, + { + "epoch": 0.3398859255899949, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 38972 + }, + { + "epoch": 0.3398946468751635, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 38973 + }, + { + "epoch": 0.3399033681603321, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 38974 + }, + { + "epoch": 0.33991208944550066, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9841, + "step": 38975 + }, + { + "epoch": 0.33992081073066926, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 38976 + }, + { + "epoch": 0.33992953201583787, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 38977 + }, + { + "epoch": 0.3399382533010064, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 38978 + }, + { + "epoch": 0.339946974586175, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 38979 + }, + { + "epoch": 0.3399556958713436, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 38980 + }, + { + "epoch": 0.3399644171565122, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 38981 + }, + { + "epoch": 0.33997313844168076, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 38982 + }, + { + "epoch": 0.33998185972684936, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 38983 + }, + { + "epoch": 0.33999058101201796, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 38984 + }, + { + "epoch": 0.3399993022971865, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 38985 + }, + { + "epoch": 0.3400080235823551, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 38986 + }, + { + "epoch": 0.3400167448675237, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 38987 + }, + { + "epoch": 0.34002546615269225, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 38988 + }, + { + "epoch": 0.34003418743786085, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 38989 + }, + { + "epoch": 0.34004290872302945, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 38990 + }, + { + "epoch": 0.340051630008198, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 38991 + }, + { + "epoch": 0.3400603512933666, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 38992 + }, + { + "epoch": 0.3400690725785352, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 38993 + }, + { + "epoch": 0.34007779386370374, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 38994 + }, + { + "epoch": 0.34008651514887234, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 38995 + }, + { + "epoch": 0.34009523643404094, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 38996 + }, + { + "epoch": 0.3401039577192095, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 38997 + }, + { + "epoch": 0.3401126790043781, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 38998 + }, + { + "epoch": 0.3401214002895467, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 38999 + }, + { + "epoch": 0.3401301215747152, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 39000 + }, + { + "epoch": 0.3401388428598838, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 39001 + }, + { + "epoch": 0.34014756414505243, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 39002 + }, + { + "epoch": 0.340156285430221, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 39003 + }, + { + "epoch": 0.3401650067153896, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 39004 + }, + { + "epoch": 0.3401737280005582, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 39005 + }, + { + "epoch": 0.3401824492857267, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 39006 + }, + { + "epoch": 0.3401911705708953, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 39007 + }, + { + "epoch": 0.3401998918560639, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 39008 + }, + { + "epoch": 0.3402086131412325, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 39009 + }, + { + "epoch": 0.34021733442640106, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 39010 + }, + { + "epoch": 0.34022605571156966, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 39011 + }, + { + "epoch": 0.34023477699673826, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 39012 + }, + { + "epoch": 0.3402434982819068, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 39013 + }, + { + "epoch": 0.3402522195670754, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 39014 + }, + { + "epoch": 0.340260940852244, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 39015 + }, + { + "epoch": 0.34026966213741255, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 39016 + }, + { + "epoch": 0.34027838342258115, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 39017 + }, + { + "epoch": 0.34028710470774975, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 39018 + }, + { + "epoch": 0.3402958259929183, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 39019 + }, + { + "epoch": 0.3403045472780869, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 39020 + }, + { + "epoch": 0.3403132685632555, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 39021 + }, + { + "epoch": 0.34032198984842404, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 39022 + }, + { + "epoch": 0.34033071113359264, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 39023 + }, + { + "epoch": 0.34033943241876125, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 39024 + }, + { + "epoch": 0.3403481537039298, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 39025 + }, + { + "epoch": 0.3403568749890984, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 39026 + }, + { + "epoch": 0.340365596274267, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 39027 + }, + { + "epoch": 0.34037431755943554, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 39028 + }, + { + "epoch": 0.34038303884460414, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 39029 + }, + { + "epoch": 0.34039176012977274, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 39030 + }, + { + "epoch": 0.3404004814149413, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 39031 + }, + { + "epoch": 0.3404092027001099, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 39032 + }, + { + "epoch": 0.3404179239852785, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 39033 + }, + { + "epoch": 0.340426645270447, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 39034 + }, + { + "epoch": 0.3404353665556156, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 39035 + }, + { + "epoch": 0.3404440878407842, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 39036 + }, + { + "epoch": 0.3404528091259528, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 39037 + }, + { + "epoch": 0.34046153041112137, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 39038 + }, + { + "epoch": 0.34047025169628997, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 39039 + }, + { + "epoch": 0.34047897298145857, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 39040 + }, + { + "epoch": 0.3404876942666271, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 39041 + }, + { + "epoch": 0.3404964155517957, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 39042 + }, + { + "epoch": 0.3405051368369643, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 39043 + }, + { + "epoch": 0.34051385812213286, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 39044 + }, + { + "epoch": 0.34052257940730146, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 39045 + }, + { + "epoch": 0.34053130069247006, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 39046 + }, + { + "epoch": 0.3405400219776386, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 39047 + }, + { + "epoch": 0.3405487432628072, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 39048 + }, + { + "epoch": 0.3405574645479758, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 39049 + }, + { + "epoch": 0.34056618583314435, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 39050 + }, + { + "epoch": 0.34057490711831295, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 39051 + }, + { + "epoch": 0.34058362840348155, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 39052 + }, + { + "epoch": 0.3405923496886501, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 39053 + }, + { + "epoch": 0.3406010709738187, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 39054 + }, + { + "epoch": 0.3406097922589873, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9835, + "step": 39055 + }, + { + "epoch": 0.34061851354415584, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 39056 + }, + { + "epoch": 0.34062723482932444, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 39057 + }, + { + "epoch": 0.34063595611449304, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 39058 + }, + { + "epoch": 0.3406446773996616, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 39059 + }, + { + "epoch": 0.3406533986848302, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 39060 + }, + { + "epoch": 0.3406621199699988, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 39061 + }, + { + "epoch": 0.34067084125516733, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 39062 + }, + { + "epoch": 0.34067956254033593, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 39063 + }, + { + "epoch": 0.34068828382550453, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 39064 + }, + { + "epoch": 0.34069700511067313, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 39065 + }, + { + "epoch": 0.3407057263958417, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 39066 + }, + { + "epoch": 0.3407144476810103, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 39067 + }, + { + "epoch": 0.3407231689661789, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 39068 + }, + { + "epoch": 0.3407318902513474, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 39069 + }, + { + "epoch": 0.340740611536516, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 39070 + }, + { + "epoch": 0.3407493328216846, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 39071 + }, + { + "epoch": 0.34075805410685317, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 39072 + }, + { + "epoch": 0.34076677539202177, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 39073 + }, + { + "epoch": 0.34077549667719037, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 39074 + }, + { + "epoch": 0.3407842179623589, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 39075 + }, + { + "epoch": 0.3407929392475275, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 39076 + }, + { + "epoch": 0.3408016605326961, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 39077 + }, + { + "epoch": 0.34081038181786466, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 39078 + }, + { + "epoch": 0.34081910310303326, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 39079 + }, + { + "epoch": 0.34082782438820186, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 39080 + }, + { + "epoch": 0.3408365456733704, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 39081 + }, + { + "epoch": 0.340845266958539, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 39082 + }, + { + "epoch": 0.3408539882437076, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 39083 + }, + { + "epoch": 0.34086270952887615, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 39084 + }, + { + "epoch": 0.34087143081404475, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 39085 + }, + { + "epoch": 0.34088015209921335, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 39086 + }, + { + "epoch": 0.3408888733843819, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 39087 + }, + { + "epoch": 0.3408975946695505, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 39088 + }, + { + "epoch": 0.3409063159547191, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 39089 + }, + { + "epoch": 0.3409150372398877, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 39090 + }, + { + "epoch": 0.34092375852505624, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 39091 + }, + { + "epoch": 0.34093247981022484, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 39092 + }, + { + "epoch": 0.34094120109539344, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9784, + "step": 39093 + }, + { + "epoch": 0.340949922380562, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9829, + "step": 39094 + }, + { + "epoch": 0.3409586436657306, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 39095 + }, + { + "epoch": 0.3409673649508992, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 39096 + }, + { + "epoch": 0.34097608623606773, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 39097 + }, + { + "epoch": 0.34098480752123633, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 39098 + }, + { + "epoch": 0.34099352880640493, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 39099 + }, + { + "epoch": 0.3410022500915735, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 39100 + }, + { + "epoch": 0.3410109713767421, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 39101 + }, + { + "epoch": 0.3410196926619107, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 39102 + }, + { + "epoch": 0.3410284139470792, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 39103 + }, + { + "epoch": 0.3410371352322478, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 39104 + }, + { + "epoch": 0.3410458565174164, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 39105 + }, + { + "epoch": 0.34105457780258497, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 39106 + }, + { + "epoch": 0.34106329908775357, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 39107 + }, + { + "epoch": 0.34107202037292217, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 39108 + }, + { + "epoch": 0.3410807416580907, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 39109 + }, + { + "epoch": 0.3410894629432593, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 39110 + }, + { + "epoch": 0.3410981842284279, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 39111 + }, + { + "epoch": 0.34110690551359646, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 39112 + }, + { + "epoch": 0.34111562679876506, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 39113 + }, + { + "epoch": 0.34112434808393366, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 39114 + }, + { + "epoch": 0.3411330693691022, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 39115 + }, + { + "epoch": 0.3411417906542708, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 39116 + }, + { + "epoch": 0.3411505119394394, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 39117 + }, + { + "epoch": 0.341159233224608, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 39118 + }, + { + "epoch": 0.34116795450977655, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 39119 + }, + { + "epoch": 0.34117667579494515, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 39120 + }, + { + "epoch": 0.34118539708011375, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 39121 + }, + { + "epoch": 0.3411941183652823, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 39122 + }, + { + "epoch": 0.3412028396504509, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 39123 + }, + { + "epoch": 0.3412115609356195, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 39124 + }, + { + "epoch": 0.34122028222078804, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 39125 + }, + { + "epoch": 0.34122900350595664, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 39126 + }, + { + "epoch": 0.34123772479112524, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 39127 + }, + { + "epoch": 0.3412464460762938, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 39128 + }, + { + "epoch": 0.3412551673614624, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9798, + "step": 39129 + }, + { + "epoch": 0.341263888646631, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 39130 + }, + { + "epoch": 0.34127260993179953, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 39131 + }, + { + "epoch": 0.34128133121696813, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 39132 + }, + { + "epoch": 0.34129005250213673, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 39133 + }, + { + "epoch": 0.3412987737873053, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 39134 + }, + { + "epoch": 0.3413074950724739, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 39135 + }, + { + "epoch": 0.3413162163576425, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 39136 + }, + { + "epoch": 0.341324937642811, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 39137 + }, + { + "epoch": 0.3413336589279796, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 39138 + }, + { + "epoch": 0.3413423802131482, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 39139 + }, + { + "epoch": 0.34135110149831677, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 39140 + }, + { + "epoch": 0.34135982278348537, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 39141 + }, + { + "epoch": 0.34136854406865397, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 39142 + }, + { + "epoch": 0.3413772653538225, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 39143 + }, + { + "epoch": 0.3413859866389911, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 39144 + }, + { + "epoch": 0.3413947079241597, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 39145 + }, + { + "epoch": 0.3414034292093283, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 39146 + }, + { + "epoch": 0.34141215049449686, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 39147 + }, + { + "epoch": 0.34142087177966546, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 39148 + }, + { + "epoch": 0.34142959306483406, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 39149 + }, + { + "epoch": 0.3414383143500026, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 39150 + }, + { + "epoch": 0.3414470356351712, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 39151 + }, + { + "epoch": 0.3414557569203398, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 39152 + }, + { + "epoch": 0.34146447820550835, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 39153 + }, + { + "epoch": 0.34147319949067695, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 39154 + }, + { + "epoch": 0.34148192077584555, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 39155 + }, + { + "epoch": 0.3414906420610141, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 39156 + }, + { + "epoch": 0.3414993633461827, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 39157 + }, + { + "epoch": 0.3415080846313513, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 39158 + }, + { + "epoch": 0.34151680591651984, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 39159 + }, + { + "epoch": 0.34152552720168844, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 39160 + }, + { + "epoch": 0.34153424848685704, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 39161 + }, + { + "epoch": 0.3415429697720256, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 39162 + }, + { + "epoch": 0.3415516910571942, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 39163 + }, + { + "epoch": 0.3415604123423628, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 39164 + }, + { + "epoch": 0.34156913362753133, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 39165 + }, + { + "epoch": 0.34157785491269993, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 39166 + }, + { + "epoch": 0.34158657619786853, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9862, + "step": 39167 + }, + { + "epoch": 0.3415952974830371, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 39168 + }, + { + "epoch": 0.3416040187682057, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 39169 + }, + { + "epoch": 0.3416127400533743, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 39170 + }, + { + "epoch": 0.3416214613385428, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 39171 + }, + { + "epoch": 0.3416301826237114, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 39172 + }, + { + "epoch": 0.34163890390888, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 39173 + }, + { + "epoch": 0.3416476251940486, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 39174 + }, + { + "epoch": 0.34165634647921717, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 39175 + }, + { + "epoch": 0.34166506776438577, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 39176 + }, + { + "epoch": 0.34167378904955437, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 39177 + }, + { + "epoch": 0.3416825103347229, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 39178 + }, + { + "epoch": 0.3416912316198915, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 39179 + }, + { + "epoch": 0.3416999529050601, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 39180 + }, + { + "epoch": 0.34170867419022866, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9801, + "step": 39181 + }, + { + "epoch": 0.34171739547539726, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 39182 + }, + { + "epoch": 0.34172611676056586, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 39183 + }, + { + "epoch": 0.3417348380457344, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 39184 + }, + { + "epoch": 0.341743559330903, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 39185 + }, + { + "epoch": 0.3417522806160716, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 39186 + }, + { + "epoch": 0.34176100190124015, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 39187 + }, + { + "epoch": 0.34176972318640875, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 39188 + }, + { + "epoch": 0.34177844447157735, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 39189 + }, + { + "epoch": 0.3417871657567459, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 39190 + }, + { + "epoch": 0.3417958870419145, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 39191 + }, + { + "epoch": 0.3418046083270831, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 39192 + }, + { + "epoch": 0.34181332961225164, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9779, + "step": 39193 + }, + { + "epoch": 0.34182205089742024, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 39194 + }, + { + "epoch": 0.34183077218258884, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 39195 + }, + { + "epoch": 0.3418394934677574, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9827, + "step": 39196 + }, + { + "epoch": 0.341848214752926, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 39197 + }, + { + "epoch": 0.3418569360380946, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 39198 + }, + { + "epoch": 0.3418656573232632, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 39199 + }, + { + "epoch": 0.34187437860843173, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 39200 + }, + { + "epoch": 0.34188309989360033, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 39201 + }, + { + "epoch": 0.34189182117876893, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 39202 + }, + { + "epoch": 0.3419005424639375, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 39203 + }, + { + "epoch": 0.3419092637491061, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 39204 + }, + { + "epoch": 0.3419179850342747, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 39205 + }, + { + "epoch": 0.3419267063194432, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 39206 + }, + { + "epoch": 0.3419354276046118, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 39207 + }, + { + "epoch": 0.3419441488897804, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 39208 + }, + { + "epoch": 0.34195287017494896, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 39209 + }, + { + "epoch": 0.34196159146011756, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 39210 + }, + { + "epoch": 0.34197031274528616, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 39211 + }, + { + "epoch": 0.3419790340304547, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 39212 + }, + { + "epoch": 0.3419877553156233, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 39213 + }, + { + "epoch": 0.3419964766007919, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 39214 + }, + { + "epoch": 0.34200519788596045, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 39215 + }, + { + "epoch": 0.34201391917112905, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 39216 + }, + { + "epoch": 0.34202264045629766, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9776, + "step": 39217 + }, + { + "epoch": 0.3420313617414662, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 39218 + }, + { + "epoch": 0.3420400830266348, + "grad_norm": 0.07275390625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 39219 + }, + { + "epoch": 0.3420488043118034, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 39220 + }, + { + "epoch": 0.34205752559697195, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 39221 + }, + { + "epoch": 0.34206624688214055, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 39222 + }, + { + "epoch": 0.34207496816730915, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 39223 + }, + { + "epoch": 0.3420836894524777, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 39224 + }, + { + "epoch": 0.3420924107376463, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 39225 + }, + { + "epoch": 0.3421011320228149, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 39226 + }, + { + "epoch": 0.3421098533079835, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 39227 + }, + { + "epoch": 0.34211857459315204, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 39228 + }, + { + "epoch": 0.34212729587832064, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 39229 + }, + { + "epoch": 0.34213601716348924, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 39230 + }, + { + "epoch": 0.3421447384486578, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 39231 + }, + { + "epoch": 0.3421534597338264, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 39232 + }, + { + "epoch": 0.342162181018995, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 39233 + }, + { + "epoch": 0.3421709023041635, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 39234 + }, + { + "epoch": 0.3421796235893321, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 39235 + }, + { + "epoch": 0.3421883448745007, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 39236 + }, + { + "epoch": 0.34219706615966927, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 39237 + }, + { + "epoch": 0.3422057874448379, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 39238 + }, + { + "epoch": 0.3422145087300065, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 39239 + }, + { + "epoch": 0.342223230015175, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 39240 + }, + { + "epoch": 0.3422319513003436, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 39241 + }, + { + "epoch": 0.3422406725855122, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 39242 + }, + { + "epoch": 0.34224939387068076, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 39243 + }, + { + "epoch": 0.34225811515584936, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 39244 + }, + { + "epoch": 0.34226683644101796, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 39245 + }, + { + "epoch": 0.3422755577261865, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 39246 + }, + { + "epoch": 0.3422842790113551, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 39247 + }, + { + "epoch": 0.3422930002965237, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 39248 + }, + { + "epoch": 0.34230172158169225, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 39249 + }, + { + "epoch": 0.34231044286686085, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 39250 + }, + { + "epoch": 0.34231916415202945, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 39251 + }, + { + "epoch": 0.342327885437198, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 39252 + }, + { + "epoch": 0.3423366067223666, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 39253 + }, + { + "epoch": 0.3423453280075352, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 39254 + }, + { + "epoch": 0.3423540492927038, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 39255 + }, + { + "epoch": 0.34236277057787234, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 39256 + }, + { + "epoch": 0.34237149186304094, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 39257 + }, + { + "epoch": 0.34238021314820954, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 39258 + }, + { + "epoch": 0.3423889344333781, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 39259 + }, + { + "epoch": 0.3423976557185467, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 39260 + }, + { + "epoch": 0.3424063770037153, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 39261 + }, + { + "epoch": 0.34241509828888383, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 39262 + }, + { + "epoch": 0.34242381957405243, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 39263 + }, + { + "epoch": 0.34243254085922104, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 39264 + }, + { + "epoch": 0.3424412621443896, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 39265 + }, + { + "epoch": 0.3424499834295582, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 39266 + }, + { + "epoch": 0.3424587047147268, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 39267 + }, + { + "epoch": 0.3424674259998953, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 39268 + }, + { + "epoch": 0.3424761472850639, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 39269 + }, + { + "epoch": 0.3424848685702325, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 39270 + }, + { + "epoch": 0.34249358985540107, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 39271 + }, + { + "epoch": 0.34250231114056967, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 39272 + }, + { + "epoch": 0.34251103242573827, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 39273 + }, + { + "epoch": 0.3425197537109068, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 39274 + }, + { + "epoch": 0.3425284749960754, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 39275 + }, + { + "epoch": 0.342537196281244, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 39276 + }, + { + "epoch": 0.34254591756641256, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 39277 + }, + { + "epoch": 0.34255463885158116, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 39278 + }, + { + "epoch": 0.34256336013674976, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 39279 + }, + { + "epoch": 0.3425720814219183, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 39280 + }, + { + "epoch": 0.3425808027070869, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 39281 + }, + { + "epoch": 0.3425895239922555, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 39282 + }, + { + "epoch": 0.3425982452774241, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 39283 + }, + { + "epoch": 0.34260696656259265, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 39284 + }, + { + "epoch": 0.34261568784776125, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 39285 + }, + { + "epoch": 0.34262440913292985, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 39286 + }, + { + "epoch": 0.3426331304180984, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 39287 + }, + { + "epoch": 0.342641851703267, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 39288 + }, + { + "epoch": 0.3426505729884356, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 39289 + }, + { + "epoch": 0.34265929427360414, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 39290 + }, + { + "epoch": 0.34266801555877274, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 39291 + }, + { + "epoch": 0.34267673684394134, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 39292 + }, + { + "epoch": 0.3426854581291099, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 39293 + }, + { + "epoch": 0.3426941794142785, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 39294 + }, + { + "epoch": 0.3427029006994471, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9782, + "step": 39295 + }, + { + "epoch": 0.34271162198461563, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 39296 + }, + { + "epoch": 0.34272034326978423, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 39297 + }, + { + "epoch": 0.34272906455495283, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 39298 + }, + { + "epoch": 0.3427377858401214, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 39299 + }, + { + "epoch": 0.34274650712529, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 39300 + }, + { + "epoch": 0.3427552284104586, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 39301 + }, + { + "epoch": 0.3427639496956271, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 39302 + }, + { + "epoch": 0.3427726709807957, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 39303 + }, + { + "epoch": 0.3427813922659643, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 39304 + }, + { + "epoch": 0.34279011355113287, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 39305 + }, + { + "epoch": 0.34279883483630147, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 39306 + }, + { + "epoch": 0.34280755612147007, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 39307 + }, + { + "epoch": 0.34281627740663867, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 39308 + }, + { + "epoch": 0.3428249986918072, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 39309 + }, + { + "epoch": 0.3428337199769758, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 39310 + }, + { + "epoch": 0.3428424412621444, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 39311 + }, + { + "epoch": 0.34285116254731296, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 39312 + }, + { + "epoch": 0.34285988383248156, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 39313 + }, + { + "epoch": 0.34286860511765016, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 39314 + }, + { + "epoch": 0.3428773264028187, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 39315 + }, + { + "epoch": 0.3428860476879873, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 39316 + }, + { + "epoch": 0.3428947689731559, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 39317 + }, + { + "epoch": 0.34290349025832445, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9844, + "step": 39318 + }, + { + "epoch": 0.34291221154349305, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 39319 + }, + { + "epoch": 0.34292093282866165, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 39320 + }, + { + "epoch": 0.3429296541138302, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 39321 + }, + { + "epoch": 0.3429383753989988, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 39322 + }, + { + "epoch": 0.3429470966841674, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 39323 + }, + { + "epoch": 0.34295581796933594, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 39324 + }, + { + "epoch": 0.34296453925450454, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 39325 + }, + { + "epoch": 0.34297326053967314, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 39326 + }, + { + "epoch": 0.3429819818248417, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 39327 + }, + { + "epoch": 0.3429907031100103, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 39328 + }, + { + "epoch": 0.3429994243951789, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 39329 + }, + { + "epoch": 0.34300814568034743, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 39330 + }, + { + "epoch": 0.34301686696551603, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 39331 + }, + { + "epoch": 0.34302558825068463, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 39332 + }, + { + "epoch": 0.3430343095358532, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 39333 + }, + { + "epoch": 0.3430430308210218, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 39334 + }, + { + "epoch": 0.3430517521061904, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 39335 + }, + { + "epoch": 0.343060473391359, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 39336 + }, + { + "epoch": 0.3430691946765275, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 39337 + }, + { + "epoch": 0.3430779159616961, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 39338 + }, + { + "epoch": 0.3430866372468647, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 39339 + }, + { + "epoch": 0.34309535853203327, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 39340 + }, + { + "epoch": 0.34310407981720187, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9777, + "step": 39341 + }, + { + "epoch": 0.34311280110237047, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 39342 + }, + { + "epoch": 0.343121522387539, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 39343 + }, + { + "epoch": 0.3431302436727076, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 39344 + }, + { + "epoch": 0.3431389649578762, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 39345 + }, + { + "epoch": 0.34314768624304476, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 39346 + }, + { + "epoch": 0.34315640752821336, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 39347 + }, + { + "epoch": 0.34316512881338196, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 39348 + }, + { + "epoch": 0.3431738500985505, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 39349 + }, + { + "epoch": 0.3431825713837191, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 39350 + }, + { + "epoch": 0.3431912926688877, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 39351 + }, + { + "epoch": 0.34320001395405625, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 39352 + }, + { + "epoch": 0.34320873523922485, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 39353 + }, + { + "epoch": 0.34321745652439345, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 39354 + }, + { + "epoch": 0.343226177809562, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 39355 + }, + { + "epoch": 0.3432348990947306, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 39356 + }, + { + "epoch": 0.3432436203798992, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 39357 + }, + { + "epoch": 0.34325234166506774, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 39358 + }, + { + "epoch": 0.34326106295023634, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 39359 + }, + { + "epoch": 0.34326978423540494, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 39360 + }, + { + "epoch": 0.3432785055205735, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 39361 + }, + { + "epoch": 0.3432872268057421, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 39362 + }, + { + "epoch": 0.3432959480909107, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 39363 + }, + { + "epoch": 0.3433046693760793, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 39364 + }, + { + "epoch": 0.34331339066124783, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 39365 + }, + { + "epoch": 0.34332211194641643, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 39366 + }, + { + "epoch": 0.34333083323158503, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 39367 + }, + { + "epoch": 0.3433395545167536, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 39368 + }, + { + "epoch": 0.3433482758019222, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 39369 + }, + { + "epoch": 0.3433569970870908, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 39370 + }, + { + "epoch": 0.3433657183722593, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 39371 + }, + { + "epoch": 0.3433744396574279, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 39372 + }, + { + "epoch": 0.3433831609425965, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 39373 + }, + { + "epoch": 0.34339188222776507, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 39374 + }, + { + "epoch": 0.34340060351293367, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 39375 + }, + { + "epoch": 0.34340932479810227, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 39376 + }, + { + "epoch": 0.3434180460832708, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 39377 + }, + { + "epoch": 0.3434267673684394, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 39378 + }, + { + "epoch": 0.343435488653608, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 39379 + }, + { + "epoch": 0.34344420993877656, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 39380 + }, + { + "epoch": 0.34345293122394516, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 39381 + }, + { + "epoch": 0.34346165250911376, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 39382 + }, + { + "epoch": 0.3434703737942823, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 39383 + }, + { + "epoch": 0.3434790950794509, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 39384 + }, + { + "epoch": 0.3434878163646195, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 39385 + }, + { + "epoch": 0.34349653764978805, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 39386 + }, + { + "epoch": 0.34350525893495665, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 39387 + }, + { + "epoch": 0.34351398022012525, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 39388 + }, + { + "epoch": 0.3435227015052938, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 39389 + }, + { + "epoch": 0.3435314227904624, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 39390 + }, + { + "epoch": 0.343540144075631, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 39391 + }, + { + "epoch": 0.3435488653607996, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 39392 + }, + { + "epoch": 0.34355758664596814, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 39393 + }, + { + "epoch": 0.34356630793113674, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 39394 + }, + { + "epoch": 0.34357502921630534, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 39395 + }, + { + "epoch": 0.3435837505014739, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 39396 + }, + { + "epoch": 0.3435924717866425, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 39397 + }, + { + "epoch": 0.3436011930718111, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 39398 + }, + { + "epoch": 0.34360991435697963, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 39399 + }, + { + "epoch": 0.34361863564214823, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 39400 + }, + { + "epoch": 0.34362735692731683, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 39401 + }, + { + "epoch": 0.3436360782124854, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 39402 + }, + { + "epoch": 0.343644799497654, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 39403 + }, + { + "epoch": 0.3436535207828226, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 39404 + }, + { + "epoch": 0.3436622420679911, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 39405 + }, + { + "epoch": 0.3436709633531597, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 39406 + }, + { + "epoch": 0.3436796846383283, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 39407 + }, + { + "epoch": 0.34368840592349686, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 39408 + }, + { + "epoch": 0.34369712720866546, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 39409 + }, + { + "epoch": 0.34370584849383407, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 39410 + }, + { + "epoch": 0.3437145697790026, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 39411 + }, + { + "epoch": 0.3437232910641712, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 39412 + }, + { + "epoch": 0.3437320123493398, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 39413 + }, + { + "epoch": 0.34374073363450836, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 39414 + }, + { + "epoch": 0.34374945491967696, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 39415 + }, + { + "epoch": 0.34375817620484556, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9827, + "step": 39416 + }, + { + "epoch": 0.34376689749001416, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 39417 + }, + { + "epoch": 0.3437756187751827, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 39418 + }, + { + "epoch": 0.3437843400603513, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 39419 + }, + { + "epoch": 0.3437930613455199, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 39420 + }, + { + "epoch": 0.34380178263068845, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 39421 + }, + { + "epoch": 0.34381050391585705, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 39422 + }, + { + "epoch": 0.34381922520102565, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 39423 + }, + { + "epoch": 0.3438279464861942, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 39424 + }, + { + "epoch": 0.3438366677713628, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 39425 + }, + { + "epoch": 0.3438453890565314, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 39426 + }, + { + "epoch": 0.34385411034169994, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 39427 + }, + { + "epoch": 0.34386283162686854, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 39428 + }, + { + "epoch": 0.34387155291203714, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 39429 + }, + { + "epoch": 0.3438802741972057, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 39430 + }, + { + "epoch": 0.3438889954823743, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 39431 + }, + { + "epoch": 0.3438977167675429, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 39432 + }, + { + "epoch": 0.3439064380527114, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 39433 + }, + { + "epoch": 0.34391515933788, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 39434 + }, + { + "epoch": 0.34392388062304863, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 39435 + }, + { + "epoch": 0.3439326019082172, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 39436 + }, + { + "epoch": 0.3439413231933858, + "grad_norm": 0.0703125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 39437 + }, + { + "epoch": 0.3439500444785544, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 39438 + }, + { + "epoch": 0.3439587657637229, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 39439 + }, + { + "epoch": 0.3439674870488915, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 39440 + }, + { + "epoch": 0.3439762083340601, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 39441 + }, + { + "epoch": 0.34398492961922866, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 39442 + }, + { + "epoch": 0.34399365090439726, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 39443 + }, + { + "epoch": 0.34400237218956586, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 39444 + }, + { + "epoch": 0.34401109347473446, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 39445 + }, + { + "epoch": 0.344019814759903, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 39446 + }, + { + "epoch": 0.3440285360450716, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 39447 + }, + { + "epoch": 0.3440372573302402, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 39448 + }, + { + "epoch": 0.34404597861540875, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 39449 + }, + { + "epoch": 0.34405469990057735, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 39450 + }, + { + "epoch": 0.34406342118574595, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 39451 + }, + { + "epoch": 0.3440721424709145, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 39452 + }, + { + "epoch": 0.3440808637560831, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 39453 + }, + { + "epoch": 0.3440895850412517, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 39454 + }, + { + "epoch": 0.34409830632642024, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 39455 + }, + { + "epoch": 0.34410702761158884, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 39456 + }, + { + "epoch": 0.34411574889675745, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 39457 + }, + { + "epoch": 0.344124470181926, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 39458 + }, + { + "epoch": 0.3441331914670946, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 39459 + }, + { + "epoch": 0.3441419127522632, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 39460 + }, + { + "epoch": 0.34415063403743174, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 39461 + }, + { + "epoch": 0.34415935532260034, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 39462 + }, + { + "epoch": 0.34416807660776894, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 39463 + }, + { + "epoch": 0.3441767978929375, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 39464 + }, + { + "epoch": 0.3441855191781061, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 39465 + }, + { + "epoch": 0.3441942404632747, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 39466 + }, + { + "epoch": 0.3442029617484432, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 39467 + }, + { + "epoch": 0.3442116830336118, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 39468 + }, + { + "epoch": 0.3442204043187804, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 39469 + }, + { + "epoch": 0.34422912560394897, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 39470 + }, + { + "epoch": 0.34423784688911757, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 39471 + }, + { + "epoch": 0.34424656817428617, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 39472 + }, + { + "epoch": 0.34425528945945477, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 39473 + }, + { + "epoch": 0.3442640107446233, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 39474 + }, + { + "epoch": 0.3442727320297919, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 39475 + }, + { + "epoch": 0.3442814533149605, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 39476 + }, + { + "epoch": 0.34429017460012906, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 39477 + }, + { + "epoch": 0.34429889588529766, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 39478 + }, + { + "epoch": 0.34430761717046626, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 39479 + }, + { + "epoch": 0.3443163384556348, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 39480 + }, + { + "epoch": 0.3443250597408034, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 39481 + }, + { + "epoch": 0.344333781025972, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 39482 + }, + { + "epoch": 0.34434250231114055, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 39483 + }, + { + "epoch": 0.34435122359630915, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 39484 + }, + { + "epoch": 0.34435994488147775, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 39485 + }, + { + "epoch": 0.3443686661666463, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 39486 + }, + { + "epoch": 0.3443773874518149, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 39487 + }, + { + "epoch": 0.3443861087369835, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 39488 + }, + { + "epoch": 0.34439483002215204, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 39489 + }, + { + "epoch": 0.34440355130732064, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 39490 + }, + { + "epoch": 0.34441227259248924, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 39491 + }, + { + "epoch": 0.3444209938776578, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 39492 + }, + { + "epoch": 0.3444297151628264, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 39493 + }, + { + "epoch": 0.344438436447995, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 39494 + }, + { + "epoch": 0.34444715773316353, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 39495 + }, + { + "epoch": 0.34445587901833213, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 39496 + }, + { + "epoch": 0.34446460030350073, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 39497 + }, + { + "epoch": 0.3444733215886693, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 39498 + }, + { + "epoch": 0.3444820428738379, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 39499 + }, + { + "epoch": 0.3444907641590065, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 39500 + }, + { + "epoch": 0.3444994854441751, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 39501 + }, + { + "epoch": 0.3445082067293436, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 39502 + }, + { + "epoch": 0.3445169280145122, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 39503 + }, + { + "epoch": 0.3445256492996808, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 39504 + }, + { + "epoch": 0.34453437058484937, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 39505 + }, + { + "epoch": 0.34454309187001797, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 39506 + }, + { + "epoch": 0.34455181315518657, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 39507 + }, + { + "epoch": 0.3445605344403551, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 39508 + }, + { + "epoch": 0.3445692557255237, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 39509 + }, + { + "epoch": 0.3445779770106923, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 39510 + }, + { + "epoch": 0.34458669829586086, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 39511 + }, + { + "epoch": 0.34459541958102946, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 39512 + }, + { + "epoch": 0.34460414086619806, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 39513 + }, + { + "epoch": 0.3446128621513666, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 39514 + }, + { + "epoch": 0.3446215834365352, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 39515 + }, + { + "epoch": 0.3446303047217038, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 39516 + }, + { + "epoch": 0.34463902600687235, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 39517 + }, + { + "epoch": 0.34464774729204095, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 39518 + }, + { + "epoch": 0.34465646857720955, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 39519 + }, + { + "epoch": 0.3446651898623781, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 39520 + }, + { + "epoch": 0.3446739111475467, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 39521 + }, + { + "epoch": 0.3446826324327153, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 39522 + }, + { + "epoch": 0.34469135371788384, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 39523 + }, + { + "epoch": 0.34470007500305244, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 39524 + }, + { + "epoch": 0.34470879628822104, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 39525 + }, + { + "epoch": 0.34471751757338964, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 39526 + }, + { + "epoch": 0.3447262388585582, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 39527 + }, + { + "epoch": 0.3447349601437268, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 39528 + }, + { + "epoch": 0.3447436814288954, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 39529 + }, + { + "epoch": 0.34475240271406393, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 39530 + }, + { + "epoch": 0.34476112399923253, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 39531 + }, + { + "epoch": 0.34476984528440113, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 39532 + }, + { + "epoch": 0.3447785665695697, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 39533 + }, + { + "epoch": 0.3447872878547383, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 39534 + }, + { + "epoch": 0.3447960091399069, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 39535 + }, + { + "epoch": 0.3448047304250754, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 39536 + }, + { + "epoch": 0.344813451710244, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 39537 + }, + { + "epoch": 0.3448221729954126, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 39538 + }, + { + "epoch": 0.34483089428058117, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 39539 + }, + { + "epoch": 0.34483961556574977, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 39540 + }, + { + "epoch": 0.34484833685091837, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 39541 + }, + { + "epoch": 0.3448570581360869, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 39542 + }, + { + "epoch": 0.3448657794212555, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 39543 + }, + { + "epoch": 0.3448745007064241, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 39544 + }, + { + "epoch": 0.34488322199159266, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 39545 + }, + { + "epoch": 0.34489194327676126, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 39546 + }, + { + "epoch": 0.34490066456192986, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 39547 + }, + { + "epoch": 0.3449093858470984, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 39548 + }, + { + "epoch": 0.344918107132267, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 39549 + }, + { + "epoch": 0.3449268284174356, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 39550 + }, + { + "epoch": 0.34493554970260415, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 39551 + }, + { + "epoch": 0.34494427098777275, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 39552 + }, + { + "epoch": 0.34495299227294135, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 39553 + }, + { + "epoch": 0.34496171355810995, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 39554 + }, + { + "epoch": 0.3449704348432785, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 39555 + }, + { + "epoch": 0.3449791561284471, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 39556 + }, + { + "epoch": 0.3449878774136157, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 39557 + }, + { + "epoch": 0.34499659869878424, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 39558 + }, + { + "epoch": 0.34500531998395284, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 39559 + }, + { + "epoch": 0.34501404126912144, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 39560 + }, + { + "epoch": 0.34502276255429, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 39561 + }, + { + "epoch": 0.3450314838394586, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 39562 + }, + { + "epoch": 0.3450402051246272, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 39563 + }, + { + "epoch": 0.34504892640979573, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 39564 + }, + { + "epoch": 0.34505764769496433, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 39565 + }, + { + "epoch": 0.34506636898013293, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 39566 + }, + { + "epoch": 0.3450750902653015, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9844, + "step": 39567 + }, + { + "epoch": 0.3450838115504701, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 39568 + }, + { + "epoch": 0.3450925328356387, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 39569 + }, + { + "epoch": 0.3451012541208072, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 39570 + }, + { + "epoch": 0.3451099754059758, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 39571 + }, + { + "epoch": 0.3451186966911444, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 39572 + }, + { + "epoch": 0.34512741797631297, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 39573 + }, + { + "epoch": 0.34513613926148157, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9815, + "step": 39574 + }, + { + "epoch": 0.34514486054665017, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 39575 + }, + { + "epoch": 0.3451535818318187, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 39576 + }, + { + "epoch": 0.3451623031169873, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 39577 + }, + { + "epoch": 0.3451710244021559, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 39578 + }, + { + "epoch": 0.34517974568732446, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 39579 + }, + { + "epoch": 0.34518846697249306, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 39580 + }, + { + "epoch": 0.34519718825766166, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 39581 + }, + { + "epoch": 0.34520590954283026, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 39582 + }, + { + "epoch": 0.3452146308279988, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 39583 + }, + { + "epoch": 0.3452233521131674, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 39584 + }, + { + "epoch": 0.345232073398336, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 39585 + }, + { + "epoch": 0.34524079468350455, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 39586 + }, + { + "epoch": 0.34524951596867315, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.9844, + "step": 39587 + }, + { + "epoch": 0.34525823725384175, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 39588 + }, + { + "epoch": 0.3452669585390103, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 39589 + }, + { + "epoch": 0.3452756798241789, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 39590 + }, + { + "epoch": 0.3452844011093475, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 39591 + }, + { + "epoch": 0.34529312239451604, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 39592 + }, + { + "epoch": 0.34530184367968464, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 39593 + }, + { + "epoch": 0.34531056496485324, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9821, + "step": 39594 + }, + { + "epoch": 0.3453192862500218, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 39595 + }, + { + "epoch": 0.3453280075351904, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 39596 + }, + { + "epoch": 0.345336728820359, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 39597 + }, + { + "epoch": 0.34534545010552753, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 39598 + }, + { + "epoch": 0.34535417139069613, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 39599 + }, + { + "epoch": 0.34536289267586473, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 39600 + }, + { + "epoch": 0.3453716139610333, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 39601 + }, + { + "epoch": 0.3453803352462019, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 39602 + }, + { + "epoch": 0.3453890565313705, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 39603 + }, + { + "epoch": 0.345397777816539, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 39604 + }, + { + "epoch": 0.3454064991017076, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 39605 + }, + { + "epoch": 0.3454152203868762, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 39606 + }, + { + "epoch": 0.34542394167204477, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 39607 + }, + { + "epoch": 0.34543266295721337, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 39608 + }, + { + "epoch": 0.34544138424238197, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 39609 + }, + { + "epoch": 0.34545010552755057, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.982, + "step": 39610 + }, + { + "epoch": 0.3454588268127191, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 39611 + }, + { + "epoch": 0.3454675480978877, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 39612 + }, + { + "epoch": 0.3454762693830563, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 39613 + }, + { + "epoch": 0.34548499066822486, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 39614 + }, + { + "epoch": 0.34549371195339346, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 39615 + }, + { + "epoch": 0.34550243323856206, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 39616 + }, + { + "epoch": 0.3455111545237306, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 39617 + }, + { + "epoch": 0.3455198758088992, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 39618 + }, + { + "epoch": 0.3455285970940678, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 39619 + }, + { + "epoch": 0.34553731837923635, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 39620 + }, + { + "epoch": 0.34554603966440495, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 39621 + }, + { + "epoch": 0.34555476094957355, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 39622 + }, + { + "epoch": 0.3455634822347421, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 39623 + }, + { + "epoch": 0.3455722035199107, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 39624 + }, + { + "epoch": 0.3455809248050793, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 39625 + }, + { + "epoch": 0.34558964609024784, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 39626 + }, + { + "epoch": 0.34559836737541644, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 39627 + }, + { + "epoch": 0.34560708866058504, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 39628 + }, + { + "epoch": 0.3456158099457536, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 39629 + }, + { + "epoch": 0.3456245312309222, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 39630 + }, + { + "epoch": 0.3456332525160908, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 39631 + }, + { + "epoch": 0.34564197380125933, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 39632 + }, + { + "epoch": 0.34565069508642793, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 39633 + }, + { + "epoch": 0.34565941637159653, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 39634 + }, + { + "epoch": 0.3456681376567651, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 39635 + }, + { + "epoch": 0.3456768589419337, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 39636 + }, + { + "epoch": 0.3456855802271023, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 39637 + }, + { + "epoch": 0.3456943015122709, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 39638 + }, + { + "epoch": 0.3457030227974394, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 39639 + }, + { + "epoch": 0.345711744082608, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 39640 + }, + { + "epoch": 0.3457204653677766, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 39641 + }, + { + "epoch": 0.34572918665294516, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 39642 + }, + { + "epoch": 0.34573790793811376, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 39643 + }, + { + "epoch": 0.34574662922328236, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 39644 + }, + { + "epoch": 0.3457553505084509, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 39645 + }, + { + "epoch": 0.3457640717936195, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 39646 + }, + { + "epoch": 0.3457727930787881, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 39647 + }, + { + "epoch": 0.34578151436395665, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 39648 + }, + { + "epoch": 0.34579023564912525, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 39649 + }, + { + "epoch": 0.34579895693429386, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 39650 + }, + { + "epoch": 0.3458076782194624, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 39651 + }, + { + "epoch": 0.345816399504631, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 39652 + }, + { + "epoch": 0.3458251207897996, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 39653 + }, + { + "epoch": 0.34583384207496815, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 39654 + }, + { + "epoch": 0.34584256336013675, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 39655 + }, + { + "epoch": 0.34585128464530535, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 39656 + }, + { + "epoch": 0.3458600059304739, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 39657 + }, + { + "epoch": 0.3458687272156425, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 39658 + }, + { + "epoch": 0.3458774485008111, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 39659 + }, + { + "epoch": 0.34588616978597964, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 39660 + }, + { + "epoch": 0.34589489107114824, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 39661 + }, + { + "epoch": 0.34590361235631684, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 39662 + }, + { + "epoch": 0.34591233364148544, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 39663 + }, + { + "epoch": 0.345921054926654, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 39664 + }, + { + "epoch": 0.3459297762118226, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 39665 + }, + { + "epoch": 0.3459384974969912, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 39666 + }, + { + "epoch": 0.3459472187821597, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 39667 + }, + { + "epoch": 0.3459559400673283, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 39668 + }, + { + "epoch": 0.3459646613524969, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 39669 + }, + { + "epoch": 0.34597338263766547, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 39670 + }, + { + "epoch": 0.34598210392283407, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 39671 + }, + { + "epoch": 0.3459908252080027, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 39672 + }, + { + "epoch": 0.3459995464931712, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 39673 + }, + { + "epoch": 0.3460082677783398, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 39674 + }, + { + "epoch": 0.3460169890635084, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 39675 + }, + { + "epoch": 0.34602571034867696, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9852, + "step": 39676 + }, + { + "epoch": 0.34603443163384556, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 39677 + }, + { + "epoch": 0.34604315291901416, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 39678 + }, + { + "epoch": 0.3460518742041827, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 39679 + }, + { + "epoch": 0.3460605954893513, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 39680 + }, + { + "epoch": 0.3460693167745199, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 39681 + }, + { + "epoch": 0.34607803805968845, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 39682 + }, + { + "epoch": 0.34608675934485705, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 39683 + }, + { + "epoch": 0.34609548063002565, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 39684 + }, + { + "epoch": 0.3461042019151942, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 39685 + }, + { + "epoch": 0.3461129232003628, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 39686 + }, + { + "epoch": 0.3461216444855314, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 39687 + }, + { + "epoch": 0.34613036577069994, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 39688 + }, + { + "epoch": 0.34613908705586854, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 39689 + }, + { + "epoch": 0.34614780834103714, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 39690 + }, + { + "epoch": 0.34615652962620574, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 39691 + }, + { + "epoch": 0.3461652509113743, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 39692 + }, + { + "epoch": 0.3461739721965429, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 39693 + }, + { + "epoch": 0.3461826934817115, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 39694 + }, + { + "epoch": 0.34619141476688003, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 39695 + }, + { + "epoch": 0.34620013605204863, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 39696 + }, + { + "epoch": 0.34620885733721724, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 39697 + }, + { + "epoch": 0.3462175786223858, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 39698 + }, + { + "epoch": 0.3462262999075544, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 39699 + }, + { + "epoch": 0.346235021192723, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 39700 + }, + { + "epoch": 0.3462437424778915, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 39701 + }, + { + "epoch": 0.3462524637630601, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 39702 + }, + { + "epoch": 0.3462611850482287, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 39703 + }, + { + "epoch": 0.34626990633339727, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 39704 + }, + { + "epoch": 0.34627862761856587, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 39705 + }, + { + "epoch": 0.34628734890373447, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 39706 + }, + { + "epoch": 0.346296070188903, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 39707 + }, + { + "epoch": 0.3463047914740716, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 39708 + }, + { + "epoch": 0.3463135127592402, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 39709 + }, + { + "epoch": 0.34632223404440876, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 39710 + }, + { + "epoch": 0.34633095532957736, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 39711 + }, + { + "epoch": 0.34633967661474596, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 39712 + }, + { + "epoch": 0.3463483978999145, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 39713 + }, + { + "epoch": 0.3463571191850831, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 39714 + }, + { + "epoch": 0.3463658404702517, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 39715 + }, + { + "epoch": 0.34637456175542025, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 39716 + }, + { + "epoch": 0.34638328304058885, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 39717 + }, + { + "epoch": 0.34639200432575745, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 39718 + }, + { + "epoch": 0.34640072561092605, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 39719 + }, + { + "epoch": 0.3464094468960946, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 39720 + }, + { + "epoch": 0.3464181681812632, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 39721 + }, + { + "epoch": 0.3464268894664318, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 39722 + }, + { + "epoch": 0.34643561075160034, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 39723 + }, + { + "epoch": 0.34644433203676894, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 39724 + }, + { + "epoch": 0.34645305332193754, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 39725 + }, + { + "epoch": 0.3464617746071061, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 39726 + }, + { + "epoch": 0.3464704958922747, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 39727 + }, + { + "epoch": 0.3464792171774433, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 39728 + }, + { + "epoch": 0.34648793846261183, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 39729 + }, + { + "epoch": 0.34649665974778043, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 39730 + }, + { + "epoch": 0.34650538103294903, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 39731 + }, + { + "epoch": 0.3465141023181176, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 39732 + }, + { + "epoch": 0.3465228236032862, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 39733 + }, + { + "epoch": 0.3465315448884548, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 39734 + }, + { + "epoch": 0.3465402661736233, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 39735 + }, + { + "epoch": 0.3465489874587919, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 39736 + }, + { + "epoch": 0.3465577087439605, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 39737 + }, + { + "epoch": 0.34656643002912907, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 39738 + }, + { + "epoch": 0.34657515131429767, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 39739 + }, + { + "epoch": 0.34658387259946627, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 39740 + }, + { + "epoch": 0.3465925938846348, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 39741 + }, + { + "epoch": 0.3466013151698034, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 39742 + }, + { + "epoch": 0.346610036454972, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 39743 + }, + { + "epoch": 0.34661875774014056, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 39744 + }, + { + "epoch": 0.34662747902530916, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 39745 + }, + { + "epoch": 0.34663620031047776, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 39746 + }, + { + "epoch": 0.34664492159564636, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 39747 + }, + { + "epoch": 0.3466536428808149, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 39748 + }, + { + "epoch": 0.3466623641659835, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 39749 + }, + { + "epoch": 0.3466710854511521, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 39750 + }, + { + "epoch": 0.34667980673632065, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 39751 + }, + { + "epoch": 0.34668852802148925, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 39752 + }, + { + "epoch": 0.34669724930665785, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 39753 + }, + { + "epoch": 0.3467059705918264, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 39754 + }, + { + "epoch": 0.346714691876995, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 39755 + }, + { + "epoch": 0.3467234131621636, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 39756 + }, + { + "epoch": 0.34673213444733214, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9873, + "step": 39757 + }, + { + "epoch": 0.34674085573250074, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 39758 + }, + { + "epoch": 0.34674957701766934, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 39759 + }, + { + "epoch": 0.3467582983028379, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 39760 + }, + { + "epoch": 0.3467670195880065, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 39761 + }, + { + "epoch": 0.3467757408731751, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 39762 + }, + { + "epoch": 0.34678446215834363, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 39763 + }, + { + "epoch": 0.34679318344351223, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 39764 + }, + { + "epoch": 0.34680190472868083, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 39765 + }, + { + "epoch": 0.3468106260138494, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 39766 + }, + { + "epoch": 0.346819347299018, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 39767 + }, + { + "epoch": 0.3468280685841866, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 39768 + }, + { + "epoch": 0.3468367898693551, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 39769 + }, + { + "epoch": 0.3468455111545237, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 39770 + }, + { + "epoch": 0.3468542324396923, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 39771 + }, + { + "epoch": 0.3468629537248609, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 39772 + }, + { + "epoch": 0.34687167501002947, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 39773 + }, + { + "epoch": 0.34688039629519807, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 39774 + }, + { + "epoch": 0.34688911758036667, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 39775 + }, + { + "epoch": 0.3468978388655352, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 39776 + }, + { + "epoch": 0.3469065601507038, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 39777 + }, + { + "epoch": 0.3469152814358724, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 39778 + }, + { + "epoch": 0.34692400272104096, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 39779 + }, + { + "epoch": 0.34693272400620956, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 39780 + }, + { + "epoch": 0.34694144529137816, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 39781 + }, + { + "epoch": 0.3469501665765467, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 39782 + }, + { + "epoch": 0.3469588878617153, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 39783 + }, + { + "epoch": 0.3469676091468839, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 39784 + }, + { + "epoch": 0.34697633043205245, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 39785 + }, + { + "epoch": 0.34698505171722105, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 39786 + }, + { + "epoch": 0.34699377300238965, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 39787 + }, + { + "epoch": 0.3470024942875582, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 39788 + }, + { + "epoch": 0.3470112155727268, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 39789 + }, + { + "epoch": 0.3470199368578954, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 39790 + }, + { + "epoch": 0.34702865814306394, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 39791 + }, + { + "epoch": 0.34703737942823254, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 39792 + }, + { + "epoch": 0.34704610071340114, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 39793 + }, + { + "epoch": 0.3470548219985697, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 39794 + }, + { + "epoch": 0.3470635432837383, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 39795 + }, + { + "epoch": 0.3470722645689069, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 39796 + }, + { + "epoch": 0.34708098585407543, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 39797 + }, + { + "epoch": 0.34708970713924403, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 39798 + }, + { + "epoch": 0.34709842842441263, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 39799 + }, + { + "epoch": 0.34710714970958123, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 39800 + }, + { + "epoch": 0.3471158709947498, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 39801 + }, + { + "epoch": 0.3471245922799184, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 39802 + }, + { + "epoch": 0.347133313565087, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 39803 + }, + { + "epoch": 0.3471420348502555, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 39804 + }, + { + "epoch": 0.3471507561354241, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 39805 + }, + { + "epoch": 0.3471594774205927, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 39806 + }, + { + "epoch": 0.34716819870576127, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 39807 + }, + { + "epoch": 0.34717691999092987, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 39808 + }, + { + "epoch": 0.34718564127609847, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 39809 + }, + { + "epoch": 0.347194362561267, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 39810 + }, + { + "epoch": 0.3472030838464356, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 39811 + }, + { + "epoch": 0.3472118051316042, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 39812 + }, + { + "epoch": 0.34722052641677276, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 39813 + }, + { + "epoch": 0.34722924770194136, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 39814 + }, + { + "epoch": 0.34723796898710996, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 39815 + }, + { + "epoch": 0.3472466902722785, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 39816 + }, + { + "epoch": 0.3472554115574471, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 39817 + }, + { + "epoch": 0.3472641328426157, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 39818 + }, + { + "epoch": 0.34727285412778425, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 39819 + }, + { + "epoch": 0.34728157541295285, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 39820 + }, + { + "epoch": 0.34729029669812145, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 39821 + }, + { + "epoch": 0.34729901798329, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 39822 + }, + { + "epoch": 0.3473077392684586, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 39823 + }, + { + "epoch": 0.3473164605536272, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 39824 + }, + { + "epoch": 0.34732518183879574, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 39825 + }, + { + "epoch": 0.34733390312396434, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 39826 + }, + { + "epoch": 0.34734262440913294, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 39827 + }, + { + "epoch": 0.34735134569430154, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 39828 + }, + { + "epoch": 0.3473600669794701, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 39829 + }, + { + "epoch": 0.3473687882646387, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 39830 + }, + { + "epoch": 0.3473775095498073, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 39831 + }, + { + "epoch": 0.34738623083497583, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 0.9773, + "step": 39832 + }, + { + "epoch": 0.34739495212014443, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 39833 + }, + { + "epoch": 0.34740367340531303, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 39834 + }, + { + "epoch": 0.3474123946904816, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 39835 + }, + { + "epoch": 0.3474211159756502, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 39836 + }, + { + "epoch": 0.3474298372608188, + "grad_norm": 0.361328125, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 39837 + }, + { + "epoch": 0.3474385585459873, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 39838 + }, + { + "epoch": 0.3474472798311559, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 39839 + }, + { + "epoch": 0.3474560011163245, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 39840 + }, + { + "epoch": 0.34746472240149306, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 39841 + }, + { + "epoch": 0.34747344368666166, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 39842 + }, + { + "epoch": 0.34748216497183027, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 39843 + }, + { + "epoch": 0.3474908862569988, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 39844 + }, + { + "epoch": 0.3474996075421674, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 39845 + }, + { + "epoch": 0.347508328827336, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 39846 + }, + { + "epoch": 0.34751705011250456, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 39847 + }, + { + "epoch": 0.34752577139767316, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 39848 + }, + { + "epoch": 0.34753449268284176, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 39849 + }, + { + "epoch": 0.3475432139680103, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 39850 + }, + { + "epoch": 0.3475519352531789, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 39851 + }, + { + "epoch": 0.3475606565383475, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9863, + "step": 39852 + }, + { + "epoch": 0.34756937782351605, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 39853 + }, + { + "epoch": 0.34757809910868465, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 39854 + }, + { + "epoch": 0.34758682039385325, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 39855 + }, + { + "epoch": 0.34759554167902185, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 39856 + }, + { + "epoch": 0.3476042629641904, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 39857 + }, + { + "epoch": 0.347612984249359, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 39858 + }, + { + "epoch": 0.3476217055345276, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 39859 + }, + { + "epoch": 0.34763042681969614, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 39860 + }, + { + "epoch": 0.34763914810486474, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 39861 + }, + { + "epoch": 0.34764786939003334, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 39862 + }, + { + "epoch": 0.3476565906752019, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 39863 + }, + { + "epoch": 0.3476653119603705, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9829, + "step": 39864 + }, + { + "epoch": 0.3476740332455391, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 39865 + }, + { + "epoch": 0.3476827545307076, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 39866 + }, + { + "epoch": 0.3476914758158762, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 39867 + }, + { + "epoch": 0.34770019710104483, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 39868 + }, + { + "epoch": 0.3477089183862134, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 39869 + }, + { + "epoch": 0.347717639671382, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 39870 + }, + { + "epoch": 0.3477263609565506, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 39871 + }, + { + "epoch": 0.3477350822417191, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 39872 + }, + { + "epoch": 0.3477438035268877, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 39873 + }, + { + "epoch": 0.3477525248120563, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 39874 + }, + { + "epoch": 0.34776124609722486, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 39875 + }, + { + "epoch": 0.34776996738239346, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 39876 + }, + { + "epoch": 0.34777868866756206, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 39877 + }, + { + "epoch": 0.3477874099527306, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 39878 + }, + { + "epoch": 0.3477961312378992, + "grad_norm": 0.072265625, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 39879 + }, + { + "epoch": 0.3478048525230678, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 39880 + }, + { + "epoch": 0.3478135738082364, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 39881 + }, + { + "epoch": 0.34782229509340495, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 39882 + }, + { + "epoch": 0.34783101637857355, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 39883 + }, + { + "epoch": 0.34783973766374215, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 39884 + }, + { + "epoch": 0.3478484589489107, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 39885 + }, + { + "epoch": 0.3478571802340793, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 39886 + }, + { + "epoch": 0.3478659015192479, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 39887 + }, + { + "epoch": 0.34787462280441644, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 39888 + }, + { + "epoch": 0.34788334408958504, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 39889 + }, + { + "epoch": 0.34789206537475365, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 39890 + }, + { + "epoch": 0.3479007866599222, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 39891 + }, + { + "epoch": 0.3479095079450908, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 39892 + }, + { + "epoch": 0.3479182292302594, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 39893 + }, + { + "epoch": 0.34792695051542794, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 39894 + }, + { + "epoch": 0.34793567180059654, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 39895 + }, + { + "epoch": 0.34794439308576514, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 39896 + }, + { + "epoch": 0.3479531143709337, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 39897 + }, + { + "epoch": 0.3479618356561023, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 39898 + }, + { + "epoch": 0.3479705569412709, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 39899 + }, + { + "epoch": 0.3479792782264394, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 39900 + }, + { + "epoch": 0.347987999511608, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 39901 + }, + { + "epoch": 0.3479967207967766, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 39902 + }, + { + "epoch": 0.34800544208194517, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 39903 + }, + { + "epoch": 0.34801416336711377, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 39904 + }, + { + "epoch": 0.34802288465228237, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 39905 + }, + { + "epoch": 0.3480316059374509, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 39906 + }, + { + "epoch": 0.3480403272226195, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 39907 + }, + { + "epoch": 0.3480490485077881, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 39908 + }, + { + "epoch": 0.3480577697929567, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 39909 + }, + { + "epoch": 0.34806649107812526, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 39910 + }, + { + "epoch": 0.34807521236329386, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 39911 + }, + { + "epoch": 0.34808393364846246, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9842, + "step": 39912 + }, + { + "epoch": 0.348092654933631, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 39913 + }, + { + "epoch": 0.3481013762187996, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 39914 + }, + { + "epoch": 0.3481100975039682, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 39915 + }, + { + "epoch": 0.34811881878913675, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 39916 + }, + { + "epoch": 0.34812754007430535, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 39917 + }, + { + "epoch": 0.34813626135947395, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 39918 + }, + { + "epoch": 0.3481449826446425, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 39919 + }, + { + "epoch": 0.3481537039298111, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 39920 + }, + { + "epoch": 0.3481624252149797, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 39921 + }, + { + "epoch": 0.34817114650014824, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 39922 + }, + { + "epoch": 0.34817986778531684, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 39923 + }, + { + "epoch": 0.34818858907048544, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 39924 + }, + { + "epoch": 0.348197310355654, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 39925 + }, + { + "epoch": 0.3482060316408226, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 39926 + }, + { + "epoch": 0.3482147529259912, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 39927 + }, + { + "epoch": 0.34822347421115973, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 39928 + }, + { + "epoch": 0.34823219549632833, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 39929 + }, + { + "epoch": 0.34824091678149693, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 39930 + }, + { + "epoch": 0.3482496380666655, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 39931 + }, + { + "epoch": 0.3482583593518341, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 39932 + }, + { + "epoch": 0.3482670806370027, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 39933 + }, + { + "epoch": 0.3482758019221712, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 39934 + }, + { + "epoch": 0.3482845232073398, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 39935 + }, + { + "epoch": 0.3482932444925084, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 39936 + }, + { + "epoch": 0.348301965777677, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 39937 + }, + { + "epoch": 0.34831068706284557, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 39938 + }, + { + "epoch": 0.34831940834801417, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 39939 + }, + { + "epoch": 0.34832812963318277, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 39940 + }, + { + "epoch": 0.3483368509183513, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 39941 + }, + { + "epoch": 0.3483455722035199, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 39942 + }, + { + "epoch": 0.3483542934886885, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 39943 + }, + { + "epoch": 0.34836301477385706, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 39944 + }, + { + "epoch": 0.34837173605902566, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 39945 + }, + { + "epoch": 0.34838045734419426, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 39946 + }, + { + "epoch": 0.3483891786293628, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 39947 + }, + { + "epoch": 0.3483978999145314, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 39948 + }, + { + "epoch": 0.3484066211997, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 39949 + }, + { + "epoch": 0.34841534248486855, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 39950 + }, + { + "epoch": 0.34842406377003715, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 39951 + }, + { + "epoch": 0.34843278505520575, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9767, + "step": 39952 + }, + { + "epoch": 0.3484415063403743, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 39953 + }, + { + "epoch": 0.3484502276255429, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 39954 + }, + { + "epoch": 0.3484589489107115, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 39955 + }, + { + "epoch": 0.34846767019588004, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 39956 + }, + { + "epoch": 0.34847639148104864, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 39957 + }, + { + "epoch": 0.34848511276621724, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 39958 + }, + { + "epoch": 0.3484938340513858, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 39959 + }, + { + "epoch": 0.3485025553365544, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 39960 + }, + { + "epoch": 0.348511276621723, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 39961 + }, + { + "epoch": 0.34851999790689153, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 39962 + }, + { + "epoch": 0.34852871919206013, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 39963 + }, + { + "epoch": 0.34853744047722873, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 39964 + }, + { + "epoch": 0.34854616176239733, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 39965 + }, + { + "epoch": 0.3485548830475659, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 39966 + }, + { + "epoch": 0.3485636043327345, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 39967 + }, + { + "epoch": 0.3485723256179031, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 39968 + }, + { + "epoch": 0.3485810469030716, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 39969 + }, + { + "epoch": 0.3485897681882402, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 39970 + }, + { + "epoch": 0.3485984894734088, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 39971 + }, + { + "epoch": 0.34860721075857737, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 39972 + }, + { + "epoch": 0.34861593204374597, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 39973 + }, + { + "epoch": 0.34862465332891457, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 39974 + }, + { + "epoch": 0.3486333746140831, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 39975 + }, + { + "epoch": 0.3486420958992517, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 39976 + }, + { + "epoch": 0.3486508171844203, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 39977 + }, + { + "epoch": 0.34865953846958886, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 39978 + }, + { + "epoch": 0.34866825975475746, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 39979 + }, + { + "epoch": 0.34867698103992606, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 39980 + }, + { + "epoch": 0.3486857023250946, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 39981 + }, + { + "epoch": 0.3486944236102632, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 39982 + }, + { + "epoch": 0.3487031448954318, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 39983 + }, + { + "epoch": 0.34871186618060035, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 39984 + }, + { + "epoch": 0.34872058746576895, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 39985 + }, + { + "epoch": 0.34872930875093755, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 39986 + }, + { + "epoch": 0.3487380300361061, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 39987 + }, + { + "epoch": 0.3487467513212747, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 0.982, + "step": 39988 + }, + { + "epoch": 0.3487554726064433, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 39989 + }, + { + "epoch": 0.3487641938916119, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 39990 + }, + { + "epoch": 0.34877291517678044, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 39991 + }, + { + "epoch": 0.34878163646194904, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 39992 + }, + { + "epoch": 0.34879035774711764, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 39993 + }, + { + "epoch": 0.3487990790322862, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 39994 + }, + { + "epoch": 0.3488078003174548, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 39995 + }, + { + "epoch": 0.3488165216026234, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 39996 + }, + { + "epoch": 0.34882524288779193, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 39997 + }, + { + "epoch": 0.34883396417296053, + "grad_norm": 0.328125, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 39998 + }, + { + "epoch": 0.34884268545812913, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 39999 + }, + { + "epoch": 0.3488514067432977, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 40000 + }, + { + "epoch": 0.3488601280284663, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 40001 + }, + { + "epoch": 0.3488688493136349, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 40002 + }, + { + "epoch": 0.3488775705988034, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 40003 + }, + { + "epoch": 0.348886291883972, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 40004 + }, + { + "epoch": 0.3488950131691406, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 40005 + }, + { + "epoch": 0.34890373445430917, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 40006 + }, + { + "epoch": 0.34891245573947777, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 40007 + }, + { + "epoch": 0.34892117702464637, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 40008 + }, + { + "epoch": 0.3489298983098149, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 40009 + }, + { + "epoch": 0.3489386195949835, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 40010 + }, + { + "epoch": 0.3489473408801521, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 40011 + }, + { + "epoch": 0.34895606216532066, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 40012 + }, + { + "epoch": 0.34896478345048926, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 40013 + }, + { + "epoch": 0.34897350473565786, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 40014 + }, + { + "epoch": 0.3489822260208264, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 40015 + }, + { + "epoch": 0.348990947305995, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 40016 + }, + { + "epoch": 0.3489996685911636, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 40017 + }, + { + "epoch": 0.3490083898763322, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 40018 + }, + { + "epoch": 0.34901711116150075, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 40019 + }, + { + "epoch": 0.34902583244666935, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 40020 + }, + { + "epoch": 0.34903455373183795, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 40021 + }, + { + "epoch": 0.3490432750170065, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 40022 + }, + { + "epoch": 0.3490519963021751, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 40023 + }, + { + "epoch": 0.3490607175873437, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 40024 + }, + { + "epoch": 0.34906943887251224, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 40025 + }, + { + "epoch": 0.34907816015768084, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 40026 + }, + { + "epoch": 0.34908688144284944, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 40027 + }, + { + "epoch": 0.349095602728018, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 40028 + }, + { + "epoch": 0.3491043240131866, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 40029 + }, + { + "epoch": 0.3491130452983552, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 40030 + }, + { + "epoch": 0.34912176658352373, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 40031 + }, + { + "epoch": 0.34913048786869233, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 40032 + }, + { + "epoch": 0.34913920915386093, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 40033 + }, + { + "epoch": 0.3491479304390295, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 40034 + }, + { + "epoch": 0.3491566517241981, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 40035 + }, + { + "epoch": 0.3491653730093667, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 40036 + }, + { + "epoch": 0.3491740942945352, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 40037 + }, + { + "epoch": 0.3491828155797038, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 40038 + }, + { + "epoch": 0.3491915368648724, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 40039 + }, + { + "epoch": 0.34920025815004097, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 40040 + }, + { + "epoch": 0.34920897943520957, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 40041 + }, + { + "epoch": 0.34921770072037817, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 40042 + }, + { + "epoch": 0.3492264220055467, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 40043 + }, + { + "epoch": 0.3492351432907153, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 40044 + }, + { + "epoch": 0.3492438645758839, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 40045 + }, + { + "epoch": 0.3492525858610525, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 40046 + }, + { + "epoch": 0.34926130714622106, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 40047 + }, + { + "epoch": 0.34927002843138966, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 40048 + }, + { + "epoch": 0.34927874971655826, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 40049 + }, + { + "epoch": 0.3492874710017268, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 40050 + }, + { + "epoch": 0.3492961922868954, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 40051 + }, + { + "epoch": 0.349304913572064, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9771, + "step": 40052 + }, + { + "epoch": 0.34931363485723255, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 40053 + }, + { + "epoch": 0.34932235614240115, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 40054 + }, + { + "epoch": 0.34933107742756975, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 40055 + }, + { + "epoch": 0.3493397987127383, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 40056 + }, + { + "epoch": 0.3493485199979069, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 40057 + }, + { + "epoch": 0.3493572412830755, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 40058 + }, + { + "epoch": 0.34936596256824404, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 40059 + }, + { + "epoch": 0.34937468385341264, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 40060 + }, + { + "epoch": 0.34938340513858124, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 40061 + }, + { + "epoch": 0.3493921264237498, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 40062 + }, + { + "epoch": 0.3494008477089184, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 40063 + }, + { + "epoch": 0.349409568994087, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 40064 + }, + { + "epoch": 0.34941829027925553, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 40065 + }, + { + "epoch": 0.34942701156442413, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 40066 + }, + { + "epoch": 0.34943573284959273, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 40067 + }, + { + "epoch": 0.3494444541347613, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 40068 + }, + { + "epoch": 0.3494531754199299, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 40069 + }, + { + "epoch": 0.3494618967050985, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 40070 + }, + { + "epoch": 0.349470617990267, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 40071 + }, + { + "epoch": 0.3494793392754356, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 40072 + }, + { + "epoch": 0.3494880605606042, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 40073 + }, + { + "epoch": 0.3494967818457728, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 40074 + }, + { + "epoch": 0.34950550313094136, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 40075 + }, + { + "epoch": 0.34951422441610996, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 40076 + }, + { + "epoch": 0.34952294570127856, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 40077 + }, + { + "epoch": 0.3495316669864471, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 40078 + }, + { + "epoch": 0.3495403882716157, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 40079 + }, + { + "epoch": 0.3495491095567843, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 40080 + }, + { + "epoch": 0.34955783084195285, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 40081 + }, + { + "epoch": 0.34956655212712145, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 40082 + }, + { + "epoch": 0.34957527341229006, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 40083 + }, + { + "epoch": 0.3495839946974586, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 40084 + }, + { + "epoch": 0.3495927159826272, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 40085 + }, + { + "epoch": 0.3496014372677958, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 40086 + }, + { + "epoch": 0.34961015855296435, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 40087 + }, + { + "epoch": 0.34961887983813295, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 40088 + }, + { + "epoch": 0.34962760112330155, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 40089 + }, + { + "epoch": 0.3496363224084701, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 40090 + }, + { + "epoch": 0.3496450436936387, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 40091 + }, + { + "epoch": 0.3496537649788073, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 40092 + }, + { + "epoch": 0.34966248626397584, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 40093 + }, + { + "epoch": 0.34967120754914444, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 40094 + }, + { + "epoch": 0.34967992883431304, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 40095 + }, + { + "epoch": 0.3496886501194816, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 40096 + }, + { + "epoch": 0.3496973714046502, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 40097 + }, + { + "epoch": 0.3497060926898188, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 40098 + }, + { + "epoch": 0.3497148139749874, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 40099 + }, + { + "epoch": 0.3497235352601559, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 40100 + }, + { + "epoch": 0.3497322565453245, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 40101 + }, + { + "epoch": 0.3497409778304931, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 40102 + }, + { + "epoch": 0.34974969911566167, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 40103 + }, + { + "epoch": 0.34975842040083027, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 40104 + }, + { + "epoch": 0.3497671416859989, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 40105 + }, + { + "epoch": 0.3497758629711674, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 40106 + }, + { + "epoch": 0.349784584256336, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 40107 + }, + { + "epoch": 0.3497933055415046, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 40108 + }, + { + "epoch": 0.34980202682667316, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 40109 + }, + { + "epoch": 0.34981074811184176, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 40110 + }, + { + "epoch": 0.34981946939701036, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 40111 + }, + { + "epoch": 0.3498281906821789, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 40112 + }, + { + "epoch": 0.3498369119673475, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 40113 + }, + { + "epoch": 0.3498456332525161, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 40114 + }, + { + "epoch": 0.34985435453768465, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 40115 + }, + { + "epoch": 0.34986307582285325, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 40116 + }, + { + "epoch": 0.34987179710802185, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 40117 + }, + { + "epoch": 0.3498805183931904, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 40118 + }, + { + "epoch": 0.349889239678359, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 40119 + }, + { + "epoch": 0.3498979609635276, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 40120 + }, + { + "epoch": 0.34990668224869614, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 40121 + }, + { + "epoch": 0.34991540353386474, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 40122 + }, + { + "epoch": 0.34992412481903334, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 40123 + }, + { + "epoch": 0.3499328461042019, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 40124 + }, + { + "epoch": 0.3499415673893705, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 40125 + }, + { + "epoch": 0.3499502886745391, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 40126 + }, + { + "epoch": 0.3499590099597077, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 40127 + }, + { + "epoch": 0.34996773124487623, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 40128 + }, + { + "epoch": 0.34997645253004483, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 40129 + }, + { + "epoch": 0.34998517381521344, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 40130 + }, + { + "epoch": 0.349993895100382, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 40131 + }, + { + "epoch": 0.3500026163855506, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 40132 + }, + { + "epoch": 0.3500113376707192, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 40133 + }, + { + "epoch": 0.3500200589558877, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 40134 + }, + { + "epoch": 0.3500287802410563, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 40135 + }, + { + "epoch": 0.3500375015262249, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 40136 + }, + { + "epoch": 0.35004622281139347, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 40137 + }, + { + "epoch": 0.35005494409656207, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 40138 + }, + { + "epoch": 0.35006366538173067, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 40139 + }, + { + "epoch": 0.3500723866668992, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 40140 + }, + { + "epoch": 0.3500811079520678, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 40141 + }, + { + "epoch": 0.3500898292372364, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 40142 + }, + { + "epoch": 0.35009855052240496, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 40143 + }, + { + "epoch": 0.35010727180757356, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 40144 + }, + { + "epoch": 0.35011599309274216, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 40145 + }, + { + "epoch": 0.3501247143779107, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 40146 + }, + { + "epoch": 0.3501334356630793, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 40147 + }, + { + "epoch": 0.3501421569482479, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 40148 + }, + { + "epoch": 0.35015087823341645, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 40149 + }, + { + "epoch": 0.35015959951858505, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 40150 + }, + { + "epoch": 0.35016832080375365, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9803, + "step": 40151 + }, + { + "epoch": 0.3501770420889222, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 40152 + }, + { + "epoch": 0.3501857633740908, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 40153 + }, + { + "epoch": 0.3501944846592594, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 40154 + }, + { + "epoch": 0.350203205944428, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 40155 + }, + { + "epoch": 0.35021192722959654, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 40156 + }, + { + "epoch": 0.35022064851476514, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 40157 + }, + { + "epoch": 0.35022936979993374, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 40158 + }, + { + "epoch": 0.3502380910851023, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 40159 + }, + { + "epoch": 0.3502468123702709, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 40160 + }, + { + "epoch": 0.3502555336554395, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 40161 + }, + { + "epoch": 0.35026425494060803, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 40162 + }, + { + "epoch": 0.35027297622577663, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 40163 + }, + { + "epoch": 0.35028169751094523, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 40164 + }, + { + "epoch": 0.3502904187961138, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 40165 + }, + { + "epoch": 0.3502991400812824, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 40166 + }, + { + "epoch": 0.350307861366451, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 40167 + }, + { + "epoch": 0.3503165826516195, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 40168 + }, + { + "epoch": 0.3503253039367881, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 40169 + }, + { + "epoch": 0.3503340252219567, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 40170 + }, + { + "epoch": 0.35034274650712527, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 40171 + }, + { + "epoch": 0.35035146779229387, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 40172 + }, + { + "epoch": 0.35036018907746247, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 40173 + }, + { + "epoch": 0.350368910362631, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9853, + "step": 40174 + }, + { + "epoch": 0.3503776316477996, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 40175 + }, + { + "epoch": 0.3503863529329682, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 40176 + }, + { + "epoch": 0.35039507421813676, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 40177 + }, + { + "epoch": 0.35040379550330536, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 40178 + }, + { + "epoch": 0.35041251678847396, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 40179 + }, + { + "epoch": 0.3504212380736425, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 40180 + }, + { + "epoch": 0.3504299593588111, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 40181 + }, + { + "epoch": 0.3504386806439797, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 40182 + }, + { + "epoch": 0.3504474019291483, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 40183 + }, + { + "epoch": 0.35045612321431685, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 40184 + }, + { + "epoch": 0.35046484449948545, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9744, + "step": 40185 + }, + { + "epoch": 0.35047356578465405, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 40186 + }, + { + "epoch": 0.3504822870698226, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 40187 + }, + { + "epoch": 0.3504910083549912, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 40188 + }, + { + "epoch": 0.3504997296401598, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 40189 + }, + { + "epoch": 0.35050845092532834, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 40190 + }, + { + "epoch": 0.35051717221049694, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 40191 + }, + { + "epoch": 0.35052589349566554, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 40192 + }, + { + "epoch": 0.3505346147808341, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 40193 + }, + { + "epoch": 0.3505433360660027, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 40194 + }, + { + "epoch": 0.3505520573511713, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 40195 + }, + { + "epoch": 0.35056077863633983, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 40196 + }, + { + "epoch": 0.35056949992150843, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 40197 + }, + { + "epoch": 0.35057822120667703, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 40198 + }, + { + "epoch": 0.3505869424918456, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 40199 + }, + { + "epoch": 0.3505956637770142, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 40200 + }, + { + "epoch": 0.3506043850621828, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 40201 + }, + { + "epoch": 0.3506131063473513, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 40202 + }, + { + "epoch": 0.3506218276325199, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 40203 + }, + { + "epoch": 0.3506305489176885, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 40204 + }, + { + "epoch": 0.35063927020285707, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 40205 + }, + { + "epoch": 0.35064799148802567, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 40206 + }, + { + "epoch": 0.35065671277319427, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 40207 + }, + { + "epoch": 0.3506654340583628, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 40208 + }, + { + "epoch": 0.3506741553435314, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 40209 + }, + { + "epoch": 0.3506828766287, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 40210 + }, + { + "epoch": 0.3506915979138686, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 40211 + }, + { + "epoch": 0.35070031919903716, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 40212 + }, + { + "epoch": 0.35070904048420576, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 40213 + }, + { + "epoch": 0.35071776176937436, + "grad_norm": 0.380859375, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 40214 + }, + { + "epoch": 0.3507264830545429, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 40215 + }, + { + "epoch": 0.3507352043397115, + "grad_norm": 0.392578125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 40216 + }, + { + "epoch": 0.3507439256248801, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 40217 + }, + { + "epoch": 0.35075264691004865, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 40218 + }, + { + "epoch": 0.35076136819521725, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 40219 + }, + { + "epoch": 0.35077008948038585, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 40220 + }, + { + "epoch": 0.3507788107655544, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 40221 + }, + { + "epoch": 0.350787532050723, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 40222 + }, + { + "epoch": 0.3507962533358916, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 40223 + }, + { + "epoch": 0.35080497462106014, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 40224 + }, + { + "epoch": 0.35081369590622874, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 40225 + }, + { + "epoch": 0.35082241719139734, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 40226 + }, + { + "epoch": 0.3508311384765659, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 40227 + }, + { + "epoch": 0.3508398597617345, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 40228 + }, + { + "epoch": 0.3508485810469031, + "grad_norm": 0.353515625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 40229 + }, + { + "epoch": 0.35085730233207163, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 40230 + }, + { + "epoch": 0.35086602361724023, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 40231 + }, + { + "epoch": 0.35087474490240883, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 40232 + }, + { + "epoch": 0.3508834661875774, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 40233 + }, + { + "epoch": 0.350892187472746, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9789, + "step": 40234 + }, + { + "epoch": 0.3509009087579146, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 40235 + }, + { + "epoch": 0.3509096300430832, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 40236 + }, + { + "epoch": 0.3509183513282517, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 40237 + }, + { + "epoch": 0.3509270726134203, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 40238 + }, + { + "epoch": 0.3509357938985889, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 40239 + }, + { + "epoch": 0.35094451518375747, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 40240 + }, + { + "epoch": 0.35095323646892607, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 40241 + }, + { + "epoch": 0.35096195775409467, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 40242 + }, + { + "epoch": 0.3509706790392632, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 40243 + }, + { + "epoch": 0.3509794003244318, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 40244 + }, + { + "epoch": 0.3509881216096004, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 40245 + }, + { + "epoch": 0.35099684289476896, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 40246 + }, + { + "epoch": 0.35100556417993756, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 40247 + }, + { + "epoch": 0.35101428546510616, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 40248 + }, + { + "epoch": 0.3510230067502747, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 40249 + }, + { + "epoch": 0.3510317280354433, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 40250 + }, + { + "epoch": 0.3510404493206119, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 40251 + }, + { + "epoch": 0.35104917060578045, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 40252 + }, + { + "epoch": 0.35105789189094905, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 40253 + }, + { + "epoch": 0.35106661317611765, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 40254 + }, + { + "epoch": 0.3510753344612862, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 40255 + }, + { + "epoch": 0.3510840557464548, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9786, + "step": 40256 + }, + { + "epoch": 0.3510927770316234, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 40257 + }, + { + "epoch": 0.35110149831679194, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 40258 + }, + { + "epoch": 0.35111021960196054, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 40259 + }, + { + "epoch": 0.35111894088712914, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 40260 + }, + { + "epoch": 0.3511276621722977, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9862, + "step": 40261 + }, + { + "epoch": 0.3511363834574663, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 40262 + }, + { + "epoch": 0.3511451047426349, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 40263 + }, + { + "epoch": 0.3511538260278035, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 40264 + }, + { + "epoch": 0.35116254731297203, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 40265 + }, + { + "epoch": 0.35117126859814063, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 40266 + }, + { + "epoch": 0.35117998988330923, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 40267 + }, + { + "epoch": 0.3511887111684778, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 40268 + }, + { + "epoch": 0.3511974324536464, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 40269 + }, + { + "epoch": 0.351206153738815, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 40270 + }, + { + "epoch": 0.3512148750239835, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 40271 + }, + { + "epoch": 0.3512235963091521, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 40272 + }, + { + "epoch": 0.3512323175943207, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 40273 + }, + { + "epoch": 0.35124103887948926, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 40274 + }, + { + "epoch": 0.35124976016465786, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 40275 + }, + { + "epoch": 0.35125848144982647, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 40276 + }, + { + "epoch": 0.351267202734995, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 40277 + }, + { + "epoch": 0.3512759240201636, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 40278 + }, + { + "epoch": 0.3512846453053322, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 40279 + }, + { + "epoch": 0.35129336659050076, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 40280 + }, + { + "epoch": 0.35130208787566936, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 40281 + }, + { + "epoch": 0.35131080916083796, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 40282 + }, + { + "epoch": 0.3513195304460065, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 40283 + }, + { + "epoch": 0.3513282517311751, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 40284 + }, + { + "epoch": 0.3513369730163437, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9823, + "step": 40285 + }, + { + "epoch": 0.35134569430151225, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 40286 + }, + { + "epoch": 0.35135441558668085, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 40287 + }, + { + "epoch": 0.35136313687184945, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 40288 + }, + { + "epoch": 0.351371858157018, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 40289 + }, + { + "epoch": 0.3513805794421866, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9803, + "step": 40290 + }, + { + "epoch": 0.3513893007273552, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 40291 + }, + { + "epoch": 0.3513980220125238, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 40292 + }, + { + "epoch": 0.35140674329769234, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 40293 + }, + { + "epoch": 0.35141546458286094, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 40294 + }, + { + "epoch": 0.35142418586802954, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 40295 + }, + { + "epoch": 0.3514329071531981, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 40296 + }, + { + "epoch": 0.3514416284383667, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 40297 + }, + { + "epoch": 0.3514503497235353, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 40298 + }, + { + "epoch": 0.3514590710087038, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 40299 + }, + { + "epoch": 0.3514677922938724, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 40300 + }, + { + "epoch": 0.35147651357904103, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 40301 + }, + { + "epoch": 0.3514852348642096, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 40302 + }, + { + "epoch": 0.3514939561493782, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 40303 + }, + { + "epoch": 0.3515026774345468, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 40304 + }, + { + "epoch": 0.3515113987197153, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 40305 + }, + { + "epoch": 0.3515201200048839, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 40306 + }, + { + "epoch": 0.3515288412900525, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 40307 + }, + { + "epoch": 0.35153756257522106, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 40308 + }, + { + "epoch": 0.35154628386038966, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 40309 + }, + { + "epoch": 0.35155500514555826, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 40310 + }, + { + "epoch": 0.3515637264307268, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 40311 + }, + { + "epoch": 0.3515724477158954, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 40312 + }, + { + "epoch": 0.351581169001064, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 40313 + }, + { + "epoch": 0.35158989028623255, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 40314 + }, + { + "epoch": 0.35159861157140115, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 40315 + }, + { + "epoch": 0.35160733285656975, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 40316 + }, + { + "epoch": 0.3516160541417383, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 40317 + }, + { + "epoch": 0.3516247754269069, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 40318 + }, + { + "epoch": 0.3516334967120755, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9843, + "step": 40319 + }, + { + "epoch": 0.3516422179972441, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 40320 + }, + { + "epoch": 0.35165093928241264, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 40321 + }, + { + "epoch": 0.35165966056758124, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 40322 + }, + { + "epoch": 0.35166838185274985, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 40323 + }, + { + "epoch": 0.3516771031379184, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 40324 + }, + { + "epoch": 0.351685824423087, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 40325 + }, + { + "epoch": 0.3516945457082556, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 40326 + }, + { + "epoch": 0.35170326699342414, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 40327 + }, + { + "epoch": 0.35171198827859274, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 40328 + }, + { + "epoch": 0.35172070956376134, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 40329 + }, + { + "epoch": 0.3517294308489299, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 40330 + }, + { + "epoch": 0.3517381521340985, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 40331 + }, + { + "epoch": 0.3517468734192671, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 40332 + }, + { + "epoch": 0.3517555947044356, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 40333 + }, + { + "epoch": 0.3517643159896042, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 40334 + }, + { + "epoch": 0.3517730372747728, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 40335 + }, + { + "epoch": 0.35178175855994137, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 40336 + }, + { + "epoch": 0.35179047984510997, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 40337 + }, + { + "epoch": 0.35179920113027857, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 40338 + }, + { + "epoch": 0.3518079224154471, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 40339 + }, + { + "epoch": 0.3518166437006157, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 40340 + }, + { + "epoch": 0.3518253649857843, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 40341 + }, + { + "epoch": 0.35183408627095286, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 40342 + }, + { + "epoch": 0.35184280755612146, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 40343 + }, + { + "epoch": 0.35185152884129006, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 40344 + }, + { + "epoch": 0.35186025012645866, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 40345 + }, + { + "epoch": 0.3518689714116272, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 40346 + }, + { + "epoch": 0.3518776926967958, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 40347 + }, + { + "epoch": 0.3518864139819644, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 40348 + }, + { + "epoch": 0.35189513526713295, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 40349 + }, + { + "epoch": 0.35190385655230155, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 40350 + }, + { + "epoch": 0.35191257783747015, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 40351 + }, + { + "epoch": 0.3519212991226387, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 40352 + }, + { + "epoch": 0.3519300204078073, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 40353 + }, + { + "epoch": 0.3519387416929759, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 40354 + }, + { + "epoch": 0.35194746297814444, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 40355 + }, + { + "epoch": 0.35195618426331304, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 40356 + }, + { + "epoch": 0.35196490554848164, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 40357 + }, + { + "epoch": 0.3519736268336502, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 40358 + }, + { + "epoch": 0.3519823481188188, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 40359 + }, + { + "epoch": 0.3519910694039874, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 40360 + }, + { + "epoch": 0.35199979068915593, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 40361 + }, + { + "epoch": 0.35200851197432453, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 40362 + }, + { + "epoch": 0.35201723325949313, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 40363 + }, + { + "epoch": 0.3520259545446617, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 40364 + }, + { + "epoch": 0.3520346758298303, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 40365 + }, + { + "epoch": 0.3520433971149989, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 40366 + }, + { + "epoch": 0.3520521184001674, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 40367 + }, + { + "epoch": 0.352060839685336, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 40368 + }, + { + "epoch": 0.3520695609705046, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 40369 + }, + { + "epoch": 0.35207828225567317, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 40370 + }, + { + "epoch": 0.35208700354084177, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 40371 + }, + { + "epoch": 0.35209572482601037, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 40372 + }, + { + "epoch": 0.35210444611117897, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 40373 + }, + { + "epoch": 0.3521131673963475, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 40374 + }, + { + "epoch": 0.3521218886815161, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 40375 + }, + { + "epoch": 0.3521306099666847, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 40376 + }, + { + "epoch": 0.35213933125185326, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 40377 + }, + { + "epoch": 0.35214805253702186, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 40378 + }, + { + "epoch": 0.35215677382219046, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 40379 + }, + { + "epoch": 0.352165495107359, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 40380 + }, + { + "epoch": 0.3521742163925276, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 40381 + }, + { + "epoch": 0.3521829376776962, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 40382 + }, + { + "epoch": 0.35219165896286475, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 40383 + }, + { + "epoch": 0.35220038024803335, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 40384 + }, + { + "epoch": 0.35220910153320195, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 40385 + }, + { + "epoch": 0.3522178228183705, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 40386 + }, + { + "epoch": 0.3522265441035391, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 40387 + }, + { + "epoch": 0.3522352653887077, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 40388 + }, + { + "epoch": 0.35224398667387624, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 40389 + }, + { + "epoch": 0.35225270795904484, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 40390 + }, + { + "epoch": 0.35226142924421344, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 40391 + }, + { + "epoch": 0.352270150529382, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 40392 + }, + { + "epoch": 0.3522788718145506, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 40393 + }, + { + "epoch": 0.3522875930997192, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 40394 + }, + { + "epoch": 0.35229631438488773, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 40395 + }, + { + "epoch": 0.35230503567005633, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 40396 + }, + { + "epoch": 0.35231375695522493, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 40397 + }, + { + "epoch": 0.3523224782403935, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 40398 + }, + { + "epoch": 0.3523311995255621, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 40399 + }, + { + "epoch": 0.3523399208107307, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 40400 + }, + { + "epoch": 0.3523486420958993, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 40401 + }, + { + "epoch": 0.3523573633810678, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 40402 + }, + { + "epoch": 0.3523660846662364, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 40403 + }, + { + "epoch": 0.352374805951405, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 40404 + }, + { + "epoch": 0.35238352723657357, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 40405 + }, + { + "epoch": 0.35239224852174217, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 40406 + }, + { + "epoch": 0.35240096980691077, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 40407 + }, + { + "epoch": 0.3524096910920793, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 40408 + }, + { + "epoch": 0.3524184123772479, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 40409 + }, + { + "epoch": 0.3524271336624165, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 40410 + }, + { + "epoch": 0.35243585494758506, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 40411 + }, + { + "epoch": 0.35244457623275366, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 40412 + }, + { + "epoch": 0.35245329751792226, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 40413 + }, + { + "epoch": 0.3524620188030908, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 40414 + }, + { + "epoch": 0.3524707400882594, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 40415 + }, + { + "epoch": 0.352479461373428, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 40416 + }, + { + "epoch": 0.35248818265859655, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 40417 + }, + { + "epoch": 0.35249690394376515, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 40418 + }, + { + "epoch": 0.35250562522893375, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 40419 + }, + { + "epoch": 0.3525143465141023, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 40420 + }, + { + "epoch": 0.3525230677992709, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 40421 + }, + { + "epoch": 0.3525317890844395, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 40422 + }, + { + "epoch": 0.35254051036960804, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 40423 + }, + { + "epoch": 0.35254923165477664, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 40424 + }, + { + "epoch": 0.35255795293994524, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 40425 + }, + { + "epoch": 0.3525666742251138, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 40426 + }, + { + "epoch": 0.3525753955102824, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 40427 + }, + { + "epoch": 0.352584116795451, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 40428 + }, + { + "epoch": 0.3525928380806196, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 40429 + }, + { + "epoch": 0.35260155936578813, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 40430 + }, + { + "epoch": 0.35261028065095673, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 40431 + }, + { + "epoch": 0.35261900193612533, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 40432 + }, + { + "epoch": 0.3526277232212939, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 40433 + }, + { + "epoch": 0.3526364445064625, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 40434 + }, + { + "epoch": 0.3526451657916311, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 40435 + }, + { + "epoch": 0.3526538870767996, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 40436 + }, + { + "epoch": 0.3526626083619682, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 40437 + }, + { + "epoch": 0.3526713296471368, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 40438 + }, + { + "epoch": 0.35268005093230537, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 40439 + }, + { + "epoch": 0.35268877221747397, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 40440 + }, + { + "epoch": 0.35269749350264257, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 40441 + }, + { + "epoch": 0.3527062147878111, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 40442 + }, + { + "epoch": 0.3527149360729797, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 40443 + }, + { + "epoch": 0.3527236573581483, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 40444 + }, + { + "epoch": 0.35273237864331686, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 40445 + }, + { + "epoch": 0.35274109992848546, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 40446 + }, + { + "epoch": 0.35274982121365406, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 40447 + }, + { + "epoch": 0.3527585424988226, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 40448 + }, + { + "epoch": 0.3527672637839912, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 40449 + }, + { + "epoch": 0.3527759850691598, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 40450 + }, + { + "epoch": 0.35278470635432835, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9709, + "step": 40451 + }, + { + "epoch": 0.35279342763949695, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 40452 + }, + { + "epoch": 0.35280214892466555, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 40453 + }, + { + "epoch": 0.35281087020983415, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 40454 + }, + { + "epoch": 0.3528195914950027, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 40455 + }, + { + "epoch": 0.3528283127801713, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 40456 + }, + { + "epoch": 0.3528370340653399, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 40457 + }, + { + "epoch": 0.35284575535050844, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 40458 + }, + { + "epoch": 0.35285447663567704, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 40459 + }, + { + "epoch": 0.35286319792084564, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 40460 + }, + { + "epoch": 0.3528719192060142, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 40461 + }, + { + "epoch": 0.3528806404911828, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 40462 + }, + { + "epoch": 0.3528893617763514, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 40463 + }, + { + "epoch": 0.35289808306151993, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 40464 + }, + { + "epoch": 0.35290680434668853, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 40465 + }, + { + "epoch": 0.35291552563185713, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 40466 + }, + { + "epoch": 0.3529242469170257, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 40467 + }, + { + "epoch": 0.3529329682021943, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 40468 + }, + { + "epoch": 0.3529416894873629, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 40469 + }, + { + "epoch": 0.3529504107725314, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 40470 + }, + { + "epoch": 0.3529591320577, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 40471 + }, + { + "epoch": 0.3529678533428686, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 40472 + }, + { + "epoch": 0.35297657462803717, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 40473 + }, + { + "epoch": 0.35298529591320577, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 40474 + }, + { + "epoch": 0.35299401719837437, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 40475 + }, + { + "epoch": 0.3530027384835429, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 40476 + }, + { + "epoch": 0.3530114597687115, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 40477 + }, + { + "epoch": 0.3530201810538801, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 40478 + }, + { + "epoch": 0.35302890233904866, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 40479 + }, + { + "epoch": 0.35303762362421726, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 40480 + }, + { + "epoch": 0.35304634490938586, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 40481 + }, + { + "epoch": 0.35305506619455446, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 40482 + }, + { + "epoch": 0.353063787479723, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 40483 + }, + { + "epoch": 0.3530725087648916, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 40484 + }, + { + "epoch": 0.3530812300500602, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 40485 + }, + { + "epoch": 0.35308995133522875, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 40486 + }, + { + "epoch": 0.35309867262039735, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 40487 + }, + { + "epoch": 0.35310739390556595, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 40488 + }, + { + "epoch": 0.3531161151907345, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 40489 + }, + { + "epoch": 0.3531248364759031, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 40490 + }, + { + "epoch": 0.3531335577610717, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 40491 + }, + { + "epoch": 0.35314227904624024, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 40492 + }, + { + "epoch": 0.35315100033140884, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 40493 + }, + { + "epoch": 0.35315972161657744, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 40494 + }, + { + "epoch": 0.353168442901746, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 40495 + }, + { + "epoch": 0.3531771641869146, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 40496 + }, + { + "epoch": 0.3531858854720832, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 40497 + }, + { + "epoch": 0.35319460675725173, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 40498 + }, + { + "epoch": 0.35320332804242033, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 40499 + }, + { + "epoch": 0.35321204932758893, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 40500 + }, + { + "epoch": 0.3532207706127575, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 40501 + }, + { + "epoch": 0.3532294918979261, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 40502 + }, + { + "epoch": 0.3532382131830947, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 40503 + }, + { + "epoch": 0.3532469344682632, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 40504 + }, + { + "epoch": 0.3532556557534318, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 40505 + }, + { + "epoch": 0.3532643770386004, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 40506 + }, + { + "epoch": 0.35327309832376896, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 40507 + }, + { + "epoch": 0.35328181960893756, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 40508 + }, + { + "epoch": 0.35329054089410616, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 0.978, + "step": 40509 + }, + { + "epoch": 0.35329926217927476, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.983, + "step": 40510 + }, + { + "epoch": 0.3533079834644433, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 40511 + }, + { + "epoch": 0.3533167047496119, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 40512 + }, + { + "epoch": 0.3533254260347805, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 40513 + }, + { + "epoch": 0.35333414731994905, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 40514 + }, + { + "epoch": 0.35334286860511765, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 40515 + }, + { + "epoch": 0.35335158989028626, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 40516 + }, + { + "epoch": 0.3533603111754548, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 40517 + }, + { + "epoch": 0.3533690324606234, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 40518 + }, + { + "epoch": 0.353377753745792, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 40519 + }, + { + "epoch": 0.35338647503096055, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 40520 + }, + { + "epoch": 0.35339519631612915, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 40521 + }, + { + "epoch": 0.35340391760129775, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 40522 + }, + { + "epoch": 0.3534126388864663, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 40523 + }, + { + "epoch": 0.3534213601716349, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 40524 + }, + { + "epoch": 0.3534300814568035, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 40525 + }, + { + "epoch": 0.35343880274197204, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 40526 + }, + { + "epoch": 0.35344752402714064, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 40527 + }, + { + "epoch": 0.35345624531230924, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 40528 + }, + { + "epoch": 0.3534649665974778, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 40529 + }, + { + "epoch": 0.3534736878826464, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 40530 + }, + { + "epoch": 0.353482409167815, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 40531 + }, + { + "epoch": 0.3534911304529835, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 40532 + }, + { + "epoch": 0.3534998517381521, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 40533 + }, + { + "epoch": 0.3535085730233207, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 40534 + }, + { + "epoch": 0.35351729430848927, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 40535 + }, + { + "epoch": 0.35352601559365787, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 40536 + }, + { + "epoch": 0.35353473687882647, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 40537 + }, + { + "epoch": 0.3535434581639951, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 40538 + }, + { + "epoch": 0.3535521794491636, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 40539 + }, + { + "epoch": 0.3535609007343322, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 40540 + }, + { + "epoch": 0.3535696220195008, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 40541 + }, + { + "epoch": 0.35357834330466936, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.983, + "step": 40542 + }, + { + "epoch": 0.35358706458983796, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 40543 + }, + { + "epoch": 0.35359578587500656, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 40544 + }, + { + "epoch": 0.3536045071601751, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 40545 + }, + { + "epoch": 0.3536132284453437, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 40546 + }, + { + "epoch": 0.3536219497305123, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 40547 + }, + { + "epoch": 0.35363067101568085, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 40548 + }, + { + "epoch": 0.35363939230084945, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 40549 + }, + { + "epoch": 0.35364811358601805, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 40550 + }, + { + "epoch": 0.3536568348711866, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 40551 + }, + { + "epoch": 0.3536655561563552, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 40552 + }, + { + "epoch": 0.3536742774415238, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 40553 + }, + { + "epoch": 0.35368299872669234, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 40554 + }, + { + "epoch": 0.35369172001186094, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 40555 + }, + { + "epoch": 0.35370044129702954, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 40556 + }, + { + "epoch": 0.3537091625821981, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 40557 + }, + { + "epoch": 0.3537178838673667, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 40558 + }, + { + "epoch": 0.3537266051525353, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 40559 + }, + { + "epoch": 0.35373532643770383, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 40560 + }, + { + "epoch": 0.35374404772287243, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 40561 + }, + { + "epoch": 0.35375276900804103, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 40562 + }, + { + "epoch": 0.35376149029320964, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 40563 + }, + { + "epoch": 0.3537702115783782, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 40564 + }, + { + "epoch": 0.3537789328635468, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 40565 + }, + { + "epoch": 0.3537876541487154, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 40566 + }, + { + "epoch": 0.3537963754338839, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 40567 + }, + { + "epoch": 0.3538050967190525, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 40568 + }, + { + "epoch": 0.3538138180042211, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 40569 + }, + { + "epoch": 0.35382253928938967, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 40570 + }, + { + "epoch": 0.35383126057455827, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 40571 + }, + { + "epoch": 0.35383998185972687, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 40572 + }, + { + "epoch": 0.3538487031448954, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 40573 + }, + { + "epoch": 0.353857424430064, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 40574 + }, + { + "epoch": 0.3538661457152326, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 40575 + }, + { + "epoch": 0.35387486700040116, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 40576 + }, + { + "epoch": 0.35388358828556976, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 40577 + }, + { + "epoch": 0.35389230957073836, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 40578 + }, + { + "epoch": 0.3539010308559069, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 40579 + }, + { + "epoch": 0.3539097521410755, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 40580 + }, + { + "epoch": 0.3539184734262441, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 40581 + }, + { + "epoch": 0.35392719471141265, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 40582 + }, + { + "epoch": 0.35393591599658125, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 40583 + }, + { + "epoch": 0.35394463728174985, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 40584 + }, + { + "epoch": 0.3539533585669184, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 40585 + }, + { + "epoch": 0.353962079852087, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 40586 + }, + { + "epoch": 0.3539708011372556, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9841, + "step": 40587 + }, + { + "epoch": 0.35397952242242414, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 40588 + }, + { + "epoch": 0.35398824370759274, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 40589 + }, + { + "epoch": 0.35399696499276134, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 40590 + }, + { + "epoch": 0.35400568627792994, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 40591 + }, + { + "epoch": 0.3540144075630985, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 40592 + }, + { + "epoch": 0.3540231288482671, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 40593 + }, + { + "epoch": 0.3540318501334357, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 40594 + }, + { + "epoch": 0.35404057141860423, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 40595 + }, + { + "epoch": 0.35404929270377283, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 40596 + }, + { + "epoch": 0.35405801398894143, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 40597 + }, + { + "epoch": 0.35406673527411, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 40598 + }, + { + "epoch": 0.3540754565592786, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 40599 + }, + { + "epoch": 0.3540841778444472, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 40600 + }, + { + "epoch": 0.3540928991296157, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 40601 + }, + { + "epoch": 0.3541016204147843, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 40602 + }, + { + "epoch": 0.3541103416999529, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 40603 + }, + { + "epoch": 0.35411906298512147, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 40604 + }, + { + "epoch": 0.35412778427029007, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 40605 + }, + { + "epoch": 0.35413650555545867, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 40606 + }, + { + "epoch": 0.3541452268406272, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9765, + "step": 40607 + }, + { + "epoch": 0.3541539481257958, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 40608 + }, + { + "epoch": 0.3541626694109644, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 40609 + }, + { + "epoch": 0.35417139069613296, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 40610 + }, + { + "epoch": 0.35418011198130156, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 40611 + }, + { + "epoch": 0.35418883326647016, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 40612 + }, + { + "epoch": 0.3541975545516387, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 40613 + }, + { + "epoch": 0.3542062758368073, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9806, + "step": 40614 + }, + { + "epoch": 0.3542149971219759, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 40615 + }, + { + "epoch": 0.35422371840714445, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 40616 + }, + { + "epoch": 0.35423243969231305, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 40617 + }, + { + "epoch": 0.35424116097748165, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 40618 + }, + { + "epoch": 0.35424988226265025, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 40619 + }, + { + "epoch": 0.3542586035478188, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 40620 + }, + { + "epoch": 0.3542673248329874, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 40621 + }, + { + "epoch": 0.354276046118156, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 40622 + }, + { + "epoch": 0.35428476740332454, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 40623 + }, + { + "epoch": 0.35429348868849314, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 40624 + }, + { + "epoch": 0.35430220997366174, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 40625 + }, + { + "epoch": 0.3543109312588303, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 40626 + }, + { + "epoch": 0.3543196525439989, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 40627 + }, + { + "epoch": 0.3543283738291675, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 40628 + }, + { + "epoch": 0.35433709511433603, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 40629 + }, + { + "epoch": 0.35434581639950463, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 40630 + }, + { + "epoch": 0.35435453768467323, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 40631 + }, + { + "epoch": 0.3543632589698418, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 40632 + }, + { + "epoch": 0.3543719802550104, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 40633 + }, + { + "epoch": 0.354380701540179, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 40634 + }, + { + "epoch": 0.3543894228253475, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 40635 + }, + { + "epoch": 0.3543981441105161, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 40636 + }, + { + "epoch": 0.3544068653956847, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 40637 + }, + { + "epoch": 0.35441558668085327, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 40638 + }, + { + "epoch": 0.35442430796602187, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 40639 + }, + { + "epoch": 0.35443302925119047, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 40640 + }, + { + "epoch": 0.354441750536359, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 40641 + }, + { + "epoch": 0.3544504718215276, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 40642 + }, + { + "epoch": 0.3544591931066962, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 40643 + }, + { + "epoch": 0.35446791439186476, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 40644 + }, + { + "epoch": 0.35447663567703336, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 40645 + }, + { + "epoch": 0.35448535696220196, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 40646 + }, + { + "epoch": 0.35449407824737056, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9774, + "step": 40647 + }, + { + "epoch": 0.3545027995325391, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 40648 + }, + { + "epoch": 0.3545115208177077, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 40649 + }, + { + "epoch": 0.3545202421028763, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 40650 + }, + { + "epoch": 0.35452896338804485, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 40651 + }, + { + "epoch": 0.35453768467321345, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 40652 + }, + { + "epoch": 0.35454640595838205, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 40653 + }, + { + "epoch": 0.3545551272435506, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 40654 + }, + { + "epoch": 0.3545638485287192, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 40655 + }, + { + "epoch": 0.3545725698138878, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.986, + "step": 40656 + }, + { + "epoch": 0.35458129109905634, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 40657 + }, + { + "epoch": 0.35459001238422494, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 40658 + }, + { + "epoch": 0.35459873366939354, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 40659 + }, + { + "epoch": 0.3546074549545621, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 40660 + }, + { + "epoch": 0.3546161762397307, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 40661 + }, + { + "epoch": 0.3546248975248993, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 40662 + }, + { + "epoch": 0.35463361881006783, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 40663 + }, + { + "epoch": 0.35464234009523643, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 40664 + }, + { + "epoch": 0.35465106138040503, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 40665 + }, + { + "epoch": 0.3546597826655736, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 40666 + }, + { + "epoch": 0.3546685039507422, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9852, + "step": 40667 + }, + { + "epoch": 0.3546772252359108, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 40668 + }, + { + "epoch": 0.3546859465210793, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 40669 + }, + { + "epoch": 0.3546946678062479, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 40670 + }, + { + "epoch": 0.3547033890914165, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 40671 + }, + { + "epoch": 0.3547121103765851, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9832, + "step": 40672 + }, + { + "epoch": 0.35472083166175367, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 40673 + }, + { + "epoch": 0.35472955294692227, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 40674 + }, + { + "epoch": 0.35473827423209087, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 40675 + }, + { + "epoch": 0.3547469955172594, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 40676 + }, + { + "epoch": 0.354755716802428, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 40677 + }, + { + "epoch": 0.3547644380875966, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 40678 + }, + { + "epoch": 0.35477315937276516, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 40679 + }, + { + "epoch": 0.35478188065793376, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 40680 + }, + { + "epoch": 0.35479060194310236, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 40681 + }, + { + "epoch": 0.3547993232282709, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 40682 + }, + { + "epoch": 0.3548080445134395, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 40683 + }, + { + "epoch": 0.3548167657986081, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 40684 + }, + { + "epoch": 0.35482548708377665, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 40685 + }, + { + "epoch": 0.35483420836894525, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 40686 + }, + { + "epoch": 0.35484292965411385, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 40687 + }, + { + "epoch": 0.3548516509392824, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 40688 + }, + { + "epoch": 0.354860372224451, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 40689 + }, + { + "epoch": 0.3548690935096196, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 40690 + }, + { + "epoch": 0.35487781479478814, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 40691 + }, + { + "epoch": 0.35488653607995674, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 40692 + }, + { + "epoch": 0.35489525736512534, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 40693 + }, + { + "epoch": 0.3549039786502939, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 40694 + }, + { + "epoch": 0.3549126999354625, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 40695 + }, + { + "epoch": 0.3549214212206311, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 40696 + }, + { + "epoch": 0.35493014250579963, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 40697 + }, + { + "epoch": 0.35493886379096823, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 40698 + }, + { + "epoch": 0.35494758507613683, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 40699 + }, + { + "epoch": 0.35495630636130543, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 40700 + }, + { + "epoch": 0.354965027646474, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 40701 + }, + { + "epoch": 0.3549737489316426, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 40702 + }, + { + "epoch": 0.3549824702168112, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 40703 + }, + { + "epoch": 0.3549911915019797, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 40704 + }, + { + "epoch": 0.3549999127871483, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 40705 + }, + { + "epoch": 0.3550086340723169, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 40706 + }, + { + "epoch": 0.35501735535748546, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 40707 + }, + { + "epoch": 0.35502607664265406, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 40708 + }, + { + "epoch": 0.35503479792782267, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 40709 + }, + { + "epoch": 0.3550435192129912, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 40710 + }, + { + "epoch": 0.3550522404981598, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 40711 + }, + { + "epoch": 0.3550609617833284, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 40712 + }, + { + "epoch": 0.35506968306849696, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 40713 + }, + { + "epoch": 0.35507840435366556, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 40714 + }, + { + "epoch": 0.35508712563883416, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 40715 + }, + { + "epoch": 0.3550958469240027, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 40716 + }, + { + "epoch": 0.3551045682091713, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 40717 + }, + { + "epoch": 0.3551132894943399, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 40718 + }, + { + "epoch": 0.35512201077950845, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 40719 + }, + { + "epoch": 0.35513073206467705, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 40720 + }, + { + "epoch": 0.35513945334984565, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 40721 + }, + { + "epoch": 0.3551481746350142, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 40722 + }, + { + "epoch": 0.3551568959201828, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 40723 + }, + { + "epoch": 0.3551656172053514, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 40724 + }, + { + "epoch": 0.35517433849051994, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 40725 + }, + { + "epoch": 0.35518305977568854, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 40726 + }, + { + "epoch": 0.35519178106085714, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 40727 + }, + { + "epoch": 0.35520050234602574, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 40728 + }, + { + "epoch": 0.3552092236311943, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 40729 + }, + { + "epoch": 0.3552179449163629, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 40730 + }, + { + "epoch": 0.3552266662015315, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 40731 + }, + { + "epoch": 0.3552353874867, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 40732 + }, + { + "epoch": 0.3552441087718686, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 40733 + }, + { + "epoch": 0.3552528300570372, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 40734 + }, + { + "epoch": 0.3552615513422058, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 40735 + }, + { + "epoch": 0.3552702726273744, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 40736 + }, + { + "epoch": 0.355278993912543, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 40737 + }, + { + "epoch": 0.3552877151977115, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 40738 + }, + { + "epoch": 0.3552964364828801, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 40739 + }, + { + "epoch": 0.3553051577680487, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 40740 + }, + { + "epoch": 0.35531387905321726, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 40741 + }, + { + "epoch": 0.35532260033838586, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 40742 + }, + { + "epoch": 0.35533132162355446, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 40743 + }, + { + "epoch": 0.355340042908723, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 40744 + }, + { + "epoch": 0.3553487641938916, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 40745 + }, + { + "epoch": 0.3553574854790602, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 40746 + }, + { + "epoch": 0.35536620676422875, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 40747 + }, + { + "epoch": 0.35537492804939735, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 40748 + }, + { + "epoch": 0.35538364933456595, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 40749 + }, + { + "epoch": 0.3553923706197345, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 40750 + }, + { + "epoch": 0.3554010919049031, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 40751 + }, + { + "epoch": 0.3554098131900717, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 40752 + }, + { + "epoch": 0.35541853447524024, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 40753 + }, + { + "epoch": 0.35542725576040884, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 40754 + }, + { + "epoch": 0.35543597704557744, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 40755 + }, + { + "epoch": 0.35544469833074605, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 40756 + }, + { + "epoch": 0.3554534196159146, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 40757 + }, + { + "epoch": 0.3554621409010832, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 40758 + }, + { + "epoch": 0.3554708621862518, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 40759 + }, + { + "epoch": 0.35547958347142034, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 40760 + }, + { + "epoch": 0.35548830475658894, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 40761 + }, + { + "epoch": 0.35549702604175754, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 40762 + }, + { + "epoch": 0.3555057473269261, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 40763 + }, + { + "epoch": 0.3555144686120947, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 40764 + }, + { + "epoch": 0.3555231898972633, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 40765 + }, + { + "epoch": 0.3555319111824318, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.975, + "step": 40766 + }, + { + "epoch": 0.3555406324676004, + "grad_norm": 0.275390625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 40767 + }, + { + "epoch": 0.355549353752769, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 40768 + }, + { + "epoch": 0.35555807503793757, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 40769 + }, + { + "epoch": 0.35556679632310617, + "grad_norm": 0.294921875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 40770 + }, + { + "epoch": 0.35557551760827477, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 40771 + }, + { + "epoch": 0.3555842388934433, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 40772 + }, + { + "epoch": 0.3555929601786119, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 40773 + }, + { + "epoch": 0.3556016814637805, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 40774 + }, + { + "epoch": 0.35561040274894906, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 40775 + }, + { + "epoch": 0.35561912403411766, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 40776 + }, + { + "epoch": 0.35562784531928626, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 40777 + }, + { + "epoch": 0.3556365666044548, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 40778 + }, + { + "epoch": 0.3556452878896234, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 40779 + }, + { + "epoch": 0.355654009174792, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 40780 + }, + { + "epoch": 0.35566273045996055, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 40781 + }, + { + "epoch": 0.35567145174512915, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 40782 + }, + { + "epoch": 0.35568017303029775, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 40783 + }, + { + "epoch": 0.35568889431546635, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 40784 + }, + { + "epoch": 0.3556976156006349, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 40785 + }, + { + "epoch": 0.3557063368858035, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 40786 + }, + { + "epoch": 0.3557150581709721, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 40787 + }, + { + "epoch": 0.35572377945614064, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 40788 + }, + { + "epoch": 0.35573250074130924, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 40789 + }, + { + "epoch": 0.35574122202647784, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 40790 + }, + { + "epoch": 0.3557499433116464, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 40791 + }, + { + "epoch": 0.355758664596815, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 40792 + }, + { + "epoch": 0.3557673858819836, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 40793 + }, + { + "epoch": 0.35577610716715213, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 40794 + }, + { + "epoch": 0.35578482845232073, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 40795 + }, + { + "epoch": 0.35579354973748933, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 40796 + }, + { + "epoch": 0.3558022710226579, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 40797 + }, + { + "epoch": 0.3558109923078265, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 40798 + }, + { + "epoch": 0.3558197135929951, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 40799 + }, + { + "epoch": 0.3558284348781636, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 40800 + }, + { + "epoch": 0.3558371561633322, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 40801 + }, + { + "epoch": 0.3558458774485008, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 40802 + }, + { + "epoch": 0.35585459873366937, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 40803 + }, + { + "epoch": 0.35586332001883797, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 40804 + }, + { + "epoch": 0.35587204130400657, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 40805 + }, + { + "epoch": 0.3558807625891751, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 40806 + }, + { + "epoch": 0.3558894838743437, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 40807 + }, + { + "epoch": 0.3558982051595123, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 40808 + }, + { + "epoch": 0.3559069264446809, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 40809 + }, + { + "epoch": 0.35591564772984946, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 40810 + }, + { + "epoch": 0.35592436901501806, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 40811 + }, + { + "epoch": 0.35593309030018666, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 40812 + }, + { + "epoch": 0.3559418115853552, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 40813 + }, + { + "epoch": 0.3559505328705238, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 40814 + }, + { + "epoch": 0.3559592541556924, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 40815 + }, + { + "epoch": 0.35596797544086095, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 40816 + }, + { + "epoch": 0.35597669672602955, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 40817 + }, + { + "epoch": 0.35598541801119815, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 40818 + }, + { + "epoch": 0.3559941392963667, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 40819 + }, + { + "epoch": 0.3560028605815353, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 40820 + }, + { + "epoch": 0.3560115818667039, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 40821 + }, + { + "epoch": 0.35602030315187244, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 40822 + }, + { + "epoch": 0.35602902443704104, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 40823 + }, + { + "epoch": 0.35603774572220964, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 40824 + }, + { + "epoch": 0.3560464670073782, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 40825 + }, + { + "epoch": 0.3560551882925468, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 40826 + }, + { + "epoch": 0.3560639095777154, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 40827 + }, + { + "epoch": 0.35607263086288393, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 40828 + }, + { + "epoch": 0.35608135214805253, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 40829 + }, + { + "epoch": 0.35609007343322113, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 40830 + }, + { + "epoch": 0.3560987947183897, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 40831 + }, + { + "epoch": 0.3561075160035583, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 40832 + }, + { + "epoch": 0.3561162372887269, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 40833 + }, + { + "epoch": 0.3561249585738954, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 40834 + }, + { + "epoch": 0.356133679859064, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 40835 + }, + { + "epoch": 0.3561424011442326, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 40836 + }, + { + "epoch": 0.3561511224294012, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 40837 + }, + { + "epoch": 0.35615984371456977, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 40838 + }, + { + "epoch": 0.35616856499973837, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 40839 + }, + { + "epoch": 0.35617728628490697, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 40840 + }, + { + "epoch": 0.3561860075700755, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 40841 + }, + { + "epoch": 0.3561947288552441, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 40842 + }, + { + "epoch": 0.3562034501404127, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 40843 + }, + { + "epoch": 0.35621217142558126, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 40844 + }, + { + "epoch": 0.35622089271074986, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 40845 + }, + { + "epoch": 0.35622961399591846, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 40846 + }, + { + "epoch": 0.356238335281087, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 40847 + }, + { + "epoch": 0.3562470565662556, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 40848 + }, + { + "epoch": 0.3562557778514242, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 40849 + }, + { + "epoch": 0.35626449913659275, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 40850 + }, + { + "epoch": 0.35627322042176135, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 40851 + }, + { + "epoch": 0.35628194170692995, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 40852 + }, + { + "epoch": 0.3562906629920985, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 40853 + }, + { + "epoch": 0.3562993842772671, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 40854 + }, + { + "epoch": 0.3563081055624357, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 40855 + }, + { + "epoch": 0.35631682684760424, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 40856 + }, + { + "epoch": 0.35632554813277284, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 40857 + }, + { + "epoch": 0.35633426941794144, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 40858 + }, + { + "epoch": 0.35634299070311, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 40859 + }, + { + "epoch": 0.3563517119882786, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 40860 + }, + { + "epoch": 0.3563604332734472, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 40861 + }, + { + "epoch": 0.35636915455861573, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 40862 + }, + { + "epoch": 0.35637787584378433, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 40863 + }, + { + "epoch": 0.35638659712895293, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 40864 + }, + { + "epoch": 0.35639531841412153, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 40865 + }, + { + "epoch": 0.3564040396992901, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 40866 + }, + { + "epoch": 0.3564127609844587, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 40867 + }, + { + "epoch": 0.3564214822696273, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 40868 + }, + { + "epoch": 0.3564302035547958, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 40869 + }, + { + "epoch": 0.3564389248399644, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 40870 + }, + { + "epoch": 0.356447646125133, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 40871 + }, + { + "epoch": 0.35645636741030157, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 40872 + }, + { + "epoch": 0.35646508869547017, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 40873 + }, + { + "epoch": 0.35647380998063877, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 40874 + }, + { + "epoch": 0.3564825312658073, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 40875 + }, + { + "epoch": 0.3564912525509759, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 40876 + }, + { + "epoch": 0.3564999738361445, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 40877 + }, + { + "epoch": 0.35650869512131306, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 40878 + }, + { + "epoch": 0.35651741640648166, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 40879 + }, + { + "epoch": 0.35652613769165026, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 40880 + }, + { + "epoch": 0.3565348589768188, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 40881 + }, + { + "epoch": 0.3565435802619874, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 40882 + }, + { + "epoch": 0.356552301547156, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 40883 + }, + { + "epoch": 0.35656102283232455, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 40884 + }, + { + "epoch": 0.35656974411749315, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 40885 + }, + { + "epoch": 0.35657846540266175, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 40886 + }, + { + "epoch": 0.3565871866878303, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 40887 + }, + { + "epoch": 0.3565959079729989, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 40888 + }, + { + "epoch": 0.3566046292581675, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 40889 + }, + { + "epoch": 0.35661335054333604, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 40890 + }, + { + "epoch": 0.35662207182850464, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 40891 + }, + { + "epoch": 0.35663079311367324, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 40892 + }, + { + "epoch": 0.35663951439884184, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 40893 + }, + { + "epoch": 0.3566482356840104, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 40894 + }, + { + "epoch": 0.356656956969179, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 40895 + }, + { + "epoch": 0.3566656782543476, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 40896 + }, + { + "epoch": 0.35667439953951613, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 40897 + }, + { + "epoch": 0.35668312082468473, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 40898 + }, + { + "epoch": 0.35669184210985333, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 40899 + }, + { + "epoch": 0.3567005633950219, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 40900 + }, + { + "epoch": 0.3567092846801905, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 40901 + }, + { + "epoch": 0.3567180059653591, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 40902 + }, + { + "epoch": 0.3567267272505276, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 40903 + }, + { + "epoch": 0.3567354485356962, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 40904 + }, + { + "epoch": 0.3567441698208648, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 40905 + }, + { + "epoch": 0.35675289110603337, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 40906 + }, + { + "epoch": 0.35676161239120197, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 40907 + }, + { + "epoch": 0.35677033367637057, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 40908 + }, + { + "epoch": 0.3567790549615391, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 40909 + }, + { + "epoch": 0.3567877762467077, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 40910 + }, + { + "epoch": 0.3567964975318763, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 40911 + }, + { + "epoch": 0.35680521881704486, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 40912 + }, + { + "epoch": 0.35681394010221346, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 40913 + }, + { + "epoch": 0.35682266138738206, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 40914 + }, + { + "epoch": 0.3568313826725506, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 40915 + }, + { + "epoch": 0.3568401039577192, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 40916 + }, + { + "epoch": 0.3568488252428878, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 40917 + }, + { + "epoch": 0.3568575465280564, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 40918 + }, + { + "epoch": 0.35686626781322495, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 40919 + }, + { + "epoch": 0.35687498909839355, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 40920 + }, + { + "epoch": 0.35688371038356215, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 40921 + }, + { + "epoch": 0.3568924316687307, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 40922 + }, + { + "epoch": 0.3569011529538993, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 40923 + }, + { + "epoch": 0.3569098742390679, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 40924 + }, + { + "epoch": 0.35691859552423644, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 40925 + }, + { + "epoch": 0.35692731680940504, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 40926 + }, + { + "epoch": 0.35693603809457364, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 40927 + }, + { + "epoch": 0.3569447593797422, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9806, + "step": 40928 + }, + { + "epoch": 0.3569534806649108, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 40929 + }, + { + "epoch": 0.3569622019500794, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 40930 + }, + { + "epoch": 0.35697092323524793, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9854, + "step": 40931 + }, + { + "epoch": 0.35697964452041653, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 40932 + }, + { + "epoch": 0.35698836580558513, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 40933 + }, + { + "epoch": 0.3569970870907537, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 40934 + }, + { + "epoch": 0.3570058083759223, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 40935 + }, + { + "epoch": 0.3570145296610909, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 40936 + }, + { + "epoch": 0.3570232509462594, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 40937 + }, + { + "epoch": 0.357031972231428, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 40938 + }, + { + "epoch": 0.3570406935165966, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 40939 + }, + { + "epoch": 0.35704941480176516, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 40940 + }, + { + "epoch": 0.35705813608693376, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 40941 + }, + { + "epoch": 0.35706685737210236, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 40942 + }, + { + "epoch": 0.3570755786572709, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 40943 + }, + { + "epoch": 0.3570842999424395, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 40944 + }, + { + "epoch": 0.3570930212276081, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 40945 + }, + { + "epoch": 0.3571017425127767, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 40946 + }, + { + "epoch": 0.35711046379794525, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 40947 + }, + { + "epoch": 0.35711918508311385, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 40948 + }, + { + "epoch": 0.35712790636828246, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 40949 + }, + { + "epoch": 0.357136627653451, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 40950 + }, + { + "epoch": 0.3571453489386196, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 40951 + }, + { + "epoch": 0.3571540702237882, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 40952 + }, + { + "epoch": 0.35716279150895675, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 40953 + }, + { + "epoch": 0.35717151279412535, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 40954 + }, + { + "epoch": 0.35718023407929395, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 40955 + }, + { + "epoch": 0.3571889553644625, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 40956 + }, + { + "epoch": 0.3571976766496311, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 40957 + }, + { + "epoch": 0.3572063979347997, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 40958 + }, + { + "epoch": 0.35721511921996824, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 40959 + }, + { + "epoch": 0.35722384050513684, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 40960 + }, + { + "epoch": 0.35723256179030544, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 40961 + }, + { + "epoch": 0.357241283075474, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 40962 + }, + { + "epoch": 0.3572500043606426, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 40963 + }, + { + "epoch": 0.3572587256458112, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9844, + "step": 40964 + }, + { + "epoch": 0.3572674469309797, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 40965 + }, + { + "epoch": 0.3572761682161483, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 40966 + }, + { + "epoch": 0.3572848895013169, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 40967 + }, + { + "epoch": 0.35729361078648547, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 40968 + }, + { + "epoch": 0.35730233207165407, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 40969 + }, + { + "epoch": 0.35731105335682267, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 40970 + }, + { + "epoch": 0.3573197746419912, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.977, + "step": 40971 + }, + { + "epoch": 0.3573284959271598, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 40972 + }, + { + "epoch": 0.3573372172123284, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 40973 + }, + { + "epoch": 0.357345938497497, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 40974 + }, + { + "epoch": 0.35735465978266556, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 40975 + }, + { + "epoch": 0.35736338106783416, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 40976 + }, + { + "epoch": 0.35737210235300276, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 40977 + }, + { + "epoch": 0.3573808236381713, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 40978 + }, + { + "epoch": 0.3573895449233399, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 40979 + }, + { + "epoch": 0.3573982662085085, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 40980 + }, + { + "epoch": 0.35740698749367705, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 40981 + }, + { + "epoch": 0.35741570877884565, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 40982 + }, + { + "epoch": 0.35742443006401425, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 40983 + }, + { + "epoch": 0.3574331513491828, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 40984 + }, + { + "epoch": 0.3574418726343514, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 40985 + }, + { + "epoch": 0.35745059391952, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 40986 + }, + { + "epoch": 0.35745931520468854, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 40987 + }, + { + "epoch": 0.35746803648985714, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 40988 + }, + { + "epoch": 0.35747675777502574, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 40989 + }, + { + "epoch": 0.3574854790601943, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 40990 + }, + { + "epoch": 0.3574942003453629, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 40991 + }, + { + "epoch": 0.3575029216305315, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 40992 + }, + { + "epoch": 0.35751164291570003, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 40993 + }, + { + "epoch": 0.35752036420086863, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 40994 + }, + { + "epoch": 0.35752908548603723, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 40995 + }, + { + "epoch": 0.3575378067712058, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 40996 + }, + { + "epoch": 0.3575465280563744, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 40997 + }, + { + "epoch": 0.357555249341543, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 40998 + }, + { + "epoch": 0.3575639706267115, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 40999 + }, + { + "epoch": 0.3575726919118801, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 41000 + }, + { + "epoch": 0.3575814131970487, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 41001 + }, + { + "epoch": 0.3575901344822173, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 41002 + }, + { + "epoch": 0.35759885576738587, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 41003 + }, + { + "epoch": 0.35760757705255447, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 41004 + }, + { + "epoch": 0.35761629833772307, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 41005 + }, + { + "epoch": 0.3576250196228916, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 41006 + }, + { + "epoch": 0.3576337409080602, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 41007 + }, + { + "epoch": 0.3576424621932288, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 41008 + }, + { + "epoch": 0.35765118347839736, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 41009 + }, + { + "epoch": 0.35765990476356596, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 41010 + }, + { + "epoch": 0.35766862604873456, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 41011 + }, + { + "epoch": 0.3576773473339031, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 41012 + }, + { + "epoch": 0.3576860686190717, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 41013 + }, + { + "epoch": 0.3576947899042403, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 41014 + }, + { + "epoch": 0.35770351118940885, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9815, + "step": 41015 + }, + { + "epoch": 0.35771223247457745, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 41016 + }, + { + "epoch": 0.35772095375974605, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 41017 + }, + { + "epoch": 0.3577296750449146, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 41018 + }, + { + "epoch": 0.3577383963300832, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 41019 + }, + { + "epoch": 0.3577471176152518, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 41020 + }, + { + "epoch": 0.35775583890042034, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 41021 + }, + { + "epoch": 0.35776456018558894, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 41022 + }, + { + "epoch": 0.35777328147075754, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 41023 + }, + { + "epoch": 0.3577820027559261, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 41024 + }, + { + "epoch": 0.3577907240410947, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 41025 + }, + { + "epoch": 0.3577994453262633, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 41026 + }, + { + "epoch": 0.3578081666114319, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 41027 + }, + { + "epoch": 0.35781688789660043, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 41028 + }, + { + "epoch": 0.35782560918176903, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 41029 + }, + { + "epoch": 0.35783433046693763, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 41030 + }, + { + "epoch": 0.3578430517521062, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 41031 + }, + { + "epoch": 0.3578517730372748, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 41032 + }, + { + "epoch": 0.3578604943224434, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 41033 + }, + { + "epoch": 0.3578692156076119, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 41034 + }, + { + "epoch": 0.3578779368927805, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 41035 + }, + { + "epoch": 0.3578866581779491, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 41036 + }, + { + "epoch": 0.35789537946311767, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 41037 + }, + { + "epoch": 0.35790410074828627, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 41038 + }, + { + "epoch": 0.35791282203345487, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 41039 + }, + { + "epoch": 0.3579215433186234, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 41040 + }, + { + "epoch": 0.357930264603792, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 41041 + }, + { + "epoch": 0.3579389858889606, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 41042 + }, + { + "epoch": 0.35794770717412916, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 41043 + }, + { + "epoch": 0.35795642845929776, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 41044 + }, + { + "epoch": 0.35796514974446636, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 41045 + }, + { + "epoch": 0.3579738710296349, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 41046 + }, + { + "epoch": 0.3579825923148035, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 41047 + }, + { + "epoch": 0.3579913135999721, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 41048 + }, + { + "epoch": 0.35800003488514065, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 41049 + }, + { + "epoch": 0.35800875617030925, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 41050 + }, + { + "epoch": 0.35801747745547785, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 41051 + }, + { + "epoch": 0.3580261987406464, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 41052 + }, + { + "epoch": 0.358034920025815, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 41053 + }, + { + "epoch": 0.3580436413109836, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 41054 + }, + { + "epoch": 0.3580523625961522, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 41055 + }, + { + "epoch": 0.35806108388132074, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 41056 + }, + { + "epoch": 0.35806980516648934, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 41057 + }, + { + "epoch": 0.35807852645165794, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 41058 + }, + { + "epoch": 0.3580872477368265, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 41059 + }, + { + "epoch": 0.3580959690219951, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 41060 + }, + { + "epoch": 0.3581046903071637, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 41061 + }, + { + "epoch": 0.35811341159233223, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 41062 + }, + { + "epoch": 0.35812213287750083, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 41063 + }, + { + "epoch": 0.35813085416266943, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 41064 + }, + { + "epoch": 0.358139575447838, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 41065 + }, + { + "epoch": 0.3581482967330066, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 41066 + }, + { + "epoch": 0.3581570180181752, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 41067 + }, + { + "epoch": 0.3581657393033437, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 41068 + }, + { + "epoch": 0.3581744605885123, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 41069 + }, + { + "epoch": 0.3581831818736809, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9823, + "step": 41070 + }, + { + "epoch": 0.35819190315884947, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 41071 + }, + { + "epoch": 0.35820062444401807, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 41072 + }, + { + "epoch": 0.35820934572918667, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 41073 + }, + { + "epoch": 0.3582180670143552, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 41074 + }, + { + "epoch": 0.3582267882995238, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 41075 + }, + { + "epoch": 0.3582355095846924, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 41076 + }, + { + "epoch": 0.35824423086986096, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 41077 + }, + { + "epoch": 0.35825295215502956, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 41078 + }, + { + "epoch": 0.35826167344019816, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 41079 + }, + { + "epoch": 0.3582703947253667, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 41080 + }, + { + "epoch": 0.3582791160105353, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 41081 + }, + { + "epoch": 0.3582878372957039, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 41082 + }, + { + "epoch": 0.3582965585808725, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 41083 + }, + { + "epoch": 0.35830527986604105, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 41084 + }, + { + "epoch": 0.35831400115120965, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 41085 + }, + { + "epoch": 0.35832272243637825, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 41086 + }, + { + "epoch": 0.3583314437215468, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 41087 + }, + { + "epoch": 0.3583401650067154, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 41088 + }, + { + "epoch": 0.358348886291884, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 41089 + }, + { + "epoch": 0.35835760757705254, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 41090 + }, + { + "epoch": 0.35836632886222114, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 41091 + }, + { + "epoch": 0.35837505014738974, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 41092 + }, + { + "epoch": 0.3583837714325583, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 41093 + }, + { + "epoch": 0.3583924927177269, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 41094 + }, + { + "epoch": 0.3584012140028955, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9762, + "step": 41095 + }, + { + "epoch": 0.35840993528806403, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 41096 + }, + { + "epoch": 0.35841865657323263, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 41097 + }, + { + "epoch": 0.35842737785840123, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 41098 + }, + { + "epoch": 0.3584360991435698, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 41099 + }, + { + "epoch": 0.3584448204287384, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 41100 + }, + { + "epoch": 0.358453541713907, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 41101 + }, + { + "epoch": 0.3584622629990755, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 41102 + }, + { + "epoch": 0.3584709842842441, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 41103 + }, + { + "epoch": 0.3584797055694127, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 41104 + }, + { + "epoch": 0.35848842685458127, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 41105 + }, + { + "epoch": 0.35849714813974987, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 41106 + }, + { + "epoch": 0.35850586942491847, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 41107 + }, + { + "epoch": 0.358514590710087, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 41108 + }, + { + "epoch": 0.3585233119952556, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 41109 + }, + { + "epoch": 0.3585320332804242, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 41110 + }, + { + "epoch": 0.3585407545655928, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 41111 + }, + { + "epoch": 0.35854947585076136, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 41112 + }, + { + "epoch": 0.35855819713592996, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 41113 + }, + { + "epoch": 0.35856691842109856, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 41114 + }, + { + "epoch": 0.3585756397062671, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 41115 + }, + { + "epoch": 0.3585843609914357, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 41116 + }, + { + "epoch": 0.3585930822766043, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 41117 + }, + { + "epoch": 0.35860180356177285, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 41118 + }, + { + "epoch": 0.35861052484694145, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 41119 + }, + { + "epoch": 0.35861924613211005, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 41120 + }, + { + "epoch": 0.3586279674172786, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 41121 + }, + { + "epoch": 0.3586366887024472, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 41122 + }, + { + "epoch": 0.3586454099876158, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 41123 + }, + { + "epoch": 0.35865413127278434, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 41124 + }, + { + "epoch": 0.35866285255795294, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 41125 + }, + { + "epoch": 0.35867157384312154, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 41126 + }, + { + "epoch": 0.3586802951282901, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 41127 + }, + { + "epoch": 0.3586890164134587, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 41128 + }, + { + "epoch": 0.3586977376986273, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 41129 + }, + { + "epoch": 0.35870645898379583, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 41130 + }, + { + "epoch": 0.35871518026896443, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 41131 + }, + { + "epoch": 0.35872390155413303, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 41132 + }, + { + "epoch": 0.3587326228393016, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 0.9809, + "step": 41133 + }, + { + "epoch": 0.3587413441244702, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 41134 + }, + { + "epoch": 0.3587500654096388, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 41135 + }, + { + "epoch": 0.3587587866948074, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9826, + "step": 41136 + }, + { + "epoch": 0.3587675079799759, + "grad_norm": 0.3203125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 41137 + }, + { + "epoch": 0.3587762292651445, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 41138 + }, + { + "epoch": 0.3587849505503131, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 41139 + }, + { + "epoch": 0.35879367183548166, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 41140 + }, + { + "epoch": 0.35880239312065026, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 41141 + }, + { + "epoch": 0.35881111440581887, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 41142 + }, + { + "epoch": 0.3588198356909874, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 41143 + }, + { + "epoch": 0.358828556976156, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 41144 + }, + { + "epoch": 0.3588372782613246, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 41145 + }, + { + "epoch": 0.35884599954649316, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 41146 + }, + { + "epoch": 0.35885472083166176, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 41147 + }, + { + "epoch": 0.35886344211683036, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 41148 + }, + { + "epoch": 0.3588721634019989, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 41149 + }, + { + "epoch": 0.3588808846871675, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 41150 + }, + { + "epoch": 0.3588896059723361, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 41151 + }, + { + "epoch": 0.35889832725750465, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 41152 + }, + { + "epoch": 0.35890704854267325, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 41153 + }, + { + "epoch": 0.35891576982784185, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 41154 + }, + { + "epoch": 0.3589244911130104, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 41155 + }, + { + "epoch": 0.358933212398179, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 41156 + }, + { + "epoch": 0.3589419336833476, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 41157 + }, + { + "epoch": 0.35895065496851614, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 41158 + }, + { + "epoch": 0.35895937625368474, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 41159 + }, + { + "epoch": 0.35896809753885334, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 41160 + }, + { + "epoch": 0.3589768188240219, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9751, + "step": 41161 + }, + { + "epoch": 0.3589855401091905, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 41162 + }, + { + "epoch": 0.3589942613943591, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 41163 + }, + { + "epoch": 0.3590029826795277, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 41164 + }, + { + "epoch": 0.3590117039646962, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 41165 + }, + { + "epoch": 0.3590204252498648, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 41166 + }, + { + "epoch": 0.3590291465350334, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 41167 + }, + { + "epoch": 0.359037867820202, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 41168 + }, + { + "epoch": 0.3590465891053706, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 41169 + }, + { + "epoch": 0.3590553103905392, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 41170 + }, + { + "epoch": 0.3590640316757077, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 41171 + }, + { + "epoch": 0.3590727529608763, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 41172 + }, + { + "epoch": 0.3590814742460449, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 41173 + }, + { + "epoch": 0.35909019553121346, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 41174 + }, + { + "epoch": 0.35909891681638206, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 41175 + }, + { + "epoch": 0.35910763810155066, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 41176 + }, + { + "epoch": 0.3591163593867192, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 41177 + }, + { + "epoch": 0.3591250806718878, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9791, + "step": 41178 + }, + { + "epoch": 0.3591338019570564, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 41179 + }, + { + "epoch": 0.35914252324222495, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 41180 + }, + { + "epoch": 0.35915124452739355, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 41181 + }, + { + "epoch": 0.35915996581256215, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 41182 + }, + { + "epoch": 0.3591686870977307, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 41183 + }, + { + "epoch": 0.3591774083828993, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 41184 + }, + { + "epoch": 0.3591861296680679, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 41185 + }, + { + "epoch": 0.35919485095323644, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 41186 + }, + { + "epoch": 0.35920357223840504, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 41187 + }, + { + "epoch": 0.35921229352357364, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 41188 + }, + { + "epoch": 0.3592210148087422, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 41189 + }, + { + "epoch": 0.3592297360939108, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 41190 + }, + { + "epoch": 0.3592384573790794, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 41191 + }, + { + "epoch": 0.359247178664248, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 41192 + }, + { + "epoch": 0.35925589994941654, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 41193 + }, + { + "epoch": 0.35926462123458514, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 41194 + }, + { + "epoch": 0.35927334251975374, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 41195 + }, + { + "epoch": 0.3592820638049223, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 41196 + }, + { + "epoch": 0.3592907850900909, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 41197 + }, + { + "epoch": 0.3592995063752595, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 41198 + }, + { + "epoch": 0.359308227660428, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 41199 + }, + { + "epoch": 0.3593169489455966, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 41200 + }, + { + "epoch": 0.3593256702307652, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 41201 + }, + { + "epoch": 0.35933439151593377, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 41202 + }, + { + "epoch": 0.35934311280110237, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 41203 + }, + { + "epoch": 0.35935183408627097, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 41204 + }, + { + "epoch": 0.3593605553714395, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 41205 + }, + { + "epoch": 0.3593692766566081, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 41206 + }, + { + "epoch": 0.3593779979417767, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 41207 + }, + { + "epoch": 0.35938671922694526, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 41208 + }, + { + "epoch": 0.35939544051211386, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 41209 + }, + { + "epoch": 0.35940416179728246, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 41210 + }, + { + "epoch": 0.359412883082451, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 41211 + }, + { + "epoch": 0.3594216043676196, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 41212 + }, + { + "epoch": 0.3594303256527882, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 41213 + }, + { + "epoch": 0.35943904693795675, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 41214 + }, + { + "epoch": 0.35944776822312535, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 41215 + }, + { + "epoch": 0.35945648950829395, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 41216 + }, + { + "epoch": 0.3594652107934625, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 41217 + }, + { + "epoch": 0.3594739320786311, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 41218 + }, + { + "epoch": 0.3594826533637997, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 41219 + }, + { + "epoch": 0.3594913746489683, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 41220 + }, + { + "epoch": 0.35950009593413684, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 41221 + }, + { + "epoch": 0.35950881721930544, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 41222 + }, + { + "epoch": 0.35951753850447404, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9849, + "step": 41223 + }, + { + "epoch": 0.3595262597896426, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 41224 + }, + { + "epoch": 0.3595349810748112, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 41225 + }, + { + "epoch": 0.3595437023599798, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 41226 + }, + { + "epoch": 0.35955242364514833, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 41227 + }, + { + "epoch": 0.35956114493031693, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 41228 + }, + { + "epoch": 0.35956986621548553, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 41229 + }, + { + "epoch": 0.3595785875006541, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 41230 + }, + { + "epoch": 0.3595873087858227, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 41231 + }, + { + "epoch": 0.3595960300709913, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 41232 + }, + { + "epoch": 0.3596047513561598, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 41233 + }, + { + "epoch": 0.3596134726413284, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 41234 + }, + { + "epoch": 0.359622193926497, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 41235 + }, + { + "epoch": 0.35963091521166557, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 41236 + }, + { + "epoch": 0.35963963649683417, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 41237 + }, + { + "epoch": 0.35964835778200277, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 41238 + }, + { + "epoch": 0.3596570790671713, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 41239 + }, + { + "epoch": 0.3596658003523399, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 41240 + }, + { + "epoch": 0.3596745216375085, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 41241 + }, + { + "epoch": 0.35968324292267706, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 41242 + }, + { + "epoch": 0.35969196420784566, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 41243 + }, + { + "epoch": 0.35970068549301426, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 41244 + }, + { + "epoch": 0.3597094067781828, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 41245 + }, + { + "epoch": 0.3597181280633514, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 41246 + }, + { + "epoch": 0.35972684934852, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 41247 + }, + { + "epoch": 0.3597355706336886, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 41248 + }, + { + "epoch": 0.35974429191885715, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 41249 + }, + { + "epoch": 0.35975301320402575, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 41250 + }, + { + "epoch": 0.35976173448919435, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 41251 + }, + { + "epoch": 0.3597704557743629, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 41252 + }, + { + "epoch": 0.3597791770595315, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 41253 + }, + { + "epoch": 0.3597878983447001, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 41254 + }, + { + "epoch": 0.35979661962986864, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 41255 + }, + { + "epoch": 0.35980534091503724, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 41256 + }, + { + "epoch": 0.35981406220020584, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 41257 + }, + { + "epoch": 0.3598227834853744, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 41258 + }, + { + "epoch": 0.359831504770543, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 41259 + }, + { + "epoch": 0.3598402260557116, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 41260 + }, + { + "epoch": 0.35984894734088013, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 41261 + }, + { + "epoch": 0.35985766862604873, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 41262 + }, + { + "epoch": 0.35986638991121733, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 41263 + }, + { + "epoch": 0.3598751111963859, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 41264 + }, + { + "epoch": 0.3598838324815545, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 41265 + }, + { + "epoch": 0.3598925537667231, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 41266 + }, + { + "epoch": 0.3599012750518916, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9799, + "step": 41267 + }, + { + "epoch": 0.3599099963370602, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 41268 + }, + { + "epoch": 0.3599187176222288, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 41269 + }, + { + "epoch": 0.35992743890739737, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 41270 + }, + { + "epoch": 0.35993616019256597, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 41271 + }, + { + "epoch": 0.35994488147773457, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 41272 + }, + { + "epoch": 0.35995360276290317, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 41273 + }, + { + "epoch": 0.3599623240480717, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 41274 + }, + { + "epoch": 0.3599710453332403, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 41275 + }, + { + "epoch": 0.3599797666184089, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 41276 + }, + { + "epoch": 0.35998848790357746, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 41277 + }, + { + "epoch": 0.35999720918874606, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 41278 + }, + { + "epoch": 0.36000593047391466, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 41279 + }, + { + "epoch": 0.3600146517590832, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 41280 + }, + { + "epoch": 0.3600233730442518, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 41281 + }, + { + "epoch": 0.3600320943294204, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 41282 + }, + { + "epoch": 0.36004081561458895, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 41283 + }, + { + "epoch": 0.36004953689975755, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 41284 + }, + { + "epoch": 0.36005825818492615, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 41285 + }, + { + "epoch": 0.3600669794700947, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 41286 + }, + { + "epoch": 0.3600757007552633, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 41287 + }, + { + "epoch": 0.3600844220404319, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 41288 + }, + { + "epoch": 0.36009314332560044, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 41289 + }, + { + "epoch": 0.36010186461076904, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 41290 + }, + { + "epoch": 0.36011058589593764, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 41291 + }, + { + "epoch": 0.3601193071811062, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 41292 + }, + { + "epoch": 0.3601280284662748, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 41293 + }, + { + "epoch": 0.3601367497514434, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 41294 + }, + { + "epoch": 0.36014547103661193, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 41295 + }, + { + "epoch": 0.36015419232178053, + "grad_norm": 0.79296875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 41296 + }, + { + "epoch": 0.36016291360694913, + "grad_norm": 0.796875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 41297 + }, + { + "epoch": 0.3601716348921177, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 41298 + }, + { + "epoch": 0.3601803561772863, + "grad_norm": 0.54296875, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 41299 + }, + { + "epoch": 0.3601890774624549, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 41300 + }, + { + "epoch": 0.3601977987476235, + "grad_norm": 0.55859375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 41301 + }, + { + "epoch": 0.360206520032792, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 41302 + }, + { + "epoch": 0.3602152413179606, + "grad_norm": 0.451171875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 41303 + }, + { + "epoch": 0.3602239626031292, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 41304 + }, + { + "epoch": 0.36023268388829777, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 41305 + }, + { + "epoch": 0.36024140517346637, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 41306 + }, + { + "epoch": 0.36025012645863497, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 41307 + }, + { + "epoch": 0.3602588477438035, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 41308 + }, + { + "epoch": 0.3602675690289721, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 41309 + }, + { + "epoch": 0.3602762903141407, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 41310 + }, + { + "epoch": 0.36028501159930926, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 41311 + }, + { + "epoch": 0.36029373288447786, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 41312 + }, + { + "epoch": 0.36030245416964646, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 41313 + }, + { + "epoch": 0.360311175454815, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 41314 + }, + { + "epoch": 0.3603198967399836, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 41315 + }, + { + "epoch": 0.3603286180251522, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 41316 + }, + { + "epoch": 0.36033733931032075, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 41317 + }, + { + "epoch": 0.36034606059548935, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 41318 + }, + { + "epoch": 0.36035478188065795, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 41319 + }, + { + "epoch": 0.3603635031658265, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 41320 + }, + { + "epoch": 0.3603722244509951, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 41321 + }, + { + "epoch": 0.3603809457361637, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 41322 + }, + { + "epoch": 0.36038966702133224, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 41323 + }, + { + "epoch": 0.36039838830650084, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 41324 + }, + { + "epoch": 0.36040710959166944, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 41325 + }, + { + "epoch": 0.360415830876838, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 41326 + }, + { + "epoch": 0.3604245521620066, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 41327 + }, + { + "epoch": 0.3604332734471752, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 41328 + }, + { + "epoch": 0.3604419947323438, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 41329 + }, + { + "epoch": 0.36045071601751233, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 41330 + }, + { + "epoch": 0.36045943730268093, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 41331 + }, + { + "epoch": 0.36046815858784953, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 41332 + }, + { + "epoch": 0.3604768798730181, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 41333 + }, + { + "epoch": 0.3604856011581867, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 41334 + }, + { + "epoch": 0.3604943224433553, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 41335 + }, + { + "epoch": 0.3605030437285238, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 41336 + }, + { + "epoch": 0.3605117650136924, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 41337 + }, + { + "epoch": 0.360520486298861, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 41338 + }, + { + "epoch": 0.36052920758402957, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 41339 + }, + { + "epoch": 0.36053792886919817, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 41340 + }, + { + "epoch": 0.36054665015436677, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 41341 + }, + { + "epoch": 0.3605553714395353, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 41342 + }, + { + "epoch": 0.3605640927247039, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 41343 + }, + { + "epoch": 0.3605728140098725, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 41344 + }, + { + "epoch": 0.36058153529504106, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 41345 + }, + { + "epoch": 0.36059025658020966, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 41346 + }, + { + "epoch": 0.36059897786537826, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 41347 + }, + { + "epoch": 0.3606076991505468, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 41348 + }, + { + "epoch": 0.3606164204357154, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 41349 + }, + { + "epoch": 0.360625141720884, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 41350 + }, + { + "epoch": 0.36063386300605255, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 41351 + }, + { + "epoch": 0.36064258429122115, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 41352 + }, + { + "epoch": 0.36065130557638975, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 41353 + }, + { + "epoch": 0.3606600268615583, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 41354 + }, + { + "epoch": 0.3606687481467269, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 41355 + }, + { + "epoch": 0.3606774694318955, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 41356 + }, + { + "epoch": 0.3606861907170641, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 41357 + }, + { + "epoch": 0.36069491200223264, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 41358 + }, + { + "epoch": 0.36070363328740124, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 41359 + }, + { + "epoch": 0.36071235457256984, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 41360 + }, + { + "epoch": 0.3607210758577384, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 41361 + }, + { + "epoch": 0.360729797142907, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 41362 + }, + { + "epoch": 0.3607385184280756, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 41363 + }, + { + "epoch": 0.3607472397132441, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 41364 + }, + { + "epoch": 0.36075596099841273, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 41365 + }, + { + "epoch": 0.36076468228358133, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 41366 + }, + { + "epoch": 0.3607734035687499, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 41367 + }, + { + "epoch": 0.3607821248539185, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 41368 + }, + { + "epoch": 0.3607908461390871, + "grad_norm": 0.072265625, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 41369 + }, + { + "epoch": 0.3607995674242556, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 41370 + }, + { + "epoch": 0.3608082887094242, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 41371 + }, + { + "epoch": 0.3608170099945928, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 41372 + }, + { + "epoch": 0.36082573127976136, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9737, + "step": 41373 + }, + { + "epoch": 0.36083445256492996, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 41374 + }, + { + "epoch": 0.36084317385009856, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9873, + "step": 41375 + }, + { + "epoch": 0.3608518951352671, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 41376 + }, + { + "epoch": 0.3608606164204357, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 41377 + }, + { + "epoch": 0.3608693377056043, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 41378 + }, + { + "epoch": 0.36087805899077285, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 41379 + }, + { + "epoch": 0.36088678027594145, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 41380 + }, + { + "epoch": 0.36089550156111005, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 41381 + }, + { + "epoch": 0.36090422284627866, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 41382 + }, + { + "epoch": 0.3609129441314472, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 41383 + }, + { + "epoch": 0.3609216654166158, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 41384 + }, + { + "epoch": 0.3609303867017844, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 41385 + }, + { + "epoch": 0.36093910798695295, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 41386 + }, + { + "epoch": 0.36094782927212155, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 41387 + }, + { + "epoch": 0.36095655055729015, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 41388 + }, + { + "epoch": 0.3609652718424587, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 41389 + }, + { + "epoch": 0.3609739931276273, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 41390 + }, + { + "epoch": 0.3609827144127959, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 41391 + }, + { + "epoch": 0.36099143569796444, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 41392 + }, + { + "epoch": 0.36100015698313304, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 41393 + }, + { + "epoch": 0.36100887826830164, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 41394 + }, + { + "epoch": 0.3610175995534702, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 41395 + }, + { + "epoch": 0.3610263208386388, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 41396 + }, + { + "epoch": 0.3610350421238074, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 41397 + }, + { + "epoch": 0.3610437634089759, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 41398 + }, + { + "epoch": 0.3610524846941445, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 41399 + }, + { + "epoch": 0.3610612059793131, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 41400 + }, + { + "epoch": 0.36106992726448167, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 41401 + }, + { + "epoch": 0.36107864854965027, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 41402 + }, + { + "epoch": 0.36108736983481887, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 41403 + }, + { + "epoch": 0.3610960911199874, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 41404 + }, + { + "epoch": 0.361104812405156, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 41405 + }, + { + "epoch": 0.3611135336903246, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 41406 + }, + { + "epoch": 0.36112225497549316, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 41407 + }, + { + "epoch": 0.36113097626066176, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 41408 + }, + { + "epoch": 0.36113969754583036, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 41409 + }, + { + "epoch": 0.36114841883099896, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 41410 + }, + { + "epoch": 0.3611571401161675, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 41411 + }, + { + "epoch": 0.3611658614013361, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 41412 + }, + { + "epoch": 0.3611745826865047, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 41413 + }, + { + "epoch": 0.36118330397167325, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 41414 + }, + { + "epoch": 0.36119202525684185, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 41415 + }, + { + "epoch": 0.36120074654201045, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 41416 + }, + { + "epoch": 0.361209467827179, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 41417 + }, + { + "epoch": 0.3612181891123476, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 41418 + }, + { + "epoch": 0.3612269103975162, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 41419 + }, + { + "epoch": 0.36123563168268474, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 41420 + }, + { + "epoch": 0.36124435296785334, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 41421 + }, + { + "epoch": 0.36125307425302194, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 41422 + }, + { + "epoch": 0.3612617955381905, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 41423 + }, + { + "epoch": 0.3612705168233591, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 41424 + }, + { + "epoch": 0.3612792381085277, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 41425 + }, + { + "epoch": 0.36128795939369623, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 41426 + }, + { + "epoch": 0.36129668067886483, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 41427 + }, + { + "epoch": 0.36130540196403343, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 41428 + }, + { + "epoch": 0.361314123249202, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 41429 + }, + { + "epoch": 0.3613228445343706, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 41430 + }, + { + "epoch": 0.3613315658195392, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 41431 + }, + { + "epoch": 0.3613402871047077, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 41432 + }, + { + "epoch": 0.3613490083898763, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 41433 + }, + { + "epoch": 0.3613577296750449, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 41434 + }, + { + "epoch": 0.36136645096021347, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 41435 + }, + { + "epoch": 0.36137517224538207, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 41436 + }, + { + "epoch": 0.36138389353055067, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 41437 + }, + { + "epoch": 0.36139261481571927, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 41438 + }, + { + "epoch": 0.3614013361008878, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 41439 + }, + { + "epoch": 0.3614100573860564, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 41440 + }, + { + "epoch": 0.361418778671225, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 41441 + }, + { + "epoch": 0.36142749995639356, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 41442 + }, + { + "epoch": 0.36143622124156216, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 41443 + }, + { + "epoch": 0.36144494252673076, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 41444 + }, + { + "epoch": 0.3614536638118993, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 41445 + }, + { + "epoch": 0.3614623850970679, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 41446 + }, + { + "epoch": 0.3614711063822365, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 41447 + }, + { + "epoch": 0.36147982766740505, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 41448 + }, + { + "epoch": 0.36148854895257365, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 41449 + }, + { + "epoch": 0.36149727023774225, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 41450 + }, + { + "epoch": 0.3615059915229108, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 41451 + }, + { + "epoch": 0.3615147128080794, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 41452 + }, + { + "epoch": 0.361523434093248, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 41453 + }, + { + "epoch": 0.36153215537841654, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 41454 + }, + { + "epoch": 0.36154087666358514, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 41455 + }, + { + "epoch": 0.36154959794875374, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 41456 + }, + { + "epoch": 0.3615583192339223, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 41457 + }, + { + "epoch": 0.3615670405190909, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9727, + "step": 41458 + }, + { + "epoch": 0.3615757618042595, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 41459 + }, + { + "epoch": 0.36158448308942803, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 41460 + }, + { + "epoch": 0.36159320437459663, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 41461 + }, + { + "epoch": 0.36160192565976523, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 41462 + }, + { + "epoch": 0.3616106469449338, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 41463 + }, + { + "epoch": 0.3616193682301024, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 41464 + }, + { + "epoch": 0.361628089515271, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 41465 + }, + { + "epoch": 0.3616368108004396, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 41466 + }, + { + "epoch": 0.3616455320856081, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 41467 + }, + { + "epoch": 0.3616542533707767, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 41468 + }, + { + "epoch": 0.3616629746559453, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 41469 + }, + { + "epoch": 0.36167169594111387, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 41470 + }, + { + "epoch": 0.36168041722628247, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 41471 + }, + { + "epoch": 0.36168913851145107, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 41472 + }, + { + "epoch": 0.3616978597966196, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 41473 + }, + { + "epoch": 0.3617065810817882, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 41474 + }, + { + "epoch": 0.3617153023669568, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 41475 + }, + { + "epoch": 0.36172402365212536, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 41476 + }, + { + "epoch": 0.36173274493729396, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 41477 + }, + { + "epoch": 0.36174146622246256, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 41478 + }, + { + "epoch": 0.3617501875076311, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 41479 + }, + { + "epoch": 0.3617589087927997, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 41480 + }, + { + "epoch": 0.3617676300779683, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 41481 + }, + { + "epoch": 0.36177635136313685, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 41482 + }, + { + "epoch": 0.36178507264830545, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 41483 + }, + { + "epoch": 0.36179379393347405, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 41484 + }, + { + "epoch": 0.3618025152186426, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 41485 + }, + { + "epoch": 0.3618112365038112, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 41486 + }, + { + "epoch": 0.3618199577889798, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 41487 + }, + { + "epoch": 0.36182867907414834, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 41488 + }, + { + "epoch": 0.36183740035931694, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 41489 + }, + { + "epoch": 0.36184612164448554, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 41490 + }, + { + "epoch": 0.36185484292965414, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 41491 + }, + { + "epoch": 0.3618635642148227, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 41492 + }, + { + "epoch": 0.3618722854999913, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 41493 + }, + { + "epoch": 0.3618810067851599, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 41494 + }, + { + "epoch": 0.36188972807032843, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 41495 + }, + { + "epoch": 0.36189844935549703, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 41496 + }, + { + "epoch": 0.36190717064066563, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 41497 + }, + { + "epoch": 0.3619158919258342, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 41498 + }, + { + "epoch": 0.3619246132110028, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 41499 + }, + { + "epoch": 0.3619333344961714, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 41500 + }, + { + "epoch": 0.3619420557813399, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 41501 + }, + { + "epoch": 0.3619507770665085, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 41502 + }, + { + "epoch": 0.3619594983516771, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 41503 + }, + { + "epoch": 0.36196821963684567, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 41504 + }, + { + "epoch": 0.36197694092201427, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 41505 + }, + { + "epoch": 0.36198566220718287, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 41506 + }, + { + "epoch": 0.3619943834923514, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 41507 + }, + { + "epoch": 0.36200310477752, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 41508 + }, + { + "epoch": 0.3620118260626886, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 41509 + }, + { + "epoch": 0.36202054734785716, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 41510 + }, + { + "epoch": 0.36202926863302576, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 41511 + }, + { + "epoch": 0.36203798991819436, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 41512 + }, + { + "epoch": 0.3620467112033629, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 41513 + }, + { + "epoch": 0.3620554324885315, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 41514 + }, + { + "epoch": 0.3620641537737001, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 41515 + }, + { + "epoch": 0.36207287505886865, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 41516 + }, + { + "epoch": 0.36208159634403725, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 41517 + }, + { + "epoch": 0.36209031762920585, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 41518 + }, + { + "epoch": 0.36209903891437445, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 41519 + }, + { + "epoch": 0.362107760199543, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 41520 + }, + { + "epoch": 0.3621164814847116, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 41521 + }, + { + "epoch": 0.3621252027698802, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 41522 + }, + { + "epoch": 0.36213392405504874, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 41523 + }, + { + "epoch": 0.36214264534021734, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 41524 + }, + { + "epoch": 0.36215136662538594, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 41525 + }, + { + "epoch": 0.3621600879105545, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 41526 + }, + { + "epoch": 0.3621688091957231, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 41527 + }, + { + "epoch": 0.3621775304808917, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 41528 + }, + { + "epoch": 0.36218625176606023, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 41529 + }, + { + "epoch": 0.36219497305122883, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 41530 + }, + { + "epoch": 0.36220369433639743, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 41531 + }, + { + "epoch": 0.362212415621566, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 41532 + }, + { + "epoch": 0.3622211369067346, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 41533 + }, + { + "epoch": 0.3622298581919032, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 41534 + }, + { + "epoch": 0.3622385794770717, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 41535 + }, + { + "epoch": 0.3622473007622403, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 41536 + }, + { + "epoch": 0.3622560220474089, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 41537 + }, + { + "epoch": 0.36226474333257747, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 41538 + }, + { + "epoch": 0.36227346461774607, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 41539 + }, + { + "epoch": 0.36228218590291467, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 41540 + }, + { + "epoch": 0.3622909071880832, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 41541 + }, + { + "epoch": 0.3622996284732518, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 41542 + }, + { + "epoch": 0.3623083497584204, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 41543 + }, + { + "epoch": 0.36231707104358896, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 41544 + }, + { + "epoch": 0.36232579232875756, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 41545 + }, + { + "epoch": 0.36233451361392616, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 41546 + }, + { + "epoch": 0.36234323489909476, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 41547 + }, + { + "epoch": 0.3623519561842633, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 41548 + }, + { + "epoch": 0.3623606774694319, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 41549 + }, + { + "epoch": 0.3623693987546005, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 41550 + }, + { + "epoch": 0.36237812003976905, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 41551 + }, + { + "epoch": 0.36238684132493765, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 41552 + }, + { + "epoch": 0.36239556261010625, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 41553 + }, + { + "epoch": 0.3624042838952748, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9795, + "step": 41554 + }, + { + "epoch": 0.3624130051804434, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 41555 + }, + { + "epoch": 0.362421726465612, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 41556 + }, + { + "epoch": 0.36243044775078054, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 41557 + }, + { + "epoch": 0.36243916903594914, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 41558 + }, + { + "epoch": 0.36244789032111774, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 41559 + }, + { + "epoch": 0.3624566116062863, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 41560 + }, + { + "epoch": 0.3624653328914549, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 41561 + }, + { + "epoch": 0.3624740541766235, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 41562 + }, + { + "epoch": 0.36248277546179203, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 41563 + }, + { + "epoch": 0.36249149674696063, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 41564 + }, + { + "epoch": 0.36250021803212923, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 41565 + }, + { + "epoch": 0.3625089393172978, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 41566 + }, + { + "epoch": 0.3625176606024664, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 41567 + }, + { + "epoch": 0.362526381887635, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 41568 + }, + { + "epoch": 0.3625351031728035, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 41569 + }, + { + "epoch": 0.3625438244579721, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 41570 + }, + { + "epoch": 0.3625525457431407, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 41571 + }, + { + "epoch": 0.36256126702830926, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 41572 + }, + { + "epoch": 0.36256998831347786, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 41573 + }, + { + "epoch": 0.36257870959864646, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 41574 + }, + { + "epoch": 0.36258743088381507, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 41575 + }, + { + "epoch": 0.3625961521689836, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 41576 + }, + { + "epoch": 0.3626048734541522, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 41577 + }, + { + "epoch": 0.3626135947393208, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 41578 + }, + { + "epoch": 0.36262231602448936, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 41579 + }, + { + "epoch": 0.36263103730965796, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 41580 + }, + { + "epoch": 0.36263975859482656, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9807, + "step": 41581 + }, + { + "epoch": 0.3626484798799951, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 41582 + }, + { + "epoch": 0.3626572011651637, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 41583 + }, + { + "epoch": 0.3626659224503323, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 41584 + }, + { + "epoch": 0.36267464373550085, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 41585 + }, + { + "epoch": 0.36268336502066945, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 41586 + }, + { + "epoch": 0.36269208630583805, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 41587 + }, + { + "epoch": 0.3627008075910066, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 41588 + }, + { + "epoch": 0.3627095288761752, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 41589 + }, + { + "epoch": 0.3627182501613438, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 41590 + }, + { + "epoch": 0.36272697144651234, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 41591 + }, + { + "epoch": 0.36273569273168094, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 41592 + }, + { + "epoch": 0.36274441401684954, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 41593 + }, + { + "epoch": 0.3627531353020181, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 41594 + }, + { + "epoch": 0.3627618565871867, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 41595 + }, + { + "epoch": 0.3627705778723553, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 41596 + }, + { + "epoch": 0.3627792991575238, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 41597 + }, + { + "epoch": 0.3627880204426924, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 41598 + }, + { + "epoch": 0.362796741727861, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 41599 + }, + { + "epoch": 0.3628054630130296, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 41600 + }, + { + "epoch": 0.3628141842981982, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 41601 + }, + { + "epoch": 0.3628229055833668, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 41602 + }, + { + "epoch": 0.3628316268685354, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 41603 + }, + { + "epoch": 0.3628403481537039, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 41604 + }, + { + "epoch": 0.3628490694388725, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 41605 + }, + { + "epoch": 0.3628577907240411, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 41606 + }, + { + "epoch": 0.36286651200920966, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 41607 + }, + { + "epoch": 0.36287523329437826, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 41608 + }, + { + "epoch": 0.36288395457954686, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 41609 + }, + { + "epoch": 0.3628926758647154, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 41610 + }, + { + "epoch": 0.362901397149884, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 41611 + }, + { + "epoch": 0.3629101184350526, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 41612 + }, + { + "epoch": 0.36291883972022115, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 41613 + }, + { + "epoch": 0.36292756100538975, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 41614 + }, + { + "epoch": 0.36293628229055835, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 41615 + }, + { + "epoch": 0.3629450035757269, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 41616 + }, + { + "epoch": 0.3629537248608955, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 41617 + }, + { + "epoch": 0.3629624461460641, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 41618 + }, + { + "epoch": 0.36297116743123264, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 41619 + }, + { + "epoch": 0.36297988871640124, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 41620 + }, + { + "epoch": 0.36298861000156984, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 41621 + }, + { + "epoch": 0.3629973312867384, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 41622 + }, + { + "epoch": 0.363006052571907, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 41623 + }, + { + "epoch": 0.3630147738570756, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 41624 + }, + { + "epoch": 0.36302349514224413, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 41625 + }, + { + "epoch": 0.36303221642741274, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 41626 + }, + { + "epoch": 0.36304093771258134, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 41627 + }, + { + "epoch": 0.36304965899774994, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 41628 + }, + { + "epoch": 0.3630583802829185, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9848, + "step": 41629 + }, + { + "epoch": 0.3630671015680871, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 41630 + }, + { + "epoch": 0.3630758228532557, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 41631 + }, + { + "epoch": 0.3630845441384242, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 41632 + }, + { + "epoch": 0.3630932654235928, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 41633 + }, + { + "epoch": 0.3631019867087614, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 41634 + }, + { + "epoch": 0.36311070799392997, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 41635 + }, + { + "epoch": 0.36311942927909857, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 41636 + }, + { + "epoch": 0.36312815056426717, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 0.9784, + "step": 41637 + }, + { + "epoch": 0.3631368718494357, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 41638 + }, + { + "epoch": 0.3631455931346043, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 41639 + }, + { + "epoch": 0.3631543144197729, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 41640 + }, + { + "epoch": 0.36316303570494146, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 41641 + }, + { + "epoch": 0.36317175699011006, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 41642 + }, + { + "epoch": 0.36318047827527866, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 41643 + }, + { + "epoch": 0.3631891995604472, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 41644 + }, + { + "epoch": 0.3631979208456158, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 41645 + }, + { + "epoch": 0.3632066421307844, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 41646 + }, + { + "epoch": 0.36321536341595295, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 41647 + }, + { + "epoch": 0.36322408470112155, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 41648 + }, + { + "epoch": 0.36323280598629015, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 41649 + }, + { + "epoch": 0.3632415272714587, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 41650 + }, + { + "epoch": 0.3632502485566273, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 41651 + }, + { + "epoch": 0.3632589698417959, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 41652 + }, + { + "epoch": 0.36326769112696444, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 41653 + }, + { + "epoch": 0.36327641241213304, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 41654 + }, + { + "epoch": 0.36328513369730164, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 41655 + }, + { + "epoch": 0.36329385498247024, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 41656 + }, + { + "epoch": 0.3633025762676388, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 41657 + }, + { + "epoch": 0.3633112975528074, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 41658 + }, + { + "epoch": 0.363320018837976, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 41659 + }, + { + "epoch": 0.36332874012314453, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 41660 + }, + { + "epoch": 0.36333746140831313, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 41661 + }, + { + "epoch": 0.36334618269348173, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 41662 + }, + { + "epoch": 0.3633549039786503, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 41663 + }, + { + "epoch": 0.3633636252638189, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 41664 + }, + { + "epoch": 0.3633723465489875, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 41665 + }, + { + "epoch": 0.363381067834156, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 41666 + }, + { + "epoch": 0.3633897891193246, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 41667 + }, + { + "epoch": 0.3633985104044932, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 41668 + }, + { + "epoch": 0.36340723168966177, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 41669 + }, + { + "epoch": 0.36341595297483037, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 41670 + }, + { + "epoch": 0.36342467425999897, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 41671 + }, + { + "epoch": 0.3634333955451675, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 41672 + }, + { + "epoch": 0.3634421168303361, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 41673 + }, + { + "epoch": 0.3634508381155047, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 41674 + }, + { + "epoch": 0.36345955940067326, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 41675 + }, + { + "epoch": 0.36346828068584186, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 41676 + }, + { + "epoch": 0.36347700197101046, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 41677 + }, + { + "epoch": 0.363485723256179, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 41678 + }, + { + "epoch": 0.3634944445413476, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 41679 + }, + { + "epoch": 0.3635031658265162, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 41680 + }, + { + "epoch": 0.36351188711168475, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 41681 + }, + { + "epoch": 0.36352060839685335, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 41682 + }, + { + "epoch": 0.36352932968202195, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 41683 + }, + { + "epoch": 0.36353805096719055, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 41684 + }, + { + "epoch": 0.3635467722523591, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 41685 + }, + { + "epoch": 0.3635554935375277, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 41686 + }, + { + "epoch": 0.3635642148226963, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 41687 + }, + { + "epoch": 0.36357293610786484, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9862, + "step": 41688 + }, + { + "epoch": 0.36358165739303344, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 41689 + }, + { + "epoch": 0.36359037867820204, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 41690 + }, + { + "epoch": 0.3635990999633706, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 41691 + }, + { + "epoch": 0.3636078212485392, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 41692 + }, + { + "epoch": 0.3636165425337078, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 41693 + }, + { + "epoch": 0.36362526381887633, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 41694 + }, + { + "epoch": 0.36363398510404493, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 41695 + }, + { + "epoch": 0.36364270638921353, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 41696 + }, + { + "epoch": 0.3636514276743821, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 41697 + }, + { + "epoch": 0.3636601489595507, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 41698 + }, + { + "epoch": 0.3636688702447193, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 41699 + }, + { + "epoch": 0.3636775915298878, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 41700 + }, + { + "epoch": 0.3636863128150564, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 41701 + }, + { + "epoch": 0.363695034100225, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 41702 + }, + { + "epoch": 0.36370375538539357, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 41703 + }, + { + "epoch": 0.36371247667056217, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 41704 + }, + { + "epoch": 0.36372119795573077, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9851, + "step": 41705 + }, + { + "epoch": 0.3637299192408993, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 41706 + }, + { + "epoch": 0.3637386405260679, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 41707 + }, + { + "epoch": 0.3637473618112365, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 41708 + }, + { + "epoch": 0.3637560830964051, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 41709 + }, + { + "epoch": 0.36376480438157366, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 41710 + }, + { + "epoch": 0.36377352566674226, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 41711 + }, + { + "epoch": 0.36378224695191086, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 41712 + }, + { + "epoch": 0.3637909682370794, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 41713 + }, + { + "epoch": 0.363799689522248, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 41714 + }, + { + "epoch": 0.3638084108074166, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 41715 + }, + { + "epoch": 0.36381713209258515, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 41716 + }, + { + "epoch": 0.36382585337775375, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 41717 + }, + { + "epoch": 0.36383457466292235, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 41718 + }, + { + "epoch": 0.3638432959480909, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 41719 + }, + { + "epoch": 0.3638520172332595, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 41720 + }, + { + "epoch": 0.3638607385184281, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 41721 + }, + { + "epoch": 0.36386945980359664, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 41722 + }, + { + "epoch": 0.36387818108876524, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9815, + "step": 41723 + }, + { + "epoch": 0.36388690237393384, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 41724 + }, + { + "epoch": 0.3638956236591024, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 41725 + }, + { + "epoch": 0.363904344944271, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 41726 + }, + { + "epoch": 0.3639130662294396, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 41727 + }, + { + "epoch": 0.36392178751460813, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 41728 + }, + { + "epoch": 0.36393050879977673, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 41729 + }, + { + "epoch": 0.36393923008494533, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 41730 + }, + { + "epoch": 0.3639479513701139, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 41731 + }, + { + "epoch": 0.3639566726552825, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 41732 + }, + { + "epoch": 0.3639653939404511, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 41733 + }, + { + "epoch": 0.3639741152256196, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 41734 + }, + { + "epoch": 0.3639828365107882, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 41735 + }, + { + "epoch": 0.3639915577959568, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 41736 + }, + { + "epoch": 0.3640002790811254, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 41737 + }, + { + "epoch": 0.36400900036629397, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 41738 + }, + { + "epoch": 0.36401772165146257, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 41739 + }, + { + "epoch": 0.36402644293663117, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 41740 + }, + { + "epoch": 0.3640351642217997, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 41741 + }, + { + "epoch": 0.3640438855069683, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9681, + "step": 41742 + }, + { + "epoch": 0.3640526067921369, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 41743 + }, + { + "epoch": 0.36406132807730546, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 41744 + }, + { + "epoch": 0.36407004936247406, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 41745 + }, + { + "epoch": 0.36407877064764266, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 41746 + }, + { + "epoch": 0.3640874919328112, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 41747 + }, + { + "epoch": 0.3640962132179798, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 41748 + }, + { + "epoch": 0.3641049345031484, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 41749 + }, + { + "epoch": 0.36411365578831695, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 41750 + }, + { + "epoch": 0.36412237707348555, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 41751 + }, + { + "epoch": 0.36413109835865415, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 41752 + }, + { + "epoch": 0.3641398196438227, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 41753 + }, + { + "epoch": 0.3641485409289913, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 41754 + }, + { + "epoch": 0.3641572622141599, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 41755 + }, + { + "epoch": 0.36416598349932844, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 41756 + }, + { + "epoch": 0.36417470478449704, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 41757 + }, + { + "epoch": 0.36418342606966564, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 41758 + }, + { + "epoch": 0.3641921473548342, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 41759 + }, + { + "epoch": 0.3642008686400028, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 41760 + }, + { + "epoch": 0.3642095899251714, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 41761 + }, + { + "epoch": 0.36421831121033993, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 41762 + }, + { + "epoch": 0.36422703249550853, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9736, + "step": 41763 + }, + { + "epoch": 0.36423575378067713, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 41764 + }, + { + "epoch": 0.36424447506584573, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 41765 + }, + { + "epoch": 0.3642531963510143, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 41766 + }, + { + "epoch": 0.3642619176361829, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 41767 + }, + { + "epoch": 0.3642706389213515, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 41768 + }, + { + "epoch": 0.36427936020652, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 41769 + }, + { + "epoch": 0.3642880814916886, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 41770 + }, + { + "epoch": 0.3642968027768572, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 41771 + }, + { + "epoch": 0.36430552406202577, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 41772 + }, + { + "epoch": 0.36431424534719437, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 41773 + }, + { + "epoch": 0.36432296663236297, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 41774 + }, + { + "epoch": 0.3643316879175315, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 41775 + }, + { + "epoch": 0.3643404092027001, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 41776 + }, + { + "epoch": 0.3643491304878687, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 41777 + }, + { + "epoch": 0.36435785177303726, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 41778 + }, + { + "epoch": 0.36436657305820586, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 41779 + }, + { + "epoch": 0.36437529434337446, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 41780 + }, + { + "epoch": 0.364384015628543, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 41781 + }, + { + "epoch": 0.3643927369137116, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 41782 + }, + { + "epoch": 0.3644014581988802, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 41783 + }, + { + "epoch": 0.36441017948404875, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 41784 + }, + { + "epoch": 0.36441890076921735, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 41785 + }, + { + "epoch": 0.36442762205438595, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 41786 + }, + { + "epoch": 0.3644363433395545, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 41787 + }, + { + "epoch": 0.3644450646247231, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 41788 + }, + { + "epoch": 0.3644537859098917, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 41789 + }, + { + "epoch": 0.36446250719506024, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 41790 + }, + { + "epoch": 0.36447122848022884, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 41791 + }, + { + "epoch": 0.36447994976539744, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 41792 + }, + { + "epoch": 0.36448867105056604, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 41793 + }, + { + "epoch": 0.3644973923357346, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 41794 + }, + { + "epoch": 0.3645061136209032, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 41795 + }, + { + "epoch": 0.3645148349060718, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 41796 + }, + { + "epoch": 0.3645235561912403, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 41797 + }, + { + "epoch": 0.36453227747640893, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 41798 + }, + { + "epoch": 0.36454099876157753, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 41799 + }, + { + "epoch": 0.3645497200467461, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 41800 + }, + { + "epoch": 0.3645584413319147, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 41801 + }, + { + "epoch": 0.3645671626170833, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 41802 + }, + { + "epoch": 0.3645758839022518, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 41803 + }, + { + "epoch": 0.3645846051874204, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 41804 + }, + { + "epoch": 0.364593326472589, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 41805 + }, + { + "epoch": 0.36460204775775756, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 41806 + }, + { + "epoch": 0.36461076904292616, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 41807 + }, + { + "epoch": 0.36461949032809476, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 41808 + }, + { + "epoch": 0.3646282116132633, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 41809 + }, + { + "epoch": 0.3646369328984319, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 41810 + }, + { + "epoch": 0.3646456541836005, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 41811 + }, + { + "epoch": 0.36465437546876905, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 41812 + }, + { + "epoch": 0.36466309675393765, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9862, + "step": 41813 + }, + { + "epoch": 0.36467181803910625, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 41814 + }, + { + "epoch": 0.3646805393242748, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 41815 + }, + { + "epoch": 0.3646892606094434, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 41816 + }, + { + "epoch": 0.364697981894612, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 41817 + }, + { + "epoch": 0.36470670317978054, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 41818 + }, + { + "epoch": 0.36471542446494915, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 41819 + }, + { + "epoch": 0.36472414575011775, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 41820 + }, + { + "epoch": 0.36473286703528635, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 41821 + }, + { + "epoch": 0.3647415883204549, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 41822 + }, + { + "epoch": 0.3647503096056235, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 41823 + }, + { + "epoch": 0.3647590308907921, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 41824 + }, + { + "epoch": 0.36476775217596064, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 41825 + }, + { + "epoch": 0.36477647346112924, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 41826 + }, + { + "epoch": 0.36478519474629784, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 41827 + }, + { + "epoch": 0.3647939160314664, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9827, + "step": 41828 + }, + { + "epoch": 0.364802637316635, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 41829 + }, + { + "epoch": 0.3648113586018036, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 41830 + }, + { + "epoch": 0.3648200798869721, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 41831 + }, + { + "epoch": 0.3648288011721407, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 41832 + }, + { + "epoch": 0.3648375224573093, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 41833 + }, + { + "epoch": 0.36484624374247787, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 41834 + }, + { + "epoch": 0.36485496502764647, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 41835 + }, + { + "epoch": 0.36486368631281507, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 41836 + }, + { + "epoch": 0.3648724075979836, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 41837 + }, + { + "epoch": 0.3648811288831522, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 41838 + }, + { + "epoch": 0.3648898501683208, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 41839 + }, + { + "epoch": 0.36489857145348936, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 41840 + }, + { + "epoch": 0.36490729273865796, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 41841 + }, + { + "epoch": 0.36491601402382656, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 41842 + }, + { + "epoch": 0.3649247353089951, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 41843 + }, + { + "epoch": 0.3649334565941637, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 41844 + }, + { + "epoch": 0.3649421778793323, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 41845 + }, + { + "epoch": 0.3649508991645009, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 41846 + }, + { + "epoch": 0.36495962044966945, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 41847 + }, + { + "epoch": 0.36496834173483805, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 41848 + }, + { + "epoch": 0.36497706302000665, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 41849 + }, + { + "epoch": 0.3649857843051752, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 41850 + }, + { + "epoch": 0.3649945055903438, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 41851 + }, + { + "epoch": 0.3650032268755124, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 41852 + }, + { + "epoch": 0.36501194816068094, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 41853 + }, + { + "epoch": 0.36502066944584954, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 41854 + }, + { + "epoch": 0.36502939073101814, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 41855 + }, + { + "epoch": 0.3650381120161867, + "grad_norm": 0.2734375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 41856 + }, + { + "epoch": 0.3650468333013553, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 41857 + }, + { + "epoch": 0.3650555545865239, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 41858 + }, + { + "epoch": 0.36506427587169243, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 41859 + }, + { + "epoch": 0.36507299715686103, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 41860 + }, + { + "epoch": 0.36508171844202963, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 41861 + }, + { + "epoch": 0.3650904397271982, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 41862 + }, + { + "epoch": 0.3650991610123668, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 41863 + }, + { + "epoch": 0.3651078822975354, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 41864 + }, + { + "epoch": 0.3651166035827039, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 41865 + }, + { + "epoch": 0.3651253248678725, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 41866 + }, + { + "epoch": 0.3651340461530411, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 41867 + }, + { + "epoch": 0.36514276743820967, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 41868 + }, + { + "epoch": 0.36515148872337827, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 41869 + }, + { + "epoch": 0.36516021000854687, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 41870 + }, + { + "epoch": 0.3651689312937154, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 41871 + }, + { + "epoch": 0.365177652578884, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 41872 + }, + { + "epoch": 0.3651863738640526, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 41873 + }, + { + "epoch": 0.3651950951492212, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 41874 + }, + { + "epoch": 0.36520381643438976, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 41875 + }, + { + "epoch": 0.36521253771955836, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 41876 + }, + { + "epoch": 0.36522125900472696, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 41877 + }, + { + "epoch": 0.3652299802898955, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 41878 + }, + { + "epoch": 0.3652387015750641, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 41879 + }, + { + "epoch": 0.3652474228602327, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 41880 + }, + { + "epoch": 0.36525614414540125, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 41881 + }, + { + "epoch": 0.36526486543056985, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 41882 + }, + { + "epoch": 0.36527358671573845, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 41883 + }, + { + "epoch": 0.365282308000907, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 41884 + }, + { + "epoch": 0.3652910292860756, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 41885 + }, + { + "epoch": 0.3652997505712442, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 41886 + }, + { + "epoch": 0.36530847185641274, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 41887 + }, + { + "epoch": 0.36531719314158134, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 41888 + }, + { + "epoch": 0.36532591442674994, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 41889 + }, + { + "epoch": 0.3653346357119185, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 41890 + }, + { + "epoch": 0.3653433569970871, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 41891 + }, + { + "epoch": 0.3653520782822557, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 41892 + }, + { + "epoch": 0.36536079956742423, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 41893 + }, + { + "epoch": 0.36536952085259283, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 41894 + }, + { + "epoch": 0.36537824213776143, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 41895 + }, + { + "epoch": 0.36538696342293, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 41896 + }, + { + "epoch": 0.3653956847080986, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 41897 + }, + { + "epoch": 0.3654044059932672, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 41898 + }, + { + "epoch": 0.3654131272784357, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 41899 + }, + { + "epoch": 0.3654218485636043, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 41900 + }, + { + "epoch": 0.3654305698487729, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 41901 + }, + { + "epoch": 0.3654392911339415, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 41902 + }, + { + "epoch": 0.36544801241911007, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 41903 + }, + { + "epoch": 0.36545673370427867, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 41904 + }, + { + "epoch": 0.36546545498944727, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 41905 + }, + { + "epoch": 0.3654741762746158, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 41906 + }, + { + "epoch": 0.3654828975597844, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 41907 + }, + { + "epoch": 0.365491618844953, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9828, + "step": 41908 + }, + { + "epoch": 0.36550034013012156, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 41909 + }, + { + "epoch": 0.36550906141529016, + "grad_norm": 0.0712890625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 41910 + }, + { + "epoch": 0.36551778270045876, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 41911 + }, + { + "epoch": 0.3655265039856273, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 41912 + }, + { + "epoch": 0.3655352252707959, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 41913 + }, + { + "epoch": 0.3655439465559645, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 41914 + }, + { + "epoch": 0.36555266784113305, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 41915 + }, + { + "epoch": 0.36556138912630165, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 41916 + }, + { + "epoch": 0.36557011041147025, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 41917 + }, + { + "epoch": 0.3655788316966388, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 41918 + }, + { + "epoch": 0.3655875529818074, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 41919 + }, + { + "epoch": 0.365596274266976, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 41920 + }, + { + "epoch": 0.36560499555214454, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 41921 + }, + { + "epoch": 0.36561371683731314, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 41922 + }, + { + "epoch": 0.36562243812248174, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 41923 + }, + { + "epoch": 0.3656311594076503, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 41924 + }, + { + "epoch": 0.3656398806928189, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 41925 + }, + { + "epoch": 0.3656486019779875, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 41926 + }, + { + "epoch": 0.36565732326315603, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 41927 + }, + { + "epoch": 0.36566604454832463, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 41928 + }, + { + "epoch": 0.36567476583349323, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 41929 + }, + { + "epoch": 0.36568348711866183, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 41930 + }, + { + "epoch": 0.3656922084038304, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 41931 + }, + { + "epoch": 0.365700929688999, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 41932 + }, + { + "epoch": 0.3657096509741676, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 41933 + }, + { + "epoch": 0.3657183722593361, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 41934 + }, + { + "epoch": 0.3657270935445047, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 41935 + }, + { + "epoch": 0.3657358148296733, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 41936 + }, + { + "epoch": 0.36574453611484187, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 41937 + }, + { + "epoch": 0.36575325740001047, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 41938 + }, + { + "epoch": 0.36576197868517907, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 41939 + }, + { + "epoch": 0.3657706999703476, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 41940 + }, + { + "epoch": 0.3657794212555162, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 41941 + }, + { + "epoch": 0.3657881425406848, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 41942 + }, + { + "epoch": 0.36579686382585336, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 41943 + }, + { + "epoch": 0.36580558511102196, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 41944 + }, + { + "epoch": 0.36581430639619056, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 41945 + }, + { + "epoch": 0.3658230276813591, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 41946 + }, + { + "epoch": 0.3658317489665277, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 41947 + }, + { + "epoch": 0.3658404702516963, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 41948 + }, + { + "epoch": 0.36584919153686485, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 41949 + }, + { + "epoch": 0.36585791282203345, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 41950 + }, + { + "epoch": 0.36586663410720205, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 41951 + }, + { + "epoch": 0.3658753553923706, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 41952 + }, + { + "epoch": 0.3658840766775392, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 41953 + }, + { + "epoch": 0.3658927979627078, + "grad_norm": 0.07275390625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 41954 + }, + { + "epoch": 0.3659015192478764, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 41955 + }, + { + "epoch": 0.36591024053304494, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 41956 + }, + { + "epoch": 0.36591896181821354, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 41957 + }, + { + "epoch": 0.36592768310338214, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 41958 + }, + { + "epoch": 0.3659364043885507, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 41959 + }, + { + "epoch": 0.3659451256737193, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 41960 + }, + { + "epoch": 0.3659538469588879, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 41961 + }, + { + "epoch": 0.36596256824405643, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 41962 + }, + { + "epoch": 0.36597128952922503, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 41963 + }, + { + "epoch": 0.36598001081439363, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 41964 + }, + { + "epoch": 0.3659887320995622, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 41965 + }, + { + "epoch": 0.3659974533847308, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 41966 + }, + { + "epoch": 0.3660061746698994, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 41967 + }, + { + "epoch": 0.3660148959550679, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 41968 + }, + { + "epoch": 0.3660236172402365, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 41969 + }, + { + "epoch": 0.3660323385254051, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 41970 + }, + { + "epoch": 0.36604105981057367, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 41971 + }, + { + "epoch": 0.36604978109574227, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 41972 + }, + { + "epoch": 0.36605850238091087, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 41973 + }, + { + "epoch": 0.3660672236660794, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 41974 + }, + { + "epoch": 0.366075944951248, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 41975 + }, + { + "epoch": 0.3660846662364166, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 41976 + }, + { + "epoch": 0.36609338752158516, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 41977 + }, + { + "epoch": 0.36610210880675376, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 41978 + }, + { + "epoch": 0.36611083009192236, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 41979 + }, + { + "epoch": 0.3661195513770909, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 41980 + }, + { + "epoch": 0.3661282726622595, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 41981 + }, + { + "epoch": 0.3661369939474281, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 41982 + }, + { + "epoch": 0.3661457152325967, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 41983 + }, + { + "epoch": 0.36615443651776525, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 41984 + }, + { + "epoch": 0.36616315780293385, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 41985 + }, + { + "epoch": 0.36617187908810245, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 41986 + }, + { + "epoch": 0.366180600373271, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 41987 + }, + { + "epoch": 0.3661893216584396, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 41988 + }, + { + "epoch": 0.3661980429436082, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 41989 + }, + { + "epoch": 0.36620676422877674, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 41990 + }, + { + "epoch": 0.36621548551394534, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 41991 + }, + { + "epoch": 0.36622420679911394, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 41992 + }, + { + "epoch": 0.3662329280842825, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 41993 + }, + { + "epoch": 0.3662416493694511, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 41994 + }, + { + "epoch": 0.3662503706546197, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 41995 + }, + { + "epoch": 0.36625909193978823, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 41996 + }, + { + "epoch": 0.36626781322495683, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 41997 + }, + { + "epoch": 0.36627653451012543, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 41998 + }, + { + "epoch": 0.366285255795294, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 41999 + }, + { + "epoch": 0.3662939770804626, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 42000 + }, + { + "epoch": 0.3663026983656312, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 42001 + }, + { + "epoch": 0.3663114196507997, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 42002 + }, + { + "epoch": 0.3663201409359683, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 42003 + }, + { + "epoch": 0.3663288622211369, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 42004 + }, + { + "epoch": 0.36633758350630546, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 42005 + }, + { + "epoch": 0.36634630479147406, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 42006 + }, + { + "epoch": 0.36635502607664266, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 42007 + }, + { + "epoch": 0.3663637473618112, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 42008 + }, + { + "epoch": 0.3663724686469798, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 42009 + }, + { + "epoch": 0.3663811899321484, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 42010 + }, + { + "epoch": 0.366389911217317, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 42011 + }, + { + "epoch": 0.36639863250248556, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 42012 + }, + { + "epoch": 0.36640735378765416, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 42013 + }, + { + "epoch": 0.36641607507282276, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 42014 + }, + { + "epoch": 0.3664247963579913, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 42015 + }, + { + "epoch": 0.3664335176431599, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 42016 + }, + { + "epoch": 0.3664422389283285, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 42017 + }, + { + "epoch": 0.36645096021349705, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 42018 + }, + { + "epoch": 0.36645968149866565, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 42019 + }, + { + "epoch": 0.36646840278383425, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 42020 + }, + { + "epoch": 0.3664771240690028, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 42021 + }, + { + "epoch": 0.3664858453541714, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 42022 + }, + { + "epoch": 0.36649456663934, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 42023 + }, + { + "epoch": 0.36650328792450854, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 42024 + }, + { + "epoch": 0.36651200920967714, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 42025 + }, + { + "epoch": 0.36652073049484574, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 42026 + }, + { + "epoch": 0.3665294517800143, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 42027 + }, + { + "epoch": 0.3665381730651829, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 42028 + }, + { + "epoch": 0.3665468943503515, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 42029 + }, + { + "epoch": 0.36655561563552, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 42030 + }, + { + "epoch": 0.3665643369206886, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 42031 + }, + { + "epoch": 0.3665730582058572, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 42032 + }, + { + "epoch": 0.36658177949102577, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 42033 + }, + { + "epoch": 0.3665905007761944, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 42034 + }, + { + "epoch": 0.366599222061363, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 42035 + }, + { + "epoch": 0.3666079433465315, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 42036 + }, + { + "epoch": 0.3666166646317001, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 42037 + }, + { + "epoch": 0.3666253859168687, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 42038 + }, + { + "epoch": 0.3666341072020373, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 42039 + }, + { + "epoch": 0.36664282848720586, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 42040 + }, + { + "epoch": 0.36665154977237446, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 42041 + }, + { + "epoch": 0.36666027105754306, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 42042 + }, + { + "epoch": 0.3666689923427116, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 42043 + }, + { + "epoch": 0.3666777136278802, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 42044 + }, + { + "epoch": 0.3666864349130488, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 42045 + }, + { + "epoch": 0.36669515619821735, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 42046 + }, + { + "epoch": 0.36670387748338595, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 42047 + }, + { + "epoch": 0.36671259876855455, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 42048 + }, + { + "epoch": 0.3667213200537231, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 42049 + }, + { + "epoch": 0.3667300413388917, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 42050 + }, + { + "epoch": 0.3667387626240603, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 42051 + }, + { + "epoch": 0.36674748390922884, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 42052 + }, + { + "epoch": 0.36675620519439744, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 42053 + }, + { + "epoch": 0.36676492647956604, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 42054 + }, + { + "epoch": 0.3667736477647346, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 42055 + }, + { + "epoch": 0.3667823690499032, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 42056 + }, + { + "epoch": 0.3667910903350718, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 42057 + }, + { + "epoch": 0.36679981162024033, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 42058 + }, + { + "epoch": 0.36680853290540893, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 42059 + }, + { + "epoch": 0.36681725419057754, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 42060 + }, + { + "epoch": 0.3668259754757461, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 42061 + }, + { + "epoch": 0.3668346967609147, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 42062 + }, + { + "epoch": 0.3668434180460833, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 42063 + }, + { + "epoch": 0.3668521393312519, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 42064 + }, + { + "epoch": 0.3668608606164204, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 42065 + }, + { + "epoch": 0.366869581901589, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 42066 + }, + { + "epoch": 0.3668783031867576, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 42067 + }, + { + "epoch": 0.36688702447192617, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 42068 + }, + { + "epoch": 0.36689574575709477, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 42069 + }, + { + "epoch": 0.36690446704226337, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 42070 + }, + { + "epoch": 0.3669131883274319, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 42071 + }, + { + "epoch": 0.3669219096126005, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 42072 + }, + { + "epoch": 0.3669306308977691, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 42073 + }, + { + "epoch": 0.36693935218293766, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 42074 + }, + { + "epoch": 0.36694807346810626, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 42075 + }, + { + "epoch": 0.36695679475327486, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 42076 + }, + { + "epoch": 0.3669655160384434, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 42077 + }, + { + "epoch": 0.366974237323612, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 42078 + }, + { + "epoch": 0.3669829586087806, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 42079 + }, + { + "epoch": 0.36699167989394915, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 42080 + }, + { + "epoch": 0.36700040117911775, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 42081 + }, + { + "epoch": 0.36700912246428635, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 42082 + }, + { + "epoch": 0.3670178437494549, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 42083 + }, + { + "epoch": 0.3670265650346235, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 42084 + }, + { + "epoch": 0.3670352863197921, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 42085 + }, + { + "epoch": 0.36704400760496064, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 42086 + }, + { + "epoch": 0.36705272889012924, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 42087 + }, + { + "epoch": 0.36706145017529784, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 42088 + }, + { + "epoch": 0.3670701714604664, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 42089 + }, + { + "epoch": 0.367078892745635, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 42090 + }, + { + "epoch": 0.3670876140308036, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 42091 + }, + { + "epoch": 0.3670963353159722, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 42092 + }, + { + "epoch": 0.36710505660114073, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 42093 + }, + { + "epoch": 0.36711377788630933, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9836, + "step": 42094 + }, + { + "epoch": 0.36712249917147793, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 42095 + }, + { + "epoch": 0.3671312204566465, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 42096 + }, + { + "epoch": 0.3671399417418151, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 42097 + }, + { + "epoch": 0.3671486630269837, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 42098 + }, + { + "epoch": 0.3671573843121522, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 42099 + }, + { + "epoch": 0.3671661055973208, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 42100 + }, + { + "epoch": 0.3671748268824894, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 42101 + }, + { + "epoch": 0.36718354816765797, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 42102 + }, + { + "epoch": 0.36719226945282657, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 42103 + }, + { + "epoch": 0.36720099073799517, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 42104 + }, + { + "epoch": 0.3672097120231637, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 42105 + }, + { + "epoch": 0.3672184333083323, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 42106 + }, + { + "epoch": 0.3672271545935009, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 42107 + }, + { + "epoch": 0.36723587587866946, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 42108 + }, + { + "epoch": 0.36724459716383806, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 42109 + }, + { + "epoch": 0.36725331844900666, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 42110 + }, + { + "epoch": 0.3672620397341752, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 42111 + }, + { + "epoch": 0.3672707610193438, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 42112 + }, + { + "epoch": 0.3672794823045124, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 42113 + }, + { + "epoch": 0.36728820358968095, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 42114 + }, + { + "epoch": 0.36729692487484955, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 42115 + }, + { + "epoch": 0.36730564616001815, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 42116 + }, + { + "epoch": 0.3673143674451867, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 42117 + }, + { + "epoch": 0.3673230887303553, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 42118 + }, + { + "epoch": 0.3673318100155239, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 42119 + }, + { + "epoch": 0.3673405313006925, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 42120 + }, + { + "epoch": 0.36734925258586104, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 42121 + }, + { + "epoch": 0.36735797387102964, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 42122 + }, + { + "epoch": 0.36736669515619824, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 42123 + }, + { + "epoch": 0.3673754164413668, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 42124 + }, + { + "epoch": 0.3673841377265354, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 42125 + }, + { + "epoch": 0.367392859011704, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 42126 + }, + { + "epoch": 0.36740158029687253, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 42127 + }, + { + "epoch": 0.36741030158204113, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 42128 + }, + { + "epoch": 0.36741902286720973, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 42129 + }, + { + "epoch": 0.3674277441523783, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 42130 + }, + { + "epoch": 0.3674364654375469, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 0.9849, + "step": 42131 + }, + { + "epoch": 0.3674451867227155, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 42132 + }, + { + "epoch": 0.367453908007884, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 42133 + }, + { + "epoch": 0.3674626292930526, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 42134 + }, + { + "epoch": 0.3674713505782212, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 42135 + }, + { + "epoch": 0.36748007186338977, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 42136 + }, + { + "epoch": 0.36748879314855837, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 42137 + }, + { + "epoch": 0.36749751443372697, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 42138 + }, + { + "epoch": 0.3675062357188955, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 42139 + }, + { + "epoch": 0.3675149570040641, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 42140 + }, + { + "epoch": 0.3675236782892327, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 42141 + }, + { + "epoch": 0.36753239957440126, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 42142 + }, + { + "epoch": 0.36754112085956986, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 42143 + }, + { + "epoch": 0.36754984214473846, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 42144 + }, + { + "epoch": 0.367558563429907, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 42145 + }, + { + "epoch": 0.3675672847150756, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 42146 + }, + { + "epoch": 0.3675760060002442, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 42147 + }, + { + "epoch": 0.3675847272854128, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 42148 + }, + { + "epoch": 0.36759344857058135, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9851, + "step": 42149 + }, + { + "epoch": 0.36760216985574995, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 42150 + }, + { + "epoch": 0.36761089114091855, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 42151 + }, + { + "epoch": 0.3676196124260871, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 42152 + }, + { + "epoch": 0.3676283337112557, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 42153 + }, + { + "epoch": 0.3676370549964243, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 42154 + }, + { + "epoch": 0.36764577628159284, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 42155 + }, + { + "epoch": 0.36765449756676144, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 42156 + }, + { + "epoch": 0.36766321885193004, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 42157 + }, + { + "epoch": 0.3676719401370986, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 42158 + }, + { + "epoch": 0.3676806614222672, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 42159 + }, + { + "epoch": 0.3676893827074358, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 42160 + }, + { + "epoch": 0.36769810399260433, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 42161 + }, + { + "epoch": 0.36770682527777293, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 42162 + }, + { + "epoch": 0.36771554656294153, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 42163 + }, + { + "epoch": 0.3677242678481101, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 42164 + }, + { + "epoch": 0.3677329891332787, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 42165 + }, + { + "epoch": 0.3677417104184473, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 42166 + }, + { + "epoch": 0.3677504317036158, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 42167 + }, + { + "epoch": 0.3677591529887844, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 42168 + }, + { + "epoch": 0.367767874273953, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 42169 + }, + { + "epoch": 0.36777659555912157, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 42170 + }, + { + "epoch": 0.36778531684429017, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 42171 + }, + { + "epoch": 0.36779403812945877, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 42172 + }, + { + "epoch": 0.36780275941462737, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 42173 + }, + { + "epoch": 0.3678114806997959, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 42174 + }, + { + "epoch": 0.3678202019849645, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 42175 + }, + { + "epoch": 0.3678289232701331, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 42176 + }, + { + "epoch": 0.36783764455530166, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 42177 + }, + { + "epoch": 0.36784636584047026, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 42178 + }, + { + "epoch": 0.36785508712563886, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 42179 + }, + { + "epoch": 0.3678638084108074, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 42180 + }, + { + "epoch": 0.367872529695976, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 42181 + }, + { + "epoch": 0.3678812509811446, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 42182 + }, + { + "epoch": 0.36788997226631315, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 42183 + }, + { + "epoch": 0.36789869355148175, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 42184 + }, + { + "epoch": 0.36790741483665035, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 42185 + }, + { + "epoch": 0.3679161361218189, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 42186 + }, + { + "epoch": 0.3679248574069875, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 42187 + }, + { + "epoch": 0.3679335786921561, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 42188 + }, + { + "epoch": 0.36794229997732464, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 42189 + }, + { + "epoch": 0.36795102126249324, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 0.9836, + "step": 42190 + }, + { + "epoch": 0.36795974254766184, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 42191 + }, + { + "epoch": 0.3679684638328304, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 42192 + }, + { + "epoch": 0.367977185117999, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 42193 + }, + { + "epoch": 0.3679859064031676, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9835, + "step": 42194 + }, + { + "epoch": 0.36799462768833613, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 42195 + }, + { + "epoch": 0.36800334897350473, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 42196 + }, + { + "epoch": 0.36801207025867333, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 42197 + }, + { + "epoch": 0.3680207915438419, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 42198 + }, + { + "epoch": 0.3680295128290105, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 42199 + }, + { + "epoch": 0.3680382341141791, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 42200 + }, + { + "epoch": 0.3680469553993477, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 42201 + }, + { + "epoch": 0.3680556766845162, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 42202 + }, + { + "epoch": 0.3680643979696848, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 42203 + }, + { + "epoch": 0.3680731192548534, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 42204 + }, + { + "epoch": 0.36808184054002197, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 42205 + }, + { + "epoch": 0.36809056182519057, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 42206 + }, + { + "epoch": 0.36809928311035917, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 42207 + }, + { + "epoch": 0.3681080043955277, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 42208 + }, + { + "epoch": 0.3681167256806963, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 42209 + }, + { + "epoch": 0.3681254469658649, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 42210 + }, + { + "epoch": 0.36813416825103346, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 42211 + }, + { + "epoch": 0.36814288953620206, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 42212 + }, + { + "epoch": 0.36815161082137066, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 42213 + }, + { + "epoch": 0.3681603321065392, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 42214 + }, + { + "epoch": 0.3681690533917078, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 42215 + }, + { + "epoch": 0.3681777746768764, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 42216 + }, + { + "epoch": 0.36818649596204495, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9802, + "step": 42217 + }, + { + "epoch": 0.36819521724721355, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 42218 + }, + { + "epoch": 0.36820393853238215, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 42219 + }, + { + "epoch": 0.3682126598175507, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9833, + "step": 42220 + }, + { + "epoch": 0.3682213811027193, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 42221 + }, + { + "epoch": 0.3682301023878879, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9821, + "step": 42222 + }, + { + "epoch": 0.36823882367305644, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 42223 + }, + { + "epoch": 0.36824754495822504, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9806, + "step": 42224 + }, + { + "epoch": 0.36825626624339364, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 42225 + }, + { + "epoch": 0.3682649875285622, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 42226 + }, + { + "epoch": 0.3682737088137308, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 42227 + }, + { + "epoch": 0.3682824300988994, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 42228 + }, + { + "epoch": 0.368291151384068, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9808, + "step": 42229 + }, + { + "epoch": 0.3682998726692365, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 42230 + }, + { + "epoch": 0.36830859395440513, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 42231 + }, + { + "epoch": 0.36831731523957373, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 42232 + }, + { + "epoch": 0.3683260365247423, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 42233 + }, + { + "epoch": 0.3683347578099109, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 42234 + }, + { + "epoch": 0.3683434790950795, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 42235 + }, + { + "epoch": 0.368352200380248, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 42236 + }, + { + "epoch": 0.3683609216654166, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 42237 + }, + { + "epoch": 0.3683696429505852, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 42238 + }, + { + "epoch": 0.36837836423575376, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 42239 + }, + { + "epoch": 0.36838708552092236, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 42240 + }, + { + "epoch": 0.36839580680609096, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 42241 + }, + { + "epoch": 0.3684045280912595, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 42242 + }, + { + "epoch": 0.3684132493764281, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 42243 + }, + { + "epoch": 0.3684219706615967, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 42244 + }, + { + "epoch": 0.36843069194676525, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 42245 + }, + { + "epoch": 0.36843941323193385, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 42246 + }, + { + "epoch": 0.36844813451710245, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 42247 + }, + { + "epoch": 0.368456855802271, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 42248 + }, + { + "epoch": 0.3684655770874396, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 42249 + }, + { + "epoch": 0.3684742983726082, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 42250 + }, + { + "epoch": 0.36848301965777674, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 42251 + }, + { + "epoch": 0.36849174094294534, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 42252 + }, + { + "epoch": 0.36850046222811395, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 42253 + }, + { + "epoch": 0.3685091835132825, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 42254 + }, + { + "epoch": 0.3685179047984511, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 42255 + }, + { + "epoch": 0.3685266260836197, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 42256 + }, + { + "epoch": 0.3685353473687883, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 42257 + }, + { + "epoch": 0.36854406865395684, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 42258 + }, + { + "epoch": 0.36855278993912544, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 42259 + }, + { + "epoch": 0.36856151122429404, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9731, + "step": 42260 + }, + { + "epoch": 0.3685702325094626, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 42261 + }, + { + "epoch": 0.3685789537946312, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 42262 + }, + { + "epoch": 0.3685876750797998, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 42263 + }, + { + "epoch": 0.3685963963649683, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 42264 + }, + { + "epoch": 0.3686051176501369, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 42265 + }, + { + "epoch": 0.3686138389353055, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 42266 + }, + { + "epoch": 0.36862256022047407, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 42267 + }, + { + "epoch": 0.36863128150564267, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 42268 + }, + { + "epoch": 0.36864000279081127, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9844, + "step": 42269 + }, + { + "epoch": 0.3686487240759798, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 42270 + }, + { + "epoch": 0.3686574453611484, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 42271 + }, + { + "epoch": 0.368666166646317, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 42272 + }, + { + "epoch": 0.36867488793148556, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 42273 + }, + { + "epoch": 0.36868360921665416, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 42274 + }, + { + "epoch": 0.36869233050182276, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 42275 + }, + { + "epoch": 0.3687010517869913, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 42276 + }, + { + "epoch": 0.3687097730721599, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 42277 + }, + { + "epoch": 0.3687184943573285, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 42278 + }, + { + "epoch": 0.36872721564249705, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 42279 + }, + { + "epoch": 0.36873593692766565, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 42280 + }, + { + "epoch": 0.36874465821283425, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 42281 + }, + { + "epoch": 0.36875337949800285, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 42282 + }, + { + "epoch": 0.3687621007831714, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 42283 + }, + { + "epoch": 0.36877082206834, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 42284 + }, + { + "epoch": 0.3687795433535086, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 42285 + }, + { + "epoch": 0.36878826463867714, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 42286 + }, + { + "epoch": 0.36879698592384574, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 42287 + }, + { + "epoch": 0.36880570720901434, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 42288 + }, + { + "epoch": 0.3688144284941829, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 42289 + }, + { + "epoch": 0.3688231497793515, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 42290 + }, + { + "epoch": 0.3688318710645201, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 42291 + }, + { + "epoch": 0.36884059234968863, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 42292 + }, + { + "epoch": 0.36884931363485723, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 42293 + }, + { + "epoch": 0.36885803492002583, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 42294 + }, + { + "epoch": 0.3688667562051944, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 42295 + }, + { + "epoch": 0.368875477490363, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 42296 + }, + { + "epoch": 0.3688841987755316, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 42297 + }, + { + "epoch": 0.3688929200607001, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 42298 + }, + { + "epoch": 0.3689016413458687, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 42299 + }, + { + "epoch": 0.3689103626310373, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 42300 + }, + { + "epoch": 0.36891908391620587, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 42301 + }, + { + "epoch": 0.36892780520137447, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 42302 + }, + { + "epoch": 0.36893652648654307, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 42303 + }, + { + "epoch": 0.3689452477717116, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 42304 + }, + { + "epoch": 0.3689539690568802, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 42305 + }, + { + "epoch": 0.3689626903420488, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 42306 + }, + { + "epoch": 0.36897141162721736, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 42307 + }, + { + "epoch": 0.36898013291238596, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 42308 + }, + { + "epoch": 0.36898885419755456, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 42309 + }, + { + "epoch": 0.36899757548272316, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 42310 + }, + { + "epoch": 0.3690062967678917, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 42311 + }, + { + "epoch": 0.3690150180530603, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 42312 + }, + { + "epoch": 0.3690237393382289, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 42313 + }, + { + "epoch": 0.36903246062339745, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 42314 + }, + { + "epoch": 0.36904118190856605, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 42315 + }, + { + "epoch": 0.36904990319373465, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 42316 + }, + { + "epoch": 0.3690586244789032, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 42317 + }, + { + "epoch": 0.3690673457640718, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 42318 + }, + { + "epoch": 0.3690760670492404, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 42319 + }, + { + "epoch": 0.36908478833440894, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 42320 + }, + { + "epoch": 0.36909350961957754, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 42321 + }, + { + "epoch": 0.36910223090474614, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 42322 + }, + { + "epoch": 0.3691109521899147, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 42323 + }, + { + "epoch": 0.3691196734750833, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 42324 + }, + { + "epoch": 0.3691283947602519, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 42325 + }, + { + "epoch": 0.36913711604542043, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 42326 + }, + { + "epoch": 0.36914583733058903, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9836, + "step": 42327 + }, + { + "epoch": 0.36915455861575763, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 42328 + }, + { + "epoch": 0.3691632799009262, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 42329 + }, + { + "epoch": 0.3691720011860948, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 42330 + }, + { + "epoch": 0.3691807224712634, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 42331 + }, + { + "epoch": 0.3691894437564319, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 42332 + }, + { + "epoch": 0.3691981650416005, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 42333 + }, + { + "epoch": 0.3692068863267691, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 42334 + }, + { + "epoch": 0.36921560761193767, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 42335 + }, + { + "epoch": 0.36922432889710627, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 42336 + }, + { + "epoch": 0.36923305018227487, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 42337 + }, + { + "epoch": 0.36924177146744347, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 42338 + }, + { + "epoch": 0.369250492752612, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 42339 + }, + { + "epoch": 0.3692592140377806, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 42340 + }, + { + "epoch": 0.3692679353229492, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 42341 + }, + { + "epoch": 0.36927665660811776, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 42342 + }, + { + "epoch": 0.36928537789328636, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 42343 + }, + { + "epoch": 0.36929409917845496, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 42344 + }, + { + "epoch": 0.3693028204636235, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 42345 + }, + { + "epoch": 0.3693115417487921, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 42346 + }, + { + "epoch": 0.3693202630339607, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 42347 + }, + { + "epoch": 0.36932898431912925, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 42348 + }, + { + "epoch": 0.36933770560429785, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 42349 + }, + { + "epoch": 0.36934642688946645, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 42350 + }, + { + "epoch": 0.369355148174635, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 42351 + }, + { + "epoch": 0.3693638694598036, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 42352 + }, + { + "epoch": 0.3693725907449722, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 42353 + }, + { + "epoch": 0.36938131203014074, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 42354 + }, + { + "epoch": 0.36939003331530934, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 42355 + }, + { + "epoch": 0.36939875460047794, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 42356 + }, + { + "epoch": 0.3694074758856465, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 42357 + }, + { + "epoch": 0.3694161971708151, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 42358 + }, + { + "epoch": 0.3694249184559837, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 42359 + }, + { + "epoch": 0.36943363974115223, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 42360 + }, + { + "epoch": 0.36944236102632083, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 42361 + }, + { + "epoch": 0.36945108231148943, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 42362 + }, + { + "epoch": 0.369459803596658, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 42363 + }, + { + "epoch": 0.3694685248818266, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 42364 + }, + { + "epoch": 0.3694772461669952, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 42365 + }, + { + "epoch": 0.3694859674521638, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 42366 + }, + { + "epoch": 0.3694946887373323, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 42367 + }, + { + "epoch": 0.3695034100225009, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 42368 + }, + { + "epoch": 0.3695121313076695, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 42369 + }, + { + "epoch": 0.36952085259283807, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 42370 + }, + { + "epoch": 0.36952957387800667, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 42371 + }, + { + "epoch": 0.36953829516317527, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 42372 + }, + { + "epoch": 0.3695470164483438, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 42373 + }, + { + "epoch": 0.3695557377335124, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 42374 + }, + { + "epoch": 0.369564459018681, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 42375 + }, + { + "epoch": 0.36957318030384956, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 42376 + }, + { + "epoch": 0.36958190158901816, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 42377 + }, + { + "epoch": 0.36959062287418676, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 42378 + }, + { + "epoch": 0.3695993441593553, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 42379 + }, + { + "epoch": 0.3696080654445239, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9795, + "step": 42380 + }, + { + "epoch": 0.3696167867296925, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 42381 + }, + { + "epoch": 0.36962550801486105, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 42382 + }, + { + "epoch": 0.36963422930002965, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 42383 + }, + { + "epoch": 0.36964295058519825, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 42384 + }, + { + "epoch": 0.3696516718703668, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 42385 + }, + { + "epoch": 0.3696603931555354, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 42386 + }, + { + "epoch": 0.369669114440704, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9739, + "step": 42387 + }, + { + "epoch": 0.36967783572587254, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 42388 + }, + { + "epoch": 0.36968655701104114, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 42389 + }, + { + "epoch": 0.36969527829620974, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 42390 + }, + { + "epoch": 0.3697039995813783, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 42391 + }, + { + "epoch": 0.3697127208665469, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 42392 + }, + { + "epoch": 0.3697214421517155, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 42393 + }, + { + "epoch": 0.3697301634368841, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 42394 + }, + { + "epoch": 0.36973888472205263, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 42395 + }, + { + "epoch": 0.36974760600722123, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9823, + "step": 42396 + }, + { + "epoch": 0.36975632729238983, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 42397 + }, + { + "epoch": 0.3697650485775584, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 42398 + }, + { + "epoch": 0.369773769862727, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 42399 + }, + { + "epoch": 0.3697824911478956, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 42400 + }, + { + "epoch": 0.3697912124330641, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 42401 + }, + { + "epoch": 0.3697999337182327, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 42402 + }, + { + "epoch": 0.3698086550034013, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 42403 + }, + { + "epoch": 0.36981737628856987, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 42404 + }, + { + "epoch": 0.36982609757373847, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 42405 + }, + { + "epoch": 0.36983481885890707, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 42406 + }, + { + "epoch": 0.3698435401440756, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 42407 + }, + { + "epoch": 0.3698522614292442, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9793, + "step": 42408 + }, + { + "epoch": 0.3698609827144128, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 42409 + }, + { + "epoch": 0.36986970399958136, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 42410 + }, + { + "epoch": 0.36987842528474996, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 42411 + }, + { + "epoch": 0.36988714656991856, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 42412 + }, + { + "epoch": 0.3698958678550871, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 42413 + }, + { + "epoch": 0.3699045891402557, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 42414 + }, + { + "epoch": 0.3699133104254243, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 42415 + }, + { + "epoch": 0.36992203171059285, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 42416 + }, + { + "epoch": 0.36993075299576145, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 42417 + }, + { + "epoch": 0.36993947428093005, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 42418 + }, + { + "epoch": 0.36994819556609865, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 42419 + }, + { + "epoch": 0.3699569168512672, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 42420 + }, + { + "epoch": 0.3699656381364358, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 42421 + }, + { + "epoch": 0.3699743594216044, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 42422 + }, + { + "epoch": 0.36998308070677294, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 42423 + }, + { + "epoch": 0.36999180199194154, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 42424 + }, + { + "epoch": 0.37000052327711014, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 42425 + }, + { + "epoch": 0.3700092445622787, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 42426 + }, + { + "epoch": 0.3700179658474473, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 42427 + }, + { + "epoch": 0.3700266871326159, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 42428 + }, + { + "epoch": 0.37003540841778443, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 42429 + }, + { + "epoch": 0.37004412970295303, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 42430 + }, + { + "epoch": 0.37005285098812163, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 42431 + }, + { + "epoch": 0.3700615722732902, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 42432 + }, + { + "epoch": 0.3700702935584588, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 42433 + }, + { + "epoch": 0.3700790148436274, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 42434 + }, + { + "epoch": 0.3700877361287959, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 42435 + }, + { + "epoch": 0.3700964574139645, + "grad_norm": 0.345703125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 42436 + }, + { + "epoch": 0.3701051786991331, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 42437 + }, + { + "epoch": 0.37011389998430166, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 42438 + }, + { + "epoch": 0.37012262126947026, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 42439 + }, + { + "epoch": 0.37013134255463886, + "grad_norm": 0.28125, + "learning_rate": 0.0005, + "loss": 0.9798, + "step": 42440 + }, + { + "epoch": 0.3701400638398074, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 42441 + }, + { + "epoch": 0.370148785124976, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 42442 + }, + { + "epoch": 0.3701575064101446, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 42443 + }, + { + "epoch": 0.37016622769531315, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 42444 + }, + { + "epoch": 0.37017494898048176, + "grad_norm": 0.333984375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 42445 + }, + { + "epoch": 0.37018367026565036, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 42446 + }, + { + "epoch": 0.37019239155081896, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 42447 + }, + { + "epoch": 0.3702011128359875, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 42448 + }, + { + "epoch": 0.3702098341211561, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 42449 + }, + { + "epoch": 0.3702185554063247, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 42450 + }, + { + "epoch": 0.37022727669149325, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 42451 + }, + { + "epoch": 0.37023599797666185, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 42452 + }, + { + "epoch": 0.37024471926183045, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 42453 + }, + { + "epoch": 0.370253440546999, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 42454 + }, + { + "epoch": 0.3702621618321676, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 42455 + }, + { + "epoch": 0.3702708831173362, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 42456 + }, + { + "epoch": 0.37027960440250474, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 42457 + }, + { + "epoch": 0.37028832568767334, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 42458 + }, + { + "epoch": 0.37029704697284194, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 42459 + }, + { + "epoch": 0.3703057682580105, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 42460 + }, + { + "epoch": 0.3703144895431791, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 42461 + }, + { + "epoch": 0.3703232108283477, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 42462 + }, + { + "epoch": 0.3703319321135162, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 42463 + }, + { + "epoch": 0.3703406533986848, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 42464 + }, + { + "epoch": 0.3703493746838534, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 42465 + }, + { + "epoch": 0.37035809596902197, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 42466 + }, + { + "epoch": 0.3703668172541906, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 42467 + }, + { + "epoch": 0.3703755385393592, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 42468 + }, + { + "epoch": 0.3703842598245277, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 42469 + }, + { + "epoch": 0.3703929811096963, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 42470 + }, + { + "epoch": 0.3704017023948649, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 42471 + }, + { + "epoch": 0.37041042368003346, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 42472 + }, + { + "epoch": 0.37041914496520206, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 42473 + }, + { + "epoch": 0.37042786625037066, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 42474 + }, + { + "epoch": 0.37043658753553926, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 42475 + }, + { + "epoch": 0.3704453088207078, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 42476 + }, + { + "epoch": 0.3704540301058764, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 42477 + }, + { + "epoch": 0.370462751391045, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 42478 + }, + { + "epoch": 0.37047147267621355, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 42479 + }, + { + "epoch": 0.37048019396138215, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 42480 + }, + { + "epoch": 0.37048891524655075, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 42481 + }, + { + "epoch": 0.3704976365317193, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 42482 + }, + { + "epoch": 0.3705063578168879, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9824, + "step": 42483 + }, + { + "epoch": 0.3705150791020565, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 42484 + }, + { + "epoch": 0.37052380038722504, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 42485 + }, + { + "epoch": 0.37053252167239364, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 42486 + }, + { + "epoch": 0.37054124295756224, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 42487 + }, + { + "epoch": 0.3705499642427308, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 42488 + }, + { + "epoch": 0.3705586855278994, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 42489 + }, + { + "epoch": 0.370567406813068, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 42490 + }, + { + "epoch": 0.37057612809823653, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 42491 + }, + { + "epoch": 0.37058484938340513, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 42492 + }, + { + "epoch": 0.37059357066857374, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 42493 + }, + { + "epoch": 0.3706022919537423, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 42494 + }, + { + "epoch": 0.3706110132389109, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 42495 + }, + { + "epoch": 0.3706197345240795, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 42496 + }, + { + "epoch": 0.370628455809248, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 42497 + }, + { + "epoch": 0.3706371770944166, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 42498 + }, + { + "epoch": 0.3706458983795852, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9754, + "step": 42499 + }, + { + "epoch": 0.37065461966475377, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 42500 + }, + { + "epoch": 0.37066334094992237, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 42501 + }, + { + "epoch": 0.37067206223509097, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 42502 + }, + { + "epoch": 0.37068078352025957, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 42503 + }, + { + "epoch": 0.3706895048054281, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 42504 + }, + { + "epoch": 0.3706982260905967, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 42505 + }, + { + "epoch": 0.3707069473757653, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 42506 + }, + { + "epoch": 0.37071566866093386, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 42507 + }, + { + "epoch": 0.37072438994610246, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 42508 + }, + { + "epoch": 0.37073311123127106, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 42509 + }, + { + "epoch": 0.3707418325164396, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 42510 + }, + { + "epoch": 0.3707505538016082, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 42511 + }, + { + "epoch": 0.3707592750867768, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 42512 + }, + { + "epoch": 0.37076799637194535, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 42513 + }, + { + "epoch": 0.37077671765711395, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 42514 + }, + { + "epoch": 0.37078543894228255, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 42515 + }, + { + "epoch": 0.3707941602274511, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 42516 + }, + { + "epoch": 0.3708028815126197, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 42517 + }, + { + "epoch": 0.3708116027977883, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 42518 + }, + { + "epoch": 0.37082032408295684, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 42519 + }, + { + "epoch": 0.37082904536812544, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 42520 + }, + { + "epoch": 0.37083776665329404, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 42521 + }, + { + "epoch": 0.3708464879384626, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 42522 + }, + { + "epoch": 0.3708552092236312, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 42523 + }, + { + "epoch": 0.3708639305087998, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 42524 + }, + { + "epoch": 0.37087265179396833, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 42525 + }, + { + "epoch": 0.37088137307913693, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 42526 + }, + { + "epoch": 0.37089009436430553, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 42527 + }, + { + "epoch": 0.37089881564947413, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 42528 + }, + { + "epoch": 0.3709075369346427, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 42529 + }, + { + "epoch": 0.3709162582198113, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 42530 + }, + { + "epoch": 0.3709249795049799, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 42531 + }, + { + "epoch": 0.3709337007901484, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 42532 + }, + { + "epoch": 0.370942422075317, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 42533 + }, + { + "epoch": 0.3709511433604856, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 42534 + }, + { + "epoch": 0.37095986464565417, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 42535 + }, + { + "epoch": 0.37096858593082277, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 42536 + }, + { + "epoch": 0.37097730721599137, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 42537 + }, + { + "epoch": 0.3709860285011599, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 42538 + }, + { + "epoch": 0.3709947497863285, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 42539 + }, + { + "epoch": 0.3710034710714971, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 42540 + }, + { + "epoch": 0.37101219235666566, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 42541 + }, + { + "epoch": 0.37102091364183426, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 42542 + }, + { + "epoch": 0.37102963492700286, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 42543 + }, + { + "epoch": 0.3710383562121714, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 42544 + }, + { + "epoch": 0.37104707749734, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 42545 + }, + { + "epoch": 0.3710557987825086, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 42546 + }, + { + "epoch": 0.37106452006767715, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 42547 + }, + { + "epoch": 0.37107324135284575, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 42548 + }, + { + "epoch": 0.37108196263801435, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 42549 + }, + { + "epoch": 0.3710906839231829, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 42550 + }, + { + "epoch": 0.3710994052083515, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 42551 + }, + { + "epoch": 0.3711081264935201, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 42552 + }, + { + "epoch": 0.37111684777868864, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 42553 + }, + { + "epoch": 0.37112556906385724, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9798, + "step": 42554 + }, + { + "epoch": 0.37113429034902584, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 42555 + }, + { + "epoch": 0.37114301163419444, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 42556 + }, + { + "epoch": 0.371151732919363, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 42557 + }, + { + "epoch": 0.3711604542045316, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 42558 + }, + { + "epoch": 0.3711691754897002, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0331, + "step": 42559 + }, + { + "epoch": 0.37117789677486873, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 42560 + }, + { + "epoch": 0.37118661806003733, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 42561 + }, + { + "epoch": 0.37119533934520593, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 42562 + }, + { + "epoch": 0.3712040606303745, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 42563 + }, + { + "epoch": 0.3712127819155431, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 42564 + }, + { + "epoch": 0.3712215032007117, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 42565 + }, + { + "epoch": 0.3712302244858802, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 42566 + }, + { + "epoch": 0.3712389457710488, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 42567 + }, + { + "epoch": 0.3712476670562174, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 42568 + }, + { + "epoch": 0.37125638834138597, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 42569 + }, + { + "epoch": 0.37126510962655457, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 42570 + }, + { + "epoch": 0.37127383091172317, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 42571 + }, + { + "epoch": 0.3712825521968917, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 42572 + }, + { + "epoch": 0.3712912734820603, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 42573 + }, + { + "epoch": 0.3712999947672289, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 42574 + }, + { + "epoch": 0.37130871605239746, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 42575 + }, + { + "epoch": 0.37131743733756606, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 42576 + }, + { + "epoch": 0.37132615862273466, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 42577 + }, + { + "epoch": 0.3713348799079032, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 42578 + }, + { + "epoch": 0.3713436011930718, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 42579 + }, + { + "epoch": 0.3713523224782404, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 42580 + }, + { + "epoch": 0.37136104376340895, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 42581 + }, + { + "epoch": 0.37136976504857755, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 42582 + }, + { + "epoch": 0.37137848633374615, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 42583 + }, + { + "epoch": 0.37138720761891475, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 42584 + }, + { + "epoch": 0.3713959289040833, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 42585 + }, + { + "epoch": 0.3714046501892519, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 42586 + }, + { + "epoch": 0.3714133714744205, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 42587 + }, + { + "epoch": 0.37142209275958904, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 42588 + }, + { + "epoch": 0.37143081404475764, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 42589 + }, + { + "epoch": 0.37143953532992624, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 42590 + }, + { + "epoch": 0.3714482566150948, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.9824, + "step": 42591 + }, + { + "epoch": 0.3714569779002634, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 42592 + }, + { + "epoch": 0.371465699185432, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 42593 + }, + { + "epoch": 0.37147442047060053, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 42594 + }, + { + "epoch": 0.37148314175576913, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 42595 + }, + { + "epoch": 0.37149186304093773, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 42596 + }, + { + "epoch": 0.3715005843261063, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 42597 + }, + { + "epoch": 0.3715093056112749, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 42598 + }, + { + "epoch": 0.3715180268964435, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 42599 + }, + { + "epoch": 0.371526748181612, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 42600 + }, + { + "epoch": 0.3715354694667806, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 42601 + }, + { + "epoch": 0.3715441907519492, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 42602 + }, + { + "epoch": 0.37155291203711777, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 42603 + }, + { + "epoch": 0.37156163332228637, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 42604 + }, + { + "epoch": 0.37157035460745497, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 42605 + }, + { + "epoch": 0.3715790758926235, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 42606 + }, + { + "epoch": 0.3715877971777921, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 42607 + }, + { + "epoch": 0.3715965184629607, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 42608 + }, + { + "epoch": 0.37160523974812926, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 42609 + }, + { + "epoch": 0.37161396103329786, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 42610 + }, + { + "epoch": 0.37162268231846646, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 42611 + }, + { + "epoch": 0.37163140360363506, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 42612 + }, + { + "epoch": 0.3716401248888036, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 42613 + }, + { + "epoch": 0.3716488461739722, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 42614 + }, + { + "epoch": 0.3716575674591408, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 42615 + }, + { + "epoch": 0.37166628874430935, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 42616 + }, + { + "epoch": 0.37167501002947795, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 42617 + }, + { + "epoch": 0.37168373131464655, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 42618 + }, + { + "epoch": 0.3716924525998151, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9804, + "step": 42619 + }, + { + "epoch": 0.3717011738849837, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 42620 + }, + { + "epoch": 0.3717098951701523, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 42621 + }, + { + "epoch": 0.37171861645532084, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 42622 + }, + { + "epoch": 0.37172733774048944, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 42623 + }, + { + "epoch": 0.37173605902565804, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 42624 + }, + { + "epoch": 0.3717447803108266, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 42625 + }, + { + "epoch": 0.3717535015959952, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 42626 + }, + { + "epoch": 0.3717622228811638, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 42627 + }, + { + "epoch": 0.37177094416633233, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 42628 + }, + { + "epoch": 0.37177966545150093, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 42629 + }, + { + "epoch": 0.37178838673666953, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 42630 + }, + { + "epoch": 0.3717971080218381, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 42631 + }, + { + "epoch": 0.3718058293070067, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 42632 + }, + { + "epoch": 0.3718145505921753, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 42633 + }, + { + "epoch": 0.3718232718773438, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9816, + "step": 42634 + }, + { + "epoch": 0.3718319931625124, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 42635 + }, + { + "epoch": 0.371840714447681, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 42636 + }, + { + "epoch": 0.3718494357328496, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 42637 + }, + { + "epoch": 0.37185815701801817, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 42638 + }, + { + "epoch": 0.37186687830318677, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 42639 + }, + { + "epoch": 0.37187559958835537, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 42640 + }, + { + "epoch": 0.3718843208735239, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 42641 + }, + { + "epoch": 0.3718930421586925, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 42642 + }, + { + "epoch": 0.3719017634438611, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 42643 + }, + { + "epoch": 0.37191048472902966, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 42644 + }, + { + "epoch": 0.37191920601419826, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 42645 + }, + { + "epoch": 0.37192792729936686, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 42646 + }, + { + "epoch": 0.3719366485845354, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9806, + "step": 42647 + }, + { + "epoch": 0.371945369869704, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 42648 + }, + { + "epoch": 0.3719540911548726, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 42649 + }, + { + "epoch": 0.37196281244004115, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 42650 + }, + { + "epoch": 0.37197153372520975, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9767, + "step": 42651 + }, + { + "epoch": 0.37198025501037835, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 42652 + }, + { + "epoch": 0.3719889762955469, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 42653 + }, + { + "epoch": 0.3719976975807155, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 42654 + }, + { + "epoch": 0.3720064188658841, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 42655 + }, + { + "epoch": 0.37201514015105264, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 42656 + }, + { + "epoch": 0.37202386143622124, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 42657 + }, + { + "epoch": 0.37203258272138984, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 42658 + }, + { + "epoch": 0.3720413040065584, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 42659 + }, + { + "epoch": 0.372050025291727, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 42660 + }, + { + "epoch": 0.3720587465768956, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 42661 + }, + { + "epoch": 0.3720674678620641, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 42662 + }, + { + "epoch": 0.3720761891472327, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 42663 + }, + { + "epoch": 0.37208491043240133, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 42664 + }, + { + "epoch": 0.37209363171756993, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 42665 + }, + { + "epoch": 0.3721023530027385, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 42666 + }, + { + "epoch": 0.3721110742879071, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 42667 + }, + { + "epoch": 0.3721197955730757, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 42668 + }, + { + "epoch": 0.3721285168582442, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 42669 + }, + { + "epoch": 0.3721372381434128, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 42670 + }, + { + "epoch": 0.3721459594285814, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 42671 + }, + { + "epoch": 0.37215468071374996, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 42672 + }, + { + "epoch": 0.37216340199891856, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 42673 + }, + { + "epoch": 0.37217212328408716, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 42674 + }, + { + "epoch": 0.3721808445692557, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 42675 + }, + { + "epoch": 0.3721895658544243, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 42676 + }, + { + "epoch": 0.3721982871395929, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 42677 + }, + { + "epoch": 0.37220700842476145, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 42678 + }, + { + "epoch": 0.37221572970993005, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 42679 + }, + { + "epoch": 0.37222445099509865, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 42680 + }, + { + "epoch": 0.3722331722802672, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 42681 + }, + { + "epoch": 0.3722418935654358, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 42682 + }, + { + "epoch": 0.3722506148506044, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 42683 + }, + { + "epoch": 0.37225933613577294, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 42684 + }, + { + "epoch": 0.37226805742094154, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 42685 + }, + { + "epoch": 0.37227677870611015, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 42686 + }, + { + "epoch": 0.3722854999912787, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 42687 + }, + { + "epoch": 0.3722942212764473, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 42688 + }, + { + "epoch": 0.3723029425616159, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 42689 + }, + { + "epoch": 0.37231166384678444, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 42690 + }, + { + "epoch": 0.37232038513195304, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 42691 + }, + { + "epoch": 0.37232910641712164, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 42692 + }, + { + "epoch": 0.37233782770229024, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 42693 + }, + { + "epoch": 0.3723465489874588, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 42694 + }, + { + "epoch": 0.3723552702726274, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 42695 + }, + { + "epoch": 0.372363991557796, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 42696 + }, + { + "epoch": 0.3723727128429645, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 42697 + }, + { + "epoch": 0.3723814341281331, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 42698 + }, + { + "epoch": 0.3723901554133017, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 42699 + }, + { + "epoch": 0.37239887669847027, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 42700 + }, + { + "epoch": 0.37240759798363887, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 42701 + }, + { + "epoch": 0.37241631926880747, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 42702 + }, + { + "epoch": 0.372425040553976, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 42703 + }, + { + "epoch": 0.3724337618391446, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9844, + "step": 42704 + }, + { + "epoch": 0.3724424831243132, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 42705 + }, + { + "epoch": 0.37245120440948176, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 42706 + }, + { + "epoch": 0.37245992569465036, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 42707 + }, + { + "epoch": 0.37246864697981896, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 42708 + }, + { + "epoch": 0.3724773682649875, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 42709 + }, + { + "epoch": 0.3724860895501561, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 42710 + }, + { + "epoch": 0.3724948108353247, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 42711 + }, + { + "epoch": 0.37250353212049325, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 42712 + }, + { + "epoch": 0.37251225340566185, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 42713 + }, + { + "epoch": 0.37252097469083045, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 42714 + }, + { + "epoch": 0.372529695975999, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 42715 + }, + { + "epoch": 0.3725384172611676, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 42716 + }, + { + "epoch": 0.3725471385463362, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 42717 + }, + { + "epoch": 0.37255585983150474, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 42718 + }, + { + "epoch": 0.37256458111667334, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 42719 + }, + { + "epoch": 0.37257330240184194, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 42720 + }, + { + "epoch": 0.37258202368701054, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 42721 + }, + { + "epoch": 0.3725907449721791, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 42722 + }, + { + "epoch": 0.3725994662573477, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 42723 + }, + { + "epoch": 0.3726081875425163, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 42724 + }, + { + "epoch": 0.37261690882768483, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 42725 + }, + { + "epoch": 0.37262563011285343, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 42726 + }, + { + "epoch": 0.37263435139802203, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 42727 + }, + { + "epoch": 0.3726430726831906, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 42728 + }, + { + "epoch": 0.3726517939683592, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 42729 + }, + { + "epoch": 0.3726605152535278, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 42730 + }, + { + "epoch": 0.3726692365386963, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 42731 + }, + { + "epoch": 0.3726779578238649, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 42732 + }, + { + "epoch": 0.3726866791090335, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 42733 + }, + { + "epoch": 0.37269540039420207, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 42734 + }, + { + "epoch": 0.37270412167937067, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 42735 + }, + { + "epoch": 0.37271284296453927, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 42736 + }, + { + "epoch": 0.3727215642497078, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 42737 + }, + { + "epoch": 0.3727302855348764, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 42738 + }, + { + "epoch": 0.372739006820045, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 42739 + }, + { + "epoch": 0.37274772810521356, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 42740 + }, + { + "epoch": 0.37275644939038216, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 42741 + }, + { + "epoch": 0.37276517067555076, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 42742 + }, + { + "epoch": 0.3727738919607193, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 42743 + }, + { + "epoch": 0.3727826132458879, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 42744 + }, + { + "epoch": 0.3727913345310565, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 42745 + }, + { + "epoch": 0.3728000558162251, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 42746 + }, + { + "epoch": 0.37280877710139365, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 42747 + }, + { + "epoch": 0.37281749838656225, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 42748 + }, + { + "epoch": 0.37282621967173085, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 42749 + }, + { + "epoch": 0.3728349409568994, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 42750 + }, + { + "epoch": 0.372843662242068, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 42751 + }, + { + "epoch": 0.3728523835272366, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 42752 + }, + { + "epoch": 0.37286110481240514, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 42753 + }, + { + "epoch": 0.37286982609757374, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 42754 + }, + { + "epoch": 0.37287854738274234, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 42755 + }, + { + "epoch": 0.3728872686679109, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 42756 + }, + { + "epoch": 0.3728959899530795, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 42757 + }, + { + "epoch": 0.3729047112382481, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 42758 + }, + { + "epoch": 0.37291343252341663, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 42759 + }, + { + "epoch": 0.37292215380858523, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 42760 + }, + { + "epoch": 0.37293087509375383, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 42761 + }, + { + "epoch": 0.3729395963789224, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 42762 + }, + { + "epoch": 0.372948317664091, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 42763 + }, + { + "epoch": 0.3729570389492596, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 42764 + }, + { + "epoch": 0.3729657602344281, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 42765 + }, + { + "epoch": 0.3729744815195967, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 42766 + }, + { + "epoch": 0.3729832028047653, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 42767 + }, + { + "epoch": 0.37299192408993387, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 42768 + }, + { + "epoch": 0.37300064537510247, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 42769 + }, + { + "epoch": 0.37300936666027107, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 42770 + }, + { + "epoch": 0.3730180879454396, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 42771 + }, + { + "epoch": 0.3730268092306082, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9863, + "step": 42772 + }, + { + "epoch": 0.3730355305157768, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 42773 + }, + { + "epoch": 0.3730442518009454, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9781, + "step": 42774 + }, + { + "epoch": 0.37305297308611396, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 42775 + }, + { + "epoch": 0.37306169437128256, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 42776 + }, + { + "epoch": 0.37307041565645116, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 42777 + }, + { + "epoch": 0.3730791369416197, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 42778 + }, + { + "epoch": 0.3730878582267883, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9831, + "step": 42779 + }, + { + "epoch": 0.3730965795119569, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 42780 + }, + { + "epoch": 0.37310530079712545, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 42781 + }, + { + "epoch": 0.37311402208229405, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 42782 + }, + { + "epoch": 0.37312274336746265, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 42783 + }, + { + "epoch": 0.3731314646526312, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 42784 + }, + { + "epoch": 0.3731401859377998, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 42785 + }, + { + "epoch": 0.3731489072229684, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 42786 + }, + { + "epoch": 0.37315762850813694, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 42787 + }, + { + "epoch": 0.37316634979330554, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 42788 + }, + { + "epoch": 0.37317507107847414, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 42789 + }, + { + "epoch": 0.3731837923636427, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 42790 + }, + { + "epoch": 0.3731925136488113, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 42791 + }, + { + "epoch": 0.3732012349339799, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 42792 + }, + { + "epoch": 0.37320995621914843, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 42793 + }, + { + "epoch": 0.37321867750431703, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 42794 + }, + { + "epoch": 0.37322739878948563, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 42795 + }, + { + "epoch": 0.3732361200746542, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 42796 + }, + { + "epoch": 0.3732448413598228, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 42797 + }, + { + "epoch": 0.3732535626449914, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 42798 + }, + { + "epoch": 0.3732622839301599, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 42799 + }, + { + "epoch": 0.3732710052153285, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 42800 + }, + { + "epoch": 0.3732797265004971, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 42801 + }, + { + "epoch": 0.3732884477856657, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 0.9828, + "step": 42802 + }, + { + "epoch": 0.37329716907083427, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 42803 + }, + { + "epoch": 0.37330589035600287, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 42804 + }, + { + "epoch": 0.37331461164117147, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 42805 + }, + { + "epoch": 0.37332333292634, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 42806 + }, + { + "epoch": 0.3733320542115086, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 42807 + }, + { + "epoch": 0.3733407754966772, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 42808 + }, + { + "epoch": 0.37334949678184576, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 42809 + }, + { + "epoch": 0.37335821806701436, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 42810 + }, + { + "epoch": 0.37336693935218296, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 42811 + }, + { + "epoch": 0.3733756606373515, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 42812 + }, + { + "epoch": 0.3733843819225201, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 42813 + }, + { + "epoch": 0.3733931032076887, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 42814 + }, + { + "epoch": 0.37340182449285725, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 42815 + }, + { + "epoch": 0.37341054577802585, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 42816 + }, + { + "epoch": 0.37341926706319445, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 42817 + }, + { + "epoch": 0.373427988348363, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 42818 + }, + { + "epoch": 0.3734367096335316, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 42819 + }, + { + "epoch": 0.3734454309187002, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 42820 + }, + { + "epoch": 0.37345415220386874, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 42821 + }, + { + "epoch": 0.37346287348903734, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 42822 + }, + { + "epoch": 0.37347159477420594, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 42823 + }, + { + "epoch": 0.3734803160593745, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 42824 + }, + { + "epoch": 0.3734890373445431, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 42825 + }, + { + "epoch": 0.3734977586297117, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 42826 + }, + { + "epoch": 0.37350647991488023, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 42827 + }, + { + "epoch": 0.37351520120004883, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 42828 + }, + { + "epoch": 0.37352392248521743, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 42829 + }, + { + "epoch": 0.37353264377038603, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 42830 + }, + { + "epoch": 0.3735413650555546, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 42831 + }, + { + "epoch": 0.3735500863407232, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9829, + "step": 42832 + }, + { + "epoch": 0.3735588076258918, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 42833 + }, + { + "epoch": 0.3735675289110603, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 42834 + }, + { + "epoch": 0.3735762501962289, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 42835 + }, + { + "epoch": 0.3735849714813975, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 42836 + }, + { + "epoch": 0.37359369276656607, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 42837 + }, + { + "epoch": 0.37360241405173467, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 42838 + }, + { + "epoch": 0.37361113533690327, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 42839 + }, + { + "epoch": 0.3736198566220718, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 42840 + }, + { + "epoch": 0.3736285779072404, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 42841 + }, + { + "epoch": 0.373637299192409, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 42842 + }, + { + "epoch": 0.37364602047757756, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 42843 + }, + { + "epoch": 0.37365474176274616, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 42844 + }, + { + "epoch": 0.37366346304791476, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 42845 + }, + { + "epoch": 0.3736721843330833, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 42846 + }, + { + "epoch": 0.3736809056182519, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 42847 + }, + { + "epoch": 0.3736896269034205, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 42848 + }, + { + "epoch": 0.37369834818858905, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 42849 + }, + { + "epoch": 0.37370706947375765, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 42850 + }, + { + "epoch": 0.37371579075892625, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 42851 + }, + { + "epoch": 0.3737245120440948, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 42852 + }, + { + "epoch": 0.3737332333292634, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 42853 + }, + { + "epoch": 0.373741954614432, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 42854 + }, + { + "epoch": 0.3737506758996006, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 42855 + }, + { + "epoch": 0.37375939718476914, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 42856 + }, + { + "epoch": 0.37376811846993774, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 42857 + }, + { + "epoch": 0.37377683975510634, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 42858 + }, + { + "epoch": 0.3737855610402749, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 42859 + }, + { + "epoch": 0.3737942823254435, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 42860 + }, + { + "epoch": 0.3738030036106121, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 42861 + }, + { + "epoch": 0.37381172489578063, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 42862 + }, + { + "epoch": 0.37382044618094923, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 42863 + }, + { + "epoch": 0.37382916746611783, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 42864 + }, + { + "epoch": 0.3738378887512864, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 42865 + }, + { + "epoch": 0.373846610036455, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 42866 + }, + { + "epoch": 0.3738553313216236, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 42867 + }, + { + "epoch": 0.3738640526067921, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 42868 + }, + { + "epoch": 0.3738727738919607, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 42869 + }, + { + "epoch": 0.3738814951771293, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 42870 + }, + { + "epoch": 0.37389021646229786, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 42871 + }, + { + "epoch": 0.37389893774746646, + "grad_norm": 0.416015625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 42872 + }, + { + "epoch": 0.37390765903263506, + "grad_norm": 0.453125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 42873 + }, + { + "epoch": 0.3739163803178036, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 42874 + }, + { + "epoch": 0.3739251016029722, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 42875 + }, + { + "epoch": 0.3739338228881408, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 42876 + }, + { + "epoch": 0.37394254417330935, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 42877 + }, + { + "epoch": 0.37395126545847795, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 42878 + }, + { + "epoch": 0.37395998674364656, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 42879 + }, + { + "epoch": 0.3739687080288151, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 42880 + }, + { + "epoch": 0.3739774293139837, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 42881 + }, + { + "epoch": 0.3739861505991523, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 42882 + }, + { + "epoch": 0.3739948718843209, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 42883 + }, + { + "epoch": 0.37400359316948945, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 42884 + }, + { + "epoch": 0.37401231445465805, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 42885 + }, + { + "epoch": 0.37402103573982665, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 42886 + }, + { + "epoch": 0.3740297570249952, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 42887 + }, + { + "epoch": 0.3740384783101638, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 0.9843, + "step": 42888 + }, + { + "epoch": 0.3740471995953324, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 42889 + }, + { + "epoch": 0.37405592088050094, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 42890 + }, + { + "epoch": 0.37406464216566954, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 42891 + }, + { + "epoch": 0.37407336345083814, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 42892 + }, + { + "epoch": 0.3740820847360067, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 42893 + }, + { + "epoch": 0.3740908060211753, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 42894 + }, + { + "epoch": 0.3740995273063439, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 42895 + }, + { + "epoch": 0.3741082485915124, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 42896 + }, + { + "epoch": 0.374116969876681, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 42897 + }, + { + "epoch": 0.3741256911618496, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 42898 + }, + { + "epoch": 0.37413441244701817, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 42899 + }, + { + "epoch": 0.3741431337321868, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 42900 + }, + { + "epoch": 0.3741518550173554, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 42901 + }, + { + "epoch": 0.3741605763025239, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 42902 + }, + { + "epoch": 0.3741692975876925, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 42903 + }, + { + "epoch": 0.3741780188728611, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 42904 + }, + { + "epoch": 0.37418674015802966, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 42905 + }, + { + "epoch": 0.37419546144319826, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 42906 + }, + { + "epoch": 0.37420418272836686, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 42907 + }, + { + "epoch": 0.3742129040135354, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 42908 + }, + { + "epoch": 0.374221625298704, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 42909 + }, + { + "epoch": 0.3742303465838726, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 42910 + }, + { + "epoch": 0.3742390678690412, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 42911 + }, + { + "epoch": 0.37424778915420975, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 42912 + }, + { + "epoch": 0.37425651043937835, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9805, + "step": 42913 + }, + { + "epoch": 0.37426523172454695, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 42914 + }, + { + "epoch": 0.3742739530097155, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 42915 + }, + { + "epoch": 0.3742826742948841, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0338, + "step": 42916 + }, + { + "epoch": 0.3742913955800527, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 42917 + }, + { + "epoch": 0.37430011686522124, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 42918 + }, + { + "epoch": 0.37430883815038984, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 42919 + }, + { + "epoch": 0.37431755943555844, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 42920 + }, + { + "epoch": 0.374326280720727, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 42921 + }, + { + "epoch": 0.3743350020058956, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 42922 + }, + { + "epoch": 0.3743437232910642, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 42923 + }, + { + "epoch": 0.37435244457623273, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 42924 + }, + { + "epoch": 0.37436116586140133, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 42925 + }, + { + "epoch": 0.37436988714656994, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 42926 + }, + { + "epoch": 0.3743786084317385, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 42927 + }, + { + "epoch": 0.3743873297169071, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 42928 + }, + { + "epoch": 0.3743960510020757, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 42929 + }, + { + "epoch": 0.3744047722872442, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9839, + "step": 42930 + }, + { + "epoch": 0.3744134935724128, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 42931 + }, + { + "epoch": 0.3744222148575814, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 42932 + }, + { + "epoch": 0.37443093614274997, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 42933 + }, + { + "epoch": 0.37443965742791857, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 42934 + }, + { + "epoch": 0.37444837871308717, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 42935 + }, + { + "epoch": 0.3744570999982557, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 42936 + }, + { + "epoch": 0.3744658212834243, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0314, + "step": 42937 + }, + { + "epoch": 0.3744745425685929, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 42938 + }, + { + "epoch": 0.3744832638537615, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 42939 + }, + { + "epoch": 0.37449198513893006, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 42940 + }, + { + "epoch": 0.37450070642409866, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 42941 + }, + { + "epoch": 0.37450942770926726, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 42942 + }, + { + "epoch": 0.3745181489944358, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 42943 + }, + { + "epoch": 0.3745268702796044, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 42944 + }, + { + "epoch": 0.374535591564773, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 42945 + }, + { + "epoch": 0.37454431284994155, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 42946 + }, + { + "epoch": 0.37455303413511015, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 42947 + }, + { + "epoch": 0.37456175542027875, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 42948 + }, + { + "epoch": 0.3745704767054473, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 42949 + }, + { + "epoch": 0.3745791979906159, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9817, + "step": 42950 + }, + { + "epoch": 0.3745879192757845, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 42951 + }, + { + "epoch": 0.37459664056095304, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 42952 + }, + { + "epoch": 0.37460536184612164, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 42953 + }, + { + "epoch": 0.37461408313129024, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 42954 + }, + { + "epoch": 0.3746228044164588, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 42955 + }, + { + "epoch": 0.3746315257016274, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 42956 + }, + { + "epoch": 0.374640246986796, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 42957 + }, + { + "epoch": 0.37464896827196453, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 42958 + }, + { + "epoch": 0.37465768955713313, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 42959 + }, + { + "epoch": 0.37466641084230173, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 42960 + }, + { + "epoch": 0.3746751321274703, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 42961 + }, + { + "epoch": 0.3746838534126389, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 42962 + }, + { + "epoch": 0.3746925746978075, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 42963 + }, + { + "epoch": 0.374701295982976, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 42964 + }, + { + "epoch": 0.3747100172681446, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 42965 + }, + { + "epoch": 0.3747187385533132, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 42966 + }, + { + "epoch": 0.3747274598384818, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 42967 + }, + { + "epoch": 0.37473618112365037, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 42968 + }, + { + "epoch": 0.37474490240881897, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 42969 + }, + { + "epoch": 0.37475362369398757, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 42970 + }, + { + "epoch": 0.3747623449791561, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 42971 + }, + { + "epoch": 0.3747710662643247, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 42972 + }, + { + "epoch": 0.3747797875494933, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 42973 + }, + { + "epoch": 0.37478850883466186, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 42974 + }, + { + "epoch": 0.37479723011983046, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 42975 + }, + { + "epoch": 0.37480595140499906, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 42976 + }, + { + "epoch": 0.3748146726901676, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 42977 + }, + { + "epoch": 0.3748233939753362, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 42978 + }, + { + "epoch": 0.3748321152605048, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 42979 + }, + { + "epoch": 0.37484083654567335, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 42980 + }, + { + "epoch": 0.37484955783084195, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 42981 + }, + { + "epoch": 0.37485827911601055, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 42982 + }, + { + "epoch": 0.3748670004011791, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 42983 + }, + { + "epoch": 0.3748757216863477, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 42984 + }, + { + "epoch": 0.3748844429715163, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 42985 + }, + { + "epoch": 0.37489316425668484, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 42986 + }, + { + "epoch": 0.37490188554185344, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 42987 + }, + { + "epoch": 0.37491060682702204, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 42988 + }, + { + "epoch": 0.3749193281121906, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 0.9849, + "step": 42989 + }, + { + "epoch": 0.3749280493973592, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 42990 + }, + { + "epoch": 0.3749367706825278, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 42991 + }, + { + "epoch": 0.3749454919676964, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 42992 + }, + { + "epoch": 0.37495421325286493, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 42993 + }, + { + "epoch": 0.37496293453803353, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 42994 + }, + { + "epoch": 0.37497165582320213, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 42995 + }, + { + "epoch": 0.3749803771083707, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 42996 + }, + { + "epoch": 0.3749890983935393, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9862, + "step": 42997 + }, + { + "epoch": 0.3749978196787079, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 42998 + }, + { + "epoch": 0.3750065409638764, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 42999 + }, + { + "epoch": 0.375015262249045, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 43000 + }, + { + "epoch": 0.3750239835342136, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9835, + "step": 43001 + }, + { + "epoch": 0.37503270481938217, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 43002 + }, + { + "epoch": 0.37504142610455077, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 43003 + }, + { + "epoch": 0.37505014738971937, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 0.9831, + "step": 43004 + }, + { + "epoch": 0.3750588686748879, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9818, + "step": 43005 + }, + { + "epoch": 0.3750675899600565, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 43006 + }, + { + "epoch": 0.3750763112452251, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 43007 + }, + { + "epoch": 0.37508503253039366, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 43008 + }, + { + "epoch": 0.37509375381556226, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 43009 + }, + { + "epoch": 0.37510247510073086, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 43010 + }, + { + "epoch": 0.3751111963858994, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 43011 + }, + { + "epoch": 0.375119917671068, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 43012 + }, + { + "epoch": 0.3751286389562366, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 43013 + }, + { + "epoch": 0.37513736024140515, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 43014 + }, + { + "epoch": 0.37514608152657375, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 43015 + }, + { + "epoch": 0.37515480281174235, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 43016 + }, + { + "epoch": 0.3751635240969109, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 43017 + }, + { + "epoch": 0.3751722453820795, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 43018 + }, + { + "epoch": 0.3751809666672481, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 43019 + }, + { + "epoch": 0.3751896879524167, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 43020 + }, + { + "epoch": 0.37519840923758524, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 43021 + }, + { + "epoch": 0.37520713052275384, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 43022 + }, + { + "epoch": 0.37521585180792244, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 43023 + }, + { + "epoch": 0.375224573093091, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 43024 + }, + { + "epoch": 0.3752332943782596, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 43025 + }, + { + "epoch": 0.3752420156634282, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 43026 + }, + { + "epoch": 0.37525073694859673, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 43027 + }, + { + "epoch": 0.37525945823376533, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 43028 + }, + { + "epoch": 0.37526817951893393, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 43029 + }, + { + "epoch": 0.3752769008041025, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 43030 + }, + { + "epoch": 0.3752856220892711, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 43031 + }, + { + "epoch": 0.3752943433744397, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 43032 + }, + { + "epoch": 0.3753030646596082, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 43033 + }, + { + "epoch": 0.3753117859447768, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 43034 + }, + { + "epoch": 0.3753205072299454, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 43035 + }, + { + "epoch": 0.37532922851511397, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 43036 + }, + { + "epoch": 0.37533794980028257, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 43037 + }, + { + "epoch": 0.37534667108545117, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 43038 + }, + { + "epoch": 0.3753553923706197, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 43039 + }, + { + "epoch": 0.3753641136557883, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 43040 + }, + { + "epoch": 0.3753728349409569, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 43041 + }, + { + "epoch": 0.37538155622612546, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 43042 + }, + { + "epoch": 0.37539027751129406, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 43043 + }, + { + "epoch": 0.37539899879646266, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 43044 + }, + { + "epoch": 0.3754077200816312, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 43045 + }, + { + "epoch": 0.3754164413667998, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 43046 + }, + { + "epoch": 0.3754251626519684, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 43047 + }, + { + "epoch": 0.375433883937137, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 43048 + }, + { + "epoch": 0.37544260522230555, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 43049 + }, + { + "epoch": 0.37545132650747415, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 43050 + }, + { + "epoch": 0.37546004779264275, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 43051 + }, + { + "epoch": 0.3754687690778113, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 43052 + }, + { + "epoch": 0.3754774903629799, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 43053 + }, + { + "epoch": 0.3754862116481485, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 43054 + }, + { + "epoch": 0.37549493293331704, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 43055 + }, + { + "epoch": 0.37550365421848564, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9813, + "step": 43056 + }, + { + "epoch": 0.37551237550365424, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 43057 + }, + { + "epoch": 0.3755210967888228, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 43058 + }, + { + "epoch": 0.3755298180739914, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0348, + "step": 43059 + }, + { + "epoch": 0.37553853935916, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 43060 + }, + { + "epoch": 0.37554726064432853, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 43061 + }, + { + "epoch": 0.37555598192949713, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 43062 + }, + { + "epoch": 0.37556470321466573, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 43063 + }, + { + "epoch": 0.3755734244998343, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 43064 + }, + { + "epoch": 0.3755821457850029, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 43065 + }, + { + "epoch": 0.3755908670701715, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 43066 + }, + { + "epoch": 0.37559958835534, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 43067 + }, + { + "epoch": 0.3756083096405086, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 43068 + }, + { + "epoch": 0.3756170309256772, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 43069 + }, + { + "epoch": 0.37562575221084576, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 43070 + }, + { + "epoch": 0.37563447349601436, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 43071 + }, + { + "epoch": 0.37564319478118297, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 43072 + }, + { + "epoch": 0.3756519160663515, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 43073 + }, + { + "epoch": 0.3756606373515201, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 43074 + }, + { + "epoch": 0.3756693586366887, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 43075 + }, + { + "epoch": 0.3756780799218573, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 43076 + }, + { + "epoch": 0.37568680120702586, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 43077 + }, + { + "epoch": 0.37569552249219446, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 43078 + }, + { + "epoch": 0.37570424377736306, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 43079 + }, + { + "epoch": 0.3757129650625316, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 43080 + }, + { + "epoch": 0.3757216863477002, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 43081 + }, + { + "epoch": 0.3757304076328688, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 43082 + }, + { + "epoch": 0.37573912891803735, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 43083 + }, + { + "epoch": 0.37574785020320595, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 43084 + }, + { + "epoch": 0.37575657148837455, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 43085 + }, + { + "epoch": 0.3757652927735431, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 43086 + }, + { + "epoch": 0.3757740140587117, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 43087 + }, + { + "epoch": 0.3757827353438803, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 43088 + }, + { + "epoch": 0.37579145662904884, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 43089 + }, + { + "epoch": 0.37580017791421744, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 43090 + }, + { + "epoch": 0.37580889919938604, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 43091 + }, + { + "epoch": 0.3758176204845546, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 43092 + }, + { + "epoch": 0.3758263417697232, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 43093 + }, + { + "epoch": 0.3758350630548918, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 43094 + }, + { + "epoch": 0.3758437843400603, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 43095 + }, + { + "epoch": 0.3758525056252289, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 43096 + }, + { + "epoch": 0.37586122691039753, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 43097 + }, + { + "epoch": 0.3758699481955661, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 43098 + }, + { + "epoch": 0.3758786694807347, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 43099 + }, + { + "epoch": 0.3758873907659033, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 43100 + }, + { + "epoch": 0.3758961120510719, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 43101 + }, + { + "epoch": 0.3759048333362404, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 43102 + }, + { + "epoch": 0.375913554621409, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 43103 + }, + { + "epoch": 0.3759222759065776, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 43104 + }, + { + "epoch": 0.37593099719174616, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 43105 + }, + { + "epoch": 0.37593971847691476, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 43106 + }, + { + "epoch": 0.37594843976208336, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 43107 + }, + { + "epoch": 0.3759571610472519, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 43108 + }, + { + "epoch": 0.3759658823324205, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 43109 + }, + { + "epoch": 0.3759746036175891, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 43110 + }, + { + "epoch": 0.37598332490275765, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 43111 + }, + { + "epoch": 0.37599204618792625, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 43112 + }, + { + "epoch": 0.37600076747309485, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 43113 + }, + { + "epoch": 0.3760094887582634, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 43114 + }, + { + "epoch": 0.376018210043432, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 43115 + }, + { + "epoch": 0.3760269313286006, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 43116 + }, + { + "epoch": 0.37603565261376914, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 43117 + }, + { + "epoch": 0.37604437389893774, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 43118 + }, + { + "epoch": 0.37605309518410635, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 43119 + }, + { + "epoch": 0.3760618164692749, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 43120 + }, + { + "epoch": 0.3760705377544435, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 43121 + }, + { + "epoch": 0.3760792590396121, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 43122 + }, + { + "epoch": 0.37608798032478064, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 43123 + }, + { + "epoch": 0.37609670160994924, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 43124 + }, + { + "epoch": 0.37610542289511784, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 43125 + }, + { + "epoch": 0.3761141441802864, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 43126 + }, + { + "epoch": 0.376122865465455, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 43127 + }, + { + "epoch": 0.3761315867506236, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 43128 + }, + { + "epoch": 0.3761403080357922, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 43129 + }, + { + "epoch": 0.3761490293209607, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 43130 + }, + { + "epoch": 0.3761577506061293, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 43131 + }, + { + "epoch": 0.3761664718912979, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 43132 + }, + { + "epoch": 0.37617519317646647, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 43133 + }, + { + "epoch": 0.37618391446163507, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 43134 + }, + { + "epoch": 0.37619263574680367, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 43135 + }, + { + "epoch": 0.3762013570319722, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 43136 + }, + { + "epoch": 0.3762100783171408, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 43137 + }, + { + "epoch": 0.3762187996023094, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 43138 + }, + { + "epoch": 0.37622752088747796, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 43139 + }, + { + "epoch": 0.37623624217264656, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 43140 + }, + { + "epoch": 0.37624496345781516, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 43141 + }, + { + "epoch": 0.3762536847429837, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 43142 + }, + { + "epoch": 0.3762624060281523, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 43143 + }, + { + "epoch": 0.3762711273133209, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 43144 + }, + { + "epoch": 0.37627984859848945, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 43145 + }, + { + "epoch": 0.37628856988365805, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 43146 + }, + { + "epoch": 0.37629729116882665, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 43147 + }, + { + "epoch": 0.3763060124539952, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 43148 + }, + { + "epoch": 0.3763147337391638, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 43149 + }, + { + "epoch": 0.3763234550243324, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 43150 + }, + { + "epoch": 0.37633217630950094, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 43151 + }, + { + "epoch": 0.37634089759466954, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 43152 + }, + { + "epoch": 0.37634961887983814, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 43153 + }, + { + "epoch": 0.3763583401650067, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 43154 + }, + { + "epoch": 0.3763670614501753, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 43155 + }, + { + "epoch": 0.3763757827353439, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 43156 + }, + { + "epoch": 0.3763845040205125, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 43157 + }, + { + "epoch": 0.37639322530568103, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 43158 + }, + { + "epoch": 0.37640194659084963, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 43159 + }, + { + "epoch": 0.37641066787601823, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 43160 + }, + { + "epoch": 0.3764193891611868, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 43161 + }, + { + "epoch": 0.3764281104463554, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 43162 + }, + { + "epoch": 0.376436831731524, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 43163 + }, + { + "epoch": 0.3764455530166925, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 43164 + }, + { + "epoch": 0.3764542743018611, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 43165 + }, + { + "epoch": 0.3764629955870297, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 43166 + }, + { + "epoch": 0.37647171687219827, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 43167 + }, + { + "epoch": 0.37648043815736687, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 43168 + }, + { + "epoch": 0.37648915944253547, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 43169 + }, + { + "epoch": 0.376497880727704, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 43170 + }, + { + "epoch": 0.3765066020128726, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 43171 + }, + { + "epoch": 0.3765153232980412, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 43172 + }, + { + "epoch": 0.37652404458320976, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 43173 + }, + { + "epoch": 0.37653276586837836, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 43174 + }, + { + "epoch": 0.37654148715354696, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 43175 + }, + { + "epoch": 0.3765502084387155, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 43176 + }, + { + "epoch": 0.3765589297238841, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 43177 + }, + { + "epoch": 0.3765676510090527, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 43178 + }, + { + "epoch": 0.37657637229422125, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 43179 + }, + { + "epoch": 0.37658509357938985, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 43180 + }, + { + "epoch": 0.37659381486455845, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 43181 + }, + { + "epoch": 0.376602536149727, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 43182 + }, + { + "epoch": 0.3766112574348956, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 43183 + }, + { + "epoch": 0.3766199787200642, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 43184 + }, + { + "epoch": 0.3766287000052328, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 43185 + }, + { + "epoch": 0.37663742129040134, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 43186 + }, + { + "epoch": 0.37664614257556994, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 43187 + }, + { + "epoch": 0.37665486386073854, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 43188 + }, + { + "epoch": 0.3766635851459071, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 43189 + }, + { + "epoch": 0.3766723064310757, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 43190 + }, + { + "epoch": 0.3766810277162443, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 43191 + }, + { + "epoch": 0.37668974900141283, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 43192 + }, + { + "epoch": 0.37669847028658143, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 43193 + }, + { + "epoch": 0.37670719157175003, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 43194 + }, + { + "epoch": 0.3767159128569186, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 43195 + }, + { + "epoch": 0.3767246341420872, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 43196 + }, + { + "epoch": 0.3767333554272558, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 43197 + }, + { + "epoch": 0.3767420767124243, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 43198 + }, + { + "epoch": 0.3767507979975929, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 43199 + }, + { + "epoch": 0.3767595192827615, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 43200 + }, + { + "epoch": 0.37676824056793007, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 43201 + }, + { + "epoch": 0.37677696185309867, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 43202 + }, + { + "epoch": 0.37678568313826727, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 43203 + }, + { + "epoch": 0.3767944044234358, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 43204 + }, + { + "epoch": 0.3768031257086044, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 43205 + }, + { + "epoch": 0.376811846993773, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 43206 + }, + { + "epoch": 0.37682056827894156, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 43207 + }, + { + "epoch": 0.37682928956411016, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 43208 + }, + { + "epoch": 0.37683801084927876, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 43209 + }, + { + "epoch": 0.37684673213444736, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 43210 + }, + { + "epoch": 0.3768554534196159, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 43211 + }, + { + "epoch": 0.3768641747047845, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 43212 + }, + { + "epoch": 0.3768728959899531, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 43213 + }, + { + "epoch": 0.37688161727512165, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 43214 + }, + { + "epoch": 0.37689033856029025, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 43215 + }, + { + "epoch": 0.37689905984545885, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 43216 + }, + { + "epoch": 0.3769077811306274, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 43217 + }, + { + "epoch": 0.376916502415796, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 43218 + }, + { + "epoch": 0.3769252237009646, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 43219 + }, + { + "epoch": 0.37693394498613314, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 43220 + }, + { + "epoch": 0.37694266627130174, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 43221 + }, + { + "epoch": 0.37695138755647034, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 43222 + }, + { + "epoch": 0.3769601088416389, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 43223 + }, + { + "epoch": 0.3769688301268075, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 43224 + }, + { + "epoch": 0.3769775514119761, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9835, + "step": 43225 + }, + { + "epoch": 0.37698627269714463, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 43226 + }, + { + "epoch": 0.37699499398231323, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 43227 + }, + { + "epoch": 0.37700371526748183, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 43228 + }, + { + "epoch": 0.3770124365526504, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 43229 + }, + { + "epoch": 0.377021157837819, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 43230 + }, + { + "epoch": 0.3770298791229876, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 43231 + }, + { + "epoch": 0.3770386004081561, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 43232 + }, + { + "epoch": 0.3770473216933247, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 43233 + }, + { + "epoch": 0.3770560429784933, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 43234 + }, + { + "epoch": 0.37706476426366187, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 43235 + }, + { + "epoch": 0.37707348554883047, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 43236 + }, + { + "epoch": 0.37708220683399907, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 43237 + }, + { + "epoch": 0.37709092811916767, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 43238 + }, + { + "epoch": 0.3770996494043362, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 43239 + }, + { + "epoch": 0.3771083706895048, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 43240 + }, + { + "epoch": 0.3771170919746734, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 43241 + }, + { + "epoch": 0.37712581325984196, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 43242 + }, + { + "epoch": 0.37713453454501056, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 43243 + }, + { + "epoch": 0.37714325583017916, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 43244 + }, + { + "epoch": 0.3771519771153477, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 43245 + }, + { + "epoch": 0.3771606984005163, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 43246 + }, + { + "epoch": 0.3771694196856849, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 43247 + }, + { + "epoch": 0.37717814097085345, + "grad_norm": 0.296875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 43248 + }, + { + "epoch": 0.37718686225602205, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 43249 + }, + { + "epoch": 0.37719558354119065, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 43250 + }, + { + "epoch": 0.3772043048263592, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 43251 + }, + { + "epoch": 0.3772130261115278, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 43252 + }, + { + "epoch": 0.3772217473966964, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 43253 + }, + { + "epoch": 0.37723046868186494, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 43254 + }, + { + "epoch": 0.37723918996703354, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 43255 + }, + { + "epoch": 0.37724791125220214, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 43256 + }, + { + "epoch": 0.3772566325373707, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 43257 + }, + { + "epoch": 0.3772653538225393, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 43258 + }, + { + "epoch": 0.3772740751077079, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 43259 + }, + { + "epoch": 0.37728279639287643, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 43260 + }, + { + "epoch": 0.37729151767804503, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 43261 + }, + { + "epoch": 0.37730023896321363, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 43262 + }, + { + "epoch": 0.3773089602483822, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 43263 + }, + { + "epoch": 0.3773176815335508, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 43264 + }, + { + "epoch": 0.3773264028187194, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 43265 + }, + { + "epoch": 0.377335124103888, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 43266 + }, + { + "epoch": 0.3773438453890565, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 43267 + }, + { + "epoch": 0.3773525666742251, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 43268 + }, + { + "epoch": 0.3773612879593937, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 43269 + }, + { + "epoch": 0.37737000924456227, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 43270 + }, + { + "epoch": 0.37737873052973087, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 43271 + }, + { + "epoch": 0.37738745181489947, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 43272 + }, + { + "epoch": 0.377396173100068, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 43273 + }, + { + "epoch": 0.3774048943852366, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 43274 + }, + { + "epoch": 0.3774136156704052, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 43275 + }, + { + "epoch": 0.37742233695557376, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 43276 + }, + { + "epoch": 0.37743105824074236, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 43277 + }, + { + "epoch": 0.37743977952591096, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 43278 + }, + { + "epoch": 0.3774485008110795, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 43279 + }, + { + "epoch": 0.3774572220962481, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 43280 + }, + { + "epoch": 0.3774659433814167, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 43281 + }, + { + "epoch": 0.37747466466658525, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 43282 + }, + { + "epoch": 0.37748338595175385, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 43283 + }, + { + "epoch": 0.37749210723692245, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 43284 + }, + { + "epoch": 0.377500828522091, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 43285 + }, + { + "epoch": 0.3775095498072596, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 43286 + }, + { + "epoch": 0.3775182710924282, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 43287 + }, + { + "epoch": 0.37752699237759674, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 43288 + }, + { + "epoch": 0.37753571366276534, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 43289 + }, + { + "epoch": 0.37754443494793394, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 43290 + }, + { + "epoch": 0.3775531562331025, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 43291 + }, + { + "epoch": 0.3775618775182711, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 43292 + }, + { + "epoch": 0.3775705988034397, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 43293 + }, + { + "epoch": 0.3775793200886083, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 43294 + }, + { + "epoch": 0.37758804137377683, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 43295 + }, + { + "epoch": 0.37759676265894543, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 43296 + }, + { + "epoch": 0.37760548394411403, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 43297 + }, + { + "epoch": 0.3776142052292826, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 43298 + }, + { + "epoch": 0.3776229265144512, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 43299 + }, + { + "epoch": 0.3776316477996198, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 43300 + }, + { + "epoch": 0.3776403690847883, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 43301 + }, + { + "epoch": 0.3776490903699569, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 43302 + }, + { + "epoch": 0.3776578116551255, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 43303 + }, + { + "epoch": 0.37766653294029406, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 43304 + }, + { + "epoch": 0.37767525422546266, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 43305 + }, + { + "epoch": 0.37768397551063126, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 43306 + }, + { + "epoch": 0.3776926967957998, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 43307 + }, + { + "epoch": 0.3777014180809684, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 43308 + }, + { + "epoch": 0.377710139366137, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 43309 + }, + { + "epoch": 0.37771886065130555, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 43310 + }, + { + "epoch": 0.37772758193647415, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0322, + "step": 43311 + }, + { + "epoch": 0.37773630322164276, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 43312 + }, + { + "epoch": 0.3777450245068113, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 43313 + }, + { + "epoch": 0.3777537457919799, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 43314 + }, + { + "epoch": 0.3777624670771485, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 43315 + }, + { + "epoch": 0.37777118836231705, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 43316 + }, + { + "epoch": 0.37777990964748565, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 43317 + }, + { + "epoch": 0.37778863093265425, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 43318 + }, + { + "epoch": 0.37779735221782285, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 43319 + }, + { + "epoch": 0.3778060735029914, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 43320 + }, + { + "epoch": 0.37781479478816, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 43321 + }, + { + "epoch": 0.3778235160733286, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 43322 + }, + { + "epoch": 0.37783223735849714, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 43323 + }, + { + "epoch": 0.37784095864366574, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 43324 + }, + { + "epoch": 0.37784967992883434, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 43325 + }, + { + "epoch": 0.3778584012140029, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 43326 + }, + { + "epoch": 0.3778671224991715, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 43327 + }, + { + "epoch": 0.3778758437843401, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 43328 + }, + { + "epoch": 0.3778845650695086, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 43329 + }, + { + "epoch": 0.3778932863546772, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0333, + "step": 43330 + }, + { + "epoch": 0.3779020076398458, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 43331 + }, + { + "epoch": 0.37791072892501437, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 43332 + }, + { + "epoch": 0.37791945021018297, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 43333 + }, + { + "epoch": 0.3779281714953516, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 43334 + }, + { + "epoch": 0.3779368927805201, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 43335 + }, + { + "epoch": 0.3779456140656887, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 43336 + }, + { + "epoch": 0.3779543353508573, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 43337 + }, + { + "epoch": 0.37796305663602586, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 43338 + }, + { + "epoch": 0.37797177792119446, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9782, + "step": 43339 + }, + { + "epoch": 0.37798049920636306, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 43340 + }, + { + "epoch": 0.3779892204915316, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 43341 + }, + { + "epoch": 0.3779979417767002, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 43342 + }, + { + "epoch": 0.3780066630618688, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 43343 + }, + { + "epoch": 0.37801538434703735, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 43344 + }, + { + "epoch": 0.37802410563220595, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 43345 + }, + { + "epoch": 0.37803282691737455, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 43346 + }, + { + "epoch": 0.37804154820254315, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 43347 + }, + { + "epoch": 0.3780502694877117, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 43348 + }, + { + "epoch": 0.3780589907728803, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 43349 + }, + { + "epoch": 0.3780677120580489, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 43350 + }, + { + "epoch": 0.37807643334321744, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9813, + "step": 43351 + }, + { + "epoch": 0.37808515462838604, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 43352 + }, + { + "epoch": 0.37809387591355464, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 43353 + }, + { + "epoch": 0.3781025971987232, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 43354 + }, + { + "epoch": 0.3781113184838918, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 43355 + }, + { + "epoch": 0.3781200397690604, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 43356 + }, + { + "epoch": 0.37812876105422893, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 43357 + }, + { + "epoch": 0.37813748233939753, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 43358 + }, + { + "epoch": 0.37814620362456614, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 43359 + }, + { + "epoch": 0.3781549249097347, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 43360 + }, + { + "epoch": 0.3781636461949033, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 43361 + }, + { + "epoch": 0.3781723674800719, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 43362 + }, + { + "epoch": 0.3781810887652404, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 43363 + }, + { + "epoch": 0.378189810050409, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 43364 + }, + { + "epoch": 0.3781985313355776, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 43365 + }, + { + "epoch": 0.37820725262074617, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9762, + "step": 43366 + }, + { + "epoch": 0.37821597390591477, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9841, + "step": 43367 + }, + { + "epoch": 0.37822469519108337, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 43368 + }, + { + "epoch": 0.3782334164762519, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 43369 + }, + { + "epoch": 0.3782421377614205, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 43370 + }, + { + "epoch": 0.3782508590465891, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 43371 + }, + { + "epoch": 0.37825958033175766, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 43372 + }, + { + "epoch": 0.37826830161692626, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 43373 + }, + { + "epoch": 0.37827702290209486, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 43374 + }, + { + "epoch": 0.37828574418726346, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 43375 + }, + { + "epoch": 0.378294465472432, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9861, + "step": 43376 + }, + { + "epoch": 0.3783031867576006, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 43377 + }, + { + "epoch": 0.3783119080427692, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 43378 + }, + { + "epoch": 0.37832062932793775, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 43379 + }, + { + "epoch": 0.37832935061310635, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 43380 + }, + { + "epoch": 0.37833807189827495, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 43381 + }, + { + "epoch": 0.3783467931834435, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 43382 + }, + { + "epoch": 0.3783555144686121, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 43383 + }, + { + "epoch": 0.3783642357537807, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 43384 + }, + { + "epoch": 0.37837295703894924, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 43385 + }, + { + "epoch": 0.37838167832411784, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 43386 + }, + { + "epoch": 0.37839039960928644, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 43387 + }, + { + "epoch": 0.378399120894455, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 43388 + }, + { + "epoch": 0.3784078421796236, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 43389 + }, + { + "epoch": 0.3784165634647922, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 43390 + }, + { + "epoch": 0.37842528474996073, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 43391 + }, + { + "epoch": 0.37843400603512933, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 43392 + }, + { + "epoch": 0.37844272732029793, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 43393 + }, + { + "epoch": 0.3784514486054665, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 43394 + }, + { + "epoch": 0.3784601698906351, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 43395 + }, + { + "epoch": 0.3784688911758037, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 43396 + }, + { + "epoch": 0.3784776124609722, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 43397 + }, + { + "epoch": 0.3784863337461408, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 43398 + }, + { + "epoch": 0.3784950550313094, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 43399 + }, + { + "epoch": 0.37850377631647797, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 43400 + }, + { + "epoch": 0.37851249760164657, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 43401 + }, + { + "epoch": 0.37852121888681517, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 43402 + }, + { + "epoch": 0.37852994017198377, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 43403 + }, + { + "epoch": 0.3785386614571523, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 43404 + }, + { + "epoch": 0.3785473827423209, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 43405 + }, + { + "epoch": 0.3785561040274895, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 43406 + }, + { + "epoch": 0.37856482531265806, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 43407 + }, + { + "epoch": 0.37857354659782666, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 43408 + }, + { + "epoch": 0.37858226788299526, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 43409 + }, + { + "epoch": 0.3785909891681638, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 43410 + }, + { + "epoch": 0.3785997104533324, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 43411 + }, + { + "epoch": 0.378608431738501, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 43412 + }, + { + "epoch": 0.37861715302366955, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 43413 + }, + { + "epoch": 0.37862587430883815, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 43414 + }, + { + "epoch": 0.37863459559400675, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 43415 + }, + { + "epoch": 0.3786433168791753, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 43416 + }, + { + "epoch": 0.3786520381643439, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 43417 + }, + { + "epoch": 0.3786607594495125, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 43418 + }, + { + "epoch": 0.37866948073468104, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 43419 + }, + { + "epoch": 0.37867820201984964, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 43420 + }, + { + "epoch": 0.37868692330501824, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 43421 + }, + { + "epoch": 0.3786956445901868, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 43422 + }, + { + "epoch": 0.3787043658753554, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 43423 + }, + { + "epoch": 0.378713087160524, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 43424 + }, + { + "epoch": 0.37872180844569253, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 43425 + }, + { + "epoch": 0.37873052973086113, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 43426 + }, + { + "epoch": 0.37873925101602973, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 43427 + }, + { + "epoch": 0.3787479723011983, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 43428 + }, + { + "epoch": 0.3787566935863669, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 43429 + }, + { + "epoch": 0.3787654148715355, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 43430 + }, + { + "epoch": 0.3787741361567041, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 43431 + }, + { + "epoch": 0.3787828574418726, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 43432 + }, + { + "epoch": 0.3787915787270412, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 43433 + }, + { + "epoch": 0.3788003000122098, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 43434 + }, + { + "epoch": 0.37880902129737837, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 43435 + }, + { + "epoch": 0.37881774258254697, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 43436 + }, + { + "epoch": 0.37882646386771557, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 43437 + }, + { + "epoch": 0.3788351851528841, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 43438 + }, + { + "epoch": 0.3788439064380527, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 43439 + }, + { + "epoch": 0.3788526277232213, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 43440 + }, + { + "epoch": 0.37886134900838986, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 43441 + }, + { + "epoch": 0.37887007029355846, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 43442 + }, + { + "epoch": 0.37887879157872706, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 43443 + }, + { + "epoch": 0.3788875128638956, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 43444 + }, + { + "epoch": 0.3788962341490642, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 43445 + }, + { + "epoch": 0.3789049554342328, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 43446 + }, + { + "epoch": 0.37891367671940135, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 43447 + }, + { + "epoch": 0.37892239800456995, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 43448 + }, + { + "epoch": 0.37893111928973855, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 43449 + }, + { + "epoch": 0.3789398405749071, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 43450 + }, + { + "epoch": 0.3789485618600757, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 43451 + }, + { + "epoch": 0.3789572831452443, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 43452 + }, + { + "epoch": 0.37896600443041284, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 43453 + }, + { + "epoch": 0.37897472571558144, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 43454 + }, + { + "epoch": 0.37898344700075004, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 43455 + }, + { + "epoch": 0.37899216828591864, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 43456 + }, + { + "epoch": 0.3790008895710872, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 43457 + }, + { + "epoch": 0.3790096108562558, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 43458 + }, + { + "epoch": 0.3790183321414244, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 43459 + }, + { + "epoch": 0.37902705342659293, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 43460 + }, + { + "epoch": 0.37903577471176153, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 43461 + }, + { + "epoch": 0.37904449599693013, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 43462 + }, + { + "epoch": 0.3790532172820987, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 43463 + }, + { + "epoch": 0.3790619385672673, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 43464 + }, + { + "epoch": 0.3790706598524359, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 43465 + }, + { + "epoch": 0.3790793811376044, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 43466 + }, + { + "epoch": 0.379088102422773, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 43467 + }, + { + "epoch": 0.3790968237079416, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 43468 + }, + { + "epoch": 0.37910554499311017, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 43469 + }, + { + "epoch": 0.37911426627827877, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 43470 + }, + { + "epoch": 0.37912298756344737, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 43471 + }, + { + "epoch": 0.3791317088486159, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 43472 + }, + { + "epoch": 0.3791404301337845, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 43473 + }, + { + "epoch": 0.3791491514189531, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 43474 + }, + { + "epoch": 0.37915787270412166, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 43475 + }, + { + "epoch": 0.37916659398929026, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 43476 + }, + { + "epoch": 0.37917531527445886, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 43477 + }, + { + "epoch": 0.3791840365596274, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 43478 + }, + { + "epoch": 0.379192757844796, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 43479 + }, + { + "epoch": 0.3792014791299646, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 43480 + }, + { + "epoch": 0.37921020041513315, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 43481 + }, + { + "epoch": 0.37921892170030175, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 43482 + }, + { + "epoch": 0.37922764298547035, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 43483 + }, + { + "epoch": 0.37923636427063895, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 43484 + }, + { + "epoch": 0.3792450855558075, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 43485 + }, + { + "epoch": 0.3792538068409761, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 43486 + }, + { + "epoch": 0.3792625281261447, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 43487 + }, + { + "epoch": 0.37927124941131324, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 43488 + }, + { + "epoch": 0.37927997069648184, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 43489 + }, + { + "epoch": 0.37928869198165044, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 43490 + }, + { + "epoch": 0.379297413266819, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 43491 + }, + { + "epoch": 0.3793061345519876, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 43492 + }, + { + "epoch": 0.3793148558371562, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 43493 + }, + { + "epoch": 0.37932357712232473, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 43494 + }, + { + "epoch": 0.37933229840749333, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 43495 + }, + { + "epoch": 0.37934101969266193, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 43496 + }, + { + "epoch": 0.3793497409778305, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 43497 + }, + { + "epoch": 0.3793584622629991, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 43498 + }, + { + "epoch": 0.3793671835481677, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 43499 + }, + { + "epoch": 0.3793759048333362, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 43500 + }, + { + "epoch": 0.3793846261185048, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 43501 + }, + { + "epoch": 0.3793933474036734, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 43502 + }, + { + "epoch": 0.37940206868884196, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 43503 + }, + { + "epoch": 0.37941078997401056, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 43504 + }, + { + "epoch": 0.37941951125917917, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 43505 + }, + { + "epoch": 0.3794282325443477, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 43506 + }, + { + "epoch": 0.3794369538295163, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 43507 + }, + { + "epoch": 0.3794456751146849, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 43508 + }, + { + "epoch": 0.37945439639985346, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 43509 + }, + { + "epoch": 0.37946311768502206, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 43510 + }, + { + "epoch": 0.37947183897019066, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 43511 + }, + { + "epoch": 0.37948056025535926, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 43512 + }, + { + "epoch": 0.3794892815405278, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 43513 + }, + { + "epoch": 0.3794980028256964, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9842, + "step": 43514 + }, + { + "epoch": 0.379506724110865, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 43515 + }, + { + "epoch": 0.37951544539603355, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 43516 + }, + { + "epoch": 0.37952416668120215, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 43517 + }, + { + "epoch": 0.37953288796637075, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 43518 + }, + { + "epoch": 0.3795416092515393, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 43519 + }, + { + "epoch": 0.3795503305367079, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 43520 + }, + { + "epoch": 0.3795590518218765, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 43521 + }, + { + "epoch": 0.37956777310704504, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 43522 + }, + { + "epoch": 0.37957649439221364, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 43523 + }, + { + "epoch": 0.37958521567738224, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 43524 + }, + { + "epoch": 0.3795939369625508, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 43525 + }, + { + "epoch": 0.3796026582477194, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 43526 + }, + { + "epoch": 0.379611379532888, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 43527 + }, + { + "epoch": 0.3796201008180565, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 43528 + }, + { + "epoch": 0.3796288221032251, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 43529 + }, + { + "epoch": 0.37963754338839373, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 43530 + }, + { + "epoch": 0.3796462646735623, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 43531 + }, + { + "epoch": 0.3796549859587309, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 43532 + }, + { + "epoch": 0.3796637072438995, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 43533 + }, + { + "epoch": 0.379672428529068, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 43534 + }, + { + "epoch": 0.3796811498142366, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 43535 + }, + { + "epoch": 0.3796898710994052, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 43536 + }, + { + "epoch": 0.37969859238457376, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 43537 + }, + { + "epoch": 0.37970731366974236, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 43538 + }, + { + "epoch": 0.37971603495491096, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 43539 + }, + { + "epoch": 0.37972475624007956, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 43540 + }, + { + "epoch": 0.3797334775252481, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 43541 + }, + { + "epoch": 0.3797421988104167, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 43542 + }, + { + "epoch": 0.3797509200955853, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 43543 + }, + { + "epoch": 0.37975964138075385, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 43544 + }, + { + "epoch": 0.37976836266592245, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 43545 + }, + { + "epoch": 0.37977708395109105, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 43546 + }, + { + "epoch": 0.3797858052362596, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 43547 + }, + { + "epoch": 0.3797945265214282, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 43548 + }, + { + "epoch": 0.3798032478065968, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 43549 + }, + { + "epoch": 0.37981196909176534, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 43550 + }, + { + "epoch": 0.37982069037693394, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 43551 + }, + { + "epoch": 0.37982941166210255, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 43552 + }, + { + "epoch": 0.3798381329472711, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 43553 + }, + { + "epoch": 0.3798468542324397, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 43554 + }, + { + "epoch": 0.3798555755176083, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 43555 + }, + { + "epoch": 0.37986429680277684, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 43556 + }, + { + "epoch": 0.37987301808794544, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 43557 + }, + { + "epoch": 0.37988173937311404, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 43558 + }, + { + "epoch": 0.3798904606582826, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 43559 + }, + { + "epoch": 0.3798991819434512, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 43560 + }, + { + "epoch": 0.3799079032286198, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 43561 + }, + { + "epoch": 0.3799166245137883, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9698, + "step": 43562 + }, + { + "epoch": 0.3799253457989569, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 43563 + }, + { + "epoch": 0.3799340670841255, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 43564 + }, + { + "epoch": 0.3799427883692941, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 43565 + }, + { + "epoch": 0.37995150965446267, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 43566 + }, + { + "epoch": 0.37996023093963127, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 43567 + }, + { + "epoch": 0.37996895222479987, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 43568 + }, + { + "epoch": 0.3799776735099684, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 43569 + }, + { + "epoch": 0.379986394795137, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 43570 + }, + { + "epoch": 0.3799951160803056, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 43571 + }, + { + "epoch": 0.38000383736547416, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 43572 + }, + { + "epoch": 0.38001255865064276, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 43573 + }, + { + "epoch": 0.38002127993581136, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 43574 + }, + { + "epoch": 0.3800300012209799, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 43575 + }, + { + "epoch": 0.3800387225061485, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 43576 + }, + { + "epoch": 0.3800474437913171, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 43577 + }, + { + "epoch": 0.38005616507648565, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 43578 + }, + { + "epoch": 0.38006488636165425, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 43579 + }, + { + "epoch": 0.38007360764682285, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 43580 + }, + { + "epoch": 0.3800823289319914, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 43581 + }, + { + "epoch": 0.38009105021716, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 43582 + }, + { + "epoch": 0.3800997715023286, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 43583 + }, + { + "epoch": 0.38010849278749714, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 43584 + }, + { + "epoch": 0.38011721407266574, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 43585 + }, + { + "epoch": 0.38012593535783434, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 43586 + }, + { + "epoch": 0.3801346566430029, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 43587 + }, + { + "epoch": 0.3801433779281715, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9827, + "step": 43588 + }, + { + "epoch": 0.3801520992133401, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 43589 + }, + { + "epoch": 0.38016082049850863, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9822, + "step": 43590 + }, + { + "epoch": 0.38016954178367723, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 43591 + }, + { + "epoch": 0.38017826306884583, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 43592 + }, + { + "epoch": 0.38018698435401443, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 43593 + }, + { + "epoch": 0.380195705639183, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 43594 + }, + { + "epoch": 0.3802044269243516, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 43595 + }, + { + "epoch": 0.3802131482095202, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.985, + "step": 43596 + }, + { + "epoch": 0.3802218694946887, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 43597 + }, + { + "epoch": 0.3802305907798573, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 43598 + }, + { + "epoch": 0.3802393120650259, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 43599 + }, + { + "epoch": 0.38024803335019447, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 43600 + }, + { + "epoch": 0.38025675463536307, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 43601 + }, + { + "epoch": 0.38026547592053167, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 43602 + }, + { + "epoch": 0.3802741972057002, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 43603 + }, + { + "epoch": 0.3802829184908688, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 43604 + }, + { + "epoch": 0.3802916397760374, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 43605 + }, + { + "epoch": 0.38030036106120596, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 43606 + }, + { + "epoch": 0.38030908234637456, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 43607 + }, + { + "epoch": 0.38031780363154316, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 43608 + }, + { + "epoch": 0.3803265249167117, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 43609 + }, + { + "epoch": 0.3803352462018803, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 43610 + }, + { + "epoch": 0.3803439674870489, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 43611 + }, + { + "epoch": 0.38035268877221745, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 43612 + }, + { + "epoch": 0.38036141005738605, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 43613 + }, + { + "epoch": 0.38037013134255465, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 43614 + }, + { + "epoch": 0.3803788526277232, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 43615 + }, + { + "epoch": 0.3803875739128918, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 43616 + }, + { + "epoch": 0.3803962951980604, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 43617 + }, + { + "epoch": 0.38040501648322894, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9853, + "step": 43618 + }, + { + "epoch": 0.38041373776839754, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 43619 + }, + { + "epoch": 0.38042245905356614, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 43620 + }, + { + "epoch": 0.38043118033873474, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 43621 + }, + { + "epoch": 0.3804399016239033, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 43622 + }, + { + "epoch": 0.3804486229090719, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 43623 + }, + { + "epoch": 0.3804573441942405, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 43624 + }, + { + "epoch": 0.38046606547940903, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 43625 + }, + { + "epoch": 0.38047478676457763, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 43626 + }, + { + "epoch": 0.38048350804974623, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 43627 + }, + { + "epoch": 0.3804922293349148, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 43628 + }, + { + "epoch": 0.3805009506200834, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 43629 + }, + { + "epoch": 0.380509671905252, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 43630 + }, + { + "epoch": 0.3805183931904205, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 43631 + }, + { + "epoch": 0.3805271144755891, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 43632 + }, + { + "epoch": 0.3805358357607577, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 43633 + }, + { + "epoch": 0.38054455704592627, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 43634 + }, + { + "epoch": 0.38055327833109487, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 43635 + }, + { + "epoch": 0.38056199961626347, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 43636 + }, + { + "epoch": 0.380570720901432, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 43637 + }, + { + "epoch": 0.3805794421866006, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 43638 + }, + { + "epoch": 0.3805881634717692, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 43639 + }, + { + "epoch": 0.38059688475693776, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 43640 + }, + { + "epoch": 0.38060560604210636, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 43641 + }, + { + "epoch": 0.38061432732727496, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 43642 + }, + { + "epoch": 0.3806230486124435, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 43643 + }, + { + "epoch": 0.3806317698976121, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 43644 + }, + { + "epoch": 0.3806404911827807, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 43645 + }, + { + "epoch": 0.38064921246794925, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 43646 + }, + { + "epoch": 0.38065793375311785, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 43647 + }, + { + "epoch": 0.38066665503828645, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 43648 + }, + { + "epoch": 0.38067537632345505, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 43649 + }, + { + "epoch": 0.3806840976086236, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 43650 + }, + { + "epoch": 0.3806928188937922, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 43651 + }, + { + "epoch": 0.3807015401789608, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9857, + "step": 43652 + }, + { + "epoch": 0.38071026146412934, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 43653 + }, + { + "epoch": 0.38071898274929794, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 43654 + }, + { + "epoch": 0.38072770403446654, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 43655 + }, + { + "epoch": 0.3807364253196351, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 43656 + }, + { + "epoch": 0.3807451466048037, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 43657 + }, + { + "epoch": 0.3807538678899723, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 43658 + }, + { + "epoch": 0.38076258917514083, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 43659 + }, + { + "epoch": 0.38077131046030943, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 43660 + }, + { + "epoch": 0.38078003174547803, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 43661 + }, + { + "epoch": 0.3807887530306466, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 43662 + }, + { + "epoch": 0.3807974743158152, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 43663 + }, + { + "epoch": 0.3808061956009838, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 43664 + }, + { + "epoch": 0.3808149168861523, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 43665 + }, + { + "epoch": 0.3808236381713209, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 43666 + }, + { + "epoch": 0.3808323594564895, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 43667 + }, + { + "epoch": 0.38084108074165807, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 43668 + }, + { + "epoch": 0.38084980202682667, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 43669 + }, + { + "epoch": 0.38085852331199527, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 43670 + }, + { + "epoch": 0.3808672445971638, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 43671 + }, + { + "epoch": 0.3808759658823324, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 43672 + }, + { + "epoch": 0.380884687167501, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 43673 + }, + { + "epoch": 0.3808934084526696, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 43674 + }, + { + "epoch": 0.38090212973783816, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 43675 + }, + { + "epoch": 0.38091085102300676, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 43676 + }, + { + "epoch": 0.38091957230817536, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 43677 + }, + { + "epoch": 0.3809282935933439, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 43678 + }, + { + "epoch": 0.3809370148785125, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 43679 + }, + { + "epoch": 0.3809457361636811, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 43680 + }, + { + "epoch": 0.38095445744884965, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 43681 + }, + { + "epoch": 0.38096317873401825, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 43682 + }, + { + "epoch": 0.38097190001918685, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 43683 + }, + { + "epoch": 0.3809806213043554, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 43684 + }, + { + "epoch": 0.380989342589524, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 43685 + }, + { + "epoch": 0.3809980638746926, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 43686 + }, + { + "epoch": 0.38100678515986114, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 43687 + }, + { + "epoch": 0.38101550644502974, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 43688 + }, + { + "epoch": 0.38102422773019834, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 43689 + }, + { + "epoch": 0.3810329490153669, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9735, + "step": 43690 + }, + { + "epoch": 0.3810416703005355, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 43691 + }, + { + "epoch": 0.3810503915857041, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 43692 + }, + { + "epoch": 0.38105911287087263, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 43693 + }, + { + "epoch": 0.38106783415604123, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 43694 + }, + { + "epoch": 0.38107655544120983, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 43695 + }, + { + "epoch": 0.3810852767263784, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9841, + "step": 43696 + }, + { + "epoch": 0.381093998011547, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 43697 + }, + { + "epoch": 0.3811027192967156, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 43698 + }, + { + "epoch": 0.3811114405818841, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 43699 + }, + { + "epoch": 0.3811201618670527, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9848, + "step": 43700 + }, + { + "epoch": 0.3811288831522213, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 43701 + }, + { + "epoch": 0.3811376044373899, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 43702 + }, + { + "epoch": 0.38114632572255847, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 43703 + }, + { + "epoch": 0.38115504700772707, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 43704 + }, + { + "epoch": 0.38116376829289567, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 43705 + }, + { + "epoch": 0.3811724895780642, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 43706 + }, + { + "epoch": 0.3811812108632328, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 43707 + }, + { + "epoch": 0.3811899321484014, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 0.9832, + "step": 43708 + }, + { + "epoch": 0.38119865343356996, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 43709 + }, + { + "epoch": 0.38120737471873856, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 43710 + }, + { + "epoch": 0.38121609600390716, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 43711 + }, + { + "epoch": 0.3812248172890757, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 43712 + }, + { + "epoch": 0.3812335385742443, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 43713 + }, + { + "epoch": 0.3812422598594129, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 43714 + }, + { + "epoch": 0.38125098114458145, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 43715 + }, + { + "epoch": 0.38125970242975005, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 43716 + }, + { + "epoch": 0.38126842371491865, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 43717 + }, + { + "epoch": 0.3812771450000872, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 43718 + }, + { + "epoch": 0.3812858662852558, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 43719 + }, + { + "epoch": 0.3812945875704244, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 43720 + }, + { + "epoch": 0.38130330885559294, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 43721 + }, + { + "epoch": 0.38131203014076154, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 43722 + }, + { + "epoch": 0.38132075142593014, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 43723 + }, + { + "epoch": 0.3813294727110987, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 43724 + }, + { + "epoch": 0.3813381939962673, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 43725 + }, + { + "epoch": 0.3813469152814359, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 43726 + }, + { + "epoch": 0.38135563656660443, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 43727 + }, + { + "epoch": 0.38136435785177303, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 43728 + }, + { + "epoch": 0.38137307913694163, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 43729 + }, + { + "epoch": 0.38138180042211023, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 43730 + }, + { + "epoch": 0.3813905217072788, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 43731 + }, + { + "epoch": 0.3813992429924474, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 43732 + }, + { + "epoch": 0.381407964277616, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 43733 + }, + { + "epoch": 0.3814166855627845, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 43734 + }, + { + "epoch": 0.3814254068479531, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 43735 + }, + { + "epoch": 0.3814341281331217, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 43736 + }, + { + "epoch": 0.38144284941829026, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 43737 + }, + { + "epoch": 0.38145157070345886, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 43738 + }, + { + "epoch": 0.38146029198862746, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 43739 + }, + { + "epoch": 0.381469013273796, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 43740 + }, + { + "epoch": 0.3814777345589646, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 43741 + }, + { + "epoch": 0.3814864558441332, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 43742 + }, + { + "epoch": 0.38149517712930175, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9767, + "step": 43743 + }, + { + "epoch": 0.38150389841447035, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 43744 + }, + { + "epoch": 0.38151261969963896, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 43745 + }, + { + "epoch": 0.3815213409848075, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 43746 + }, + { + "epoch": 0.3815300622699761, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 43747 + }, + { + "epoch": 0.3815387835551447, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 43748 + }, + { + "epoch": 0.38154750484031325, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9818, + "step": 43749 + }, + { + "epoch": 0.38155622612548185, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 43750 + }, + { + "epoch": 0.38156494741065045, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9805, + "step": 43751 + }, + { + "epoch": 0.381573668695819, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 43752 + }, + { + "epoch": 0.3815823899809876, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 43753 + }, + { + "epoch": 0.3815911112661562, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 43754 + }, + { + "epoch": 0.38159983255132474, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 43755 + }, + { + "epoch": 0.38160855383649334, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 43756 + }, + { + "epoch": 0.38161727512166194, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 43757 + }, + { + "epoch": 0.38162599640683054, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 43758 + }, + { + "epoch": 0.3816347176919991, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 43759 + }, + { + "epoch": 0.3816434389771677, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 43760 + }, + { + "epoch": 0.3816521602623363, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 43761 + }, + { + "epoch": 0.3816608815475048, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9737, + "step": 43762 + }, + { + "epoch": 0.3816696028326734, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 43763 + }, + { + "epoch": 0.381678324117842, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 43764 + }, + { + "epoch": 0.38168704540301057, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 43765 + }, + { + "epoch": 0.38169576668817917, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 43766 + }, + { + "epoch": 0.3817044879733478, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9864, + "step": 43767 + }, + { + "epoch": 0.3817132092585163, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 43768 + }, + { + "epoch": 0.3817219305436849, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 43769 + }, + { + "epoch": 0.3817306518288535, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 43770 + }, + { + "epoch": 0.38173937311402206, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 43771 + }, + { + "epoch": 0.38174809439919066, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 43772 + }, + { + "epoch": 0.38175681568435926, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 43773 + }, + { + "epoch": 0.3817655369695278, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 43774 + }, + { + "epoch": 0.3817742582546964, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 43775 + }, + { + "epoch": 0.381782979539865, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 43776 + }, + { + "epoch": 0.38179170082503355, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 43777 + }, + { + "epoch": 0.38180042211020215, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 43778 + }, + { + "epoch": 0.38180914339537075, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 43779 + }, + { + "epoch": 0.3818178646805393, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 43780 + }, + { + "epoch": 0.3818265859657079, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 43781 + }, + { + "epoch": 0.3818353072508765, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 43782 + }, + { + "epoch": 0.3818440285360451, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 43783 + }, + { + "epoch": 0.38185274982121364, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.985, + "step": 43784 + }, + { + "epoch": 0.38186147110638224, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 43785 + }, + { + "epoch": 0.38187019239155084, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 43786 + }, + { + "epoch": 0.3818789136767194, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 43787 + }, + { + "epoch": 0.381887634961888, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 43788 + }, + { + "epoch": 0.3818963562470566, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 43789 + }, + { + "epoch": 0.38190507753222513, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 43790 + }, + { + "epoch": 0.38191379881739373, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 43791 + }, + { + "epoch": 0.38192252010256234, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 43792 + }, + { + "epoch": 0.3819312413877309, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 43793 + }, + { + "epoch": 0.3819399626728995, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 43794 + }, + { + "epoch": 0.3819486839580681, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 43795 + }, + { + "epoch": 0.3819574052432366, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 43796 + }, + { + "epoch": 0.3819661265284052, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 43797 + }, + { + "epoch": 0.3819748478135738, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 43798 + }, + { + "epoch": 0.38198356909874237, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 43799 + }, + { + "epoch": 0.38199229038391097, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 43800 + }, + { + "epoch": 0.38200101166907957, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 43801 + }, + { + "epoch": 0.3820097329542481, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 43802 + }, + { + "epoch": 0.3820184542394167, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 43803 + }, + { + "epoch": 0.3820271755245853, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 43804 + }, + { + "epoch": 0.38203589680975386, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 43805 + }, + { + "epoch": 0.38204461809492246, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 43806 + }, + { + "epoch": 0.38205333938009106, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 43807 + }, + { + "epoch": 0.3820620606652596, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 43808 + }, + { + "epoch": 0.3820707819504282, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 43809 + }, + { + "epoch": 0.3820795032355968, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 43810 + }, + { + "epoch": 0.3820882245207654, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 43811 + }, + { + "epoch": 0.38209694580593395, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 43812 + }, + { + "epoch": 0.38210566709110255, + "grad_norm": 0.072265625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 43813 + }, + { + "epoch": 0.38211438837627115, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 43814 + }, + { + "epoch": 0.3821231096614397, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 43815 + }, + { + "epoch": 0.3821318309466083, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 43816 + }, + { + "epoch": 0.3821405522317769, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 43817 + }, + { + "epoch": 0.38214927351694544, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 43818 + }, + { + "epoch": 0.38215799480211404, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 43819 + }, + { + "epoch": 0.38216671608728264, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 43820 + }, + { + "epoch": 0.3821754373724512, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 43821 + }, + { + "epoch": 0.3821841586576198, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 43822 + }, + { + "epoch": 0.3821928799427884, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9831, + "step": 43823 + }, + { + "epoch": 0.38220160122795693, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 43824 + }, + { + "epoch": 0.38221032251312553, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 43825 + }, + { + "epoch": 0.38221904379829413, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 43826 + }, + { + "epoch": 0.3822277650834627, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9774, + "step": 43827 + }, + { + "epoch": 0.3822364863686313, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 43828 + }, + { + "epoch": 0.3822452076537999, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 43829 + }, + { + "epoch": 0.3822539289389684, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 43830 + }, + { + "epoch": 0.382262650224137, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 43831 + }, + { + "epoch": 0.3822713715093056, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 43832 + }, + { + "epoch": 0.38228009279447417, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 43833 + }, + { + "epoch": 0.38228881407964277, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 43834 + }, + { + "epoch": 0.38229753536481137, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 43835 + }, + { + "epoch": 0.3823062566499799, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 43836 + }, + { + "epoch": 0.3823149779351485, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 43837 + }, + { + "epoch": 0.3823236992203171, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 43838 + }, + { + "epoch": 0.3823324205054857, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 0.9854, + "step": 43839 + }, + { + "epoch": 0.38234114179065426, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 43840 + }, + { + "epoch": 0.38234986307582286, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 43841 + }, + { + "epoch": 0.38235858436099146, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 43842 + }, + { + "epoch": 0.38236730564616, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 43843 + }, + { + "epoch": 0.3823760269313286, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 43844 + }, + { + "epoch": 0.3823847482164972, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 43845 + }, + { + "epoch": 0.38239346950166575, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 43846 + }, + { + "epoch": 0.38240219078683435, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9845, + "step": 43847 + }, + { + "epoch": 0.38241091207200295, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 43848 + }, + { + "epoch": 0.3824196333571715, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 43849 + }, + { + "epoch": 0.3824283546423401, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 43850 + }, + { + "epoch": 0.3824370759275087, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 43851 + }, + { + "epoch": 0.38244579721267724, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 43852 + }, + { + "epoch": 0.38245451849784584, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 43853 + }, + { + "epoch": 0.38246323978301444, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 43854 + }, + { + "epoch": 0.382471961068183, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 43855 + }, + { + "epoch": 0.3824806823533516, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 43856 + }, + { + "epoch": 0.3824894036385202, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 43857 + }, + { + "epoch": 0.38249812492368873, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 43858 + }, + { + "epoch": 0.38250684620885733, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 43859 + }, + { + "epoch": 0.38251556749402593, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 43860 + }, + { + "epoch": 0.3825242887791945, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 43861 + }, + { + "epoch": 0.3825330100643631, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 43862 + }, + { + "epoch": 0.3825417313495317, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 43863 + }, + { + "epoch": 0.3825504526347002, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 43864 + }, + { + "epoch": 0.3825591739198688, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 43865 + }, + { + "epoch": 0.3825678952050374, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 43866 + }, + { + "epoch": 0.382576616490206, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 43867 + }, + { + "epoch": 0.38258533777537457, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 43868 + }, + { + "epoch": 0.38259405906054317, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 43869 + }, + { + "epoch": 0.38260278034571177, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 43870 + }, + { + "epoch": 0.3826115016308803, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 43871 + }, + { + "epoch": 0.3826202229160489, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 43872 + }, + { + "epoch": 0.3826289442012175, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 43873 + }, + { + "epoch": 0.38263766548638606, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 43874 + }, + { + "epoch": 0.38264638677155466, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 43875 + }, + { + "epoch": 0.38265510805672326, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 43876 + }, + { + "epoch": 0.3826638293418918, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 43877 + }, + { + "epoch": 0.3826725506270604, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 43878 + }, + { + "epoch": 0.382681271912229, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 43879 + }, + { + "epoch": 0.38268999319739755, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 43880 + }, + { + "epoch": 0.38269871448256615, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 43881 + }, + { + "epoch": 0.38270743576773475, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 43882 + }, + { + "epoch": 0.3827161570529033, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 43883 + }, + { + "epoch": 0.3827248783380719, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 43884 + }, + { + "epoch": 0.3827335996232405, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 43885 + }, + { + "epoch": 0.38274232090840904, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 43886 + }, + { + "epoch": 0.38275104219357764, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 43887 + }, + { + "epoch": 0.38275976347874624, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 43888 + }, + { + "epoch": 0.3827684847639148, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 43889 + }, + { + "epoch": 0.3827772060490834, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 43890 + }, + { + "epoch": 0.382785927334252, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 43891 + }, + { + "epoch": 0.3827946486194206, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 43892 + }, + { + "epoch": 0.38280336990458913, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 43893 + }, + { + "epoch": 0.38281209118975773, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 43894 + }, + { + "epoch": 0.38282081247492633, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 43895 + }, + { + "epoch": 0.3828295337600949, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 43896 + }, + { + "epoch": 0.3828382550452635, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 43897 + }, + { + "epoch": 0.3828469763304321, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 43898 + }, + { + "epoch": 0.3828556976156006, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 43899 + }, + { + "epoch": 0.3828644189007692, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 43900 + }, + { + "epoch": 0.3828731401859378, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 43901 + }, + { + "epoch": 0.38288186147110637, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 43902 + }, + { + "epoch": 0.38289058275627497, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 43903 + }, + { + "epoch": 0.38289930404144357, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 43904 + }, + { + "epoch": 0.3829080253266121, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 43905 + }, + { + "epoch": 0.3829167466117807, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 43906 + }, + { + "epoch": 0.3829254678969493, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 43907 + }, + { + "epoch": 0.38293418918211786, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 43908 + }, + { + "epoch": 0.38294291046728646, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 43909 + }, + { + "epoch": 0.38295163175245506, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 43910 + }, + { + "epoch": 0.3829603530376236, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 43911 + }, + { + "epoch": 0.3829690743227922, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 43912 + }, + { + "epoch": 0.3829777956079608, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 43913 + }, + { + "epoch": 0.38298651689312935, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 43914 + }, + { + "epoch": 0.38299523817829795, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 43915 + }, + { + "epoch": 0.38300395946346655, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 43916 + }, + { + "epoch": 0.3830126807486351, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 43917 + }, + { + "epoch": 0.3830214020338037, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 43918 + }, + { + "epoch": 0.3830301233189723, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 43919 + }, + { + "epoch": 0.3830388446041409, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 43920 + }, + { + "epoch": 0.38304756588930944, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 43921 + }, + { + "epoch": 0.38305628717447804, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 43922 + }, + { + "epoch": 0.38306500845964664, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 43923 + }, + { + "epoch": 0.3830737297448152, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 43924 + }, + { + "epoch": 0.3830824510299838, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 43925 + }, + { + "epoch": 0.3830911723151524, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 43926 + }, + { + "epoch": 0.38309989360032093, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 43927 + }, + { + "epoch": 0.38310861488548953, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 43928 + }, + { + "epoch": 0.38311733617065813, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 43929 + }, + { + "epoch": 0.3831260574558267, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 43930 + }, + { + "epoch": 0.3831347787409953, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 43931 + }, + { + "epoch": 0.3831435000261639, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 43932 + }, + { + "epoch": 0.3831522213113324, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 43933 + }, + { + "epoch": 0.383160942596501, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 43934 + }, + { + "epoch": 0.3831696638816696, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 43935 + }, + { + "epoch": 0.38317838516683816, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 43936 + }, + { + "epoch": 0.38318710645200676, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 43937 + }, + { + "epoch": 0.38319582773717537, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 43938 + }, + { + "epoch": 0.3832045490223439, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 43939 + }, + { + "epoch": 0.3832132703075125, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 43940 + }, + { + "epoch": 0.3832219915926811, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 43941 + }, + { + "epoch": 0.38323071287784966, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 43942 + }, + { + "epoch": 0.38323943416301826, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 43943 + }, + { + "epoch": 0.38324815544818686, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 43944 + }, + { + "epoch": 0.3832568767333554, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 43945 + }, + { + "epoch": 0.383265598018524, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 43946 + }, + { + "epoch": 0.3832743193036926, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 43947 + }, + { + "epoch": 0.3832830405888612, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 43948 + }, + { + "epoch": 0.38329176187402975, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 43949 + }, + { + "epoch": 0.38330048315919835, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 43950 + }, + { + "epoch": 0.38330920444436695, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 43951 + }, + { + "epoch": 0.3833179257295355, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 43952 + }, + { + "epoch": 0.3833266470147041, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 43953 + }, + { + "epoch": 0.3833353682998727, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 43954 + }, + { + "epoch": 0.38334408958504124, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 43955 + }, + { + "epoch": 0.38335281087020984, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 43956 + }, + { + "epoch": 0.38336153215537844, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 43957 + }, + { + "epoch": 0.383370253440547, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 43958 + }, + { + "epoch": 0.3833789747257156, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 43959 + }, + { + "epoch": 0.3833876960108842, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 43960 + }, + { + "epoch": 0.3833964172960527, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 43961 + }, + { + "epoch": 0.3834051385812213, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 43962 + }, + { + "epoch": 0.38341385986638993, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9766, + "step": 43963 + }, + { + "epoch": 0.3834225811515585, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 43964 + }, + { + "epoch": 0.3834313024367271, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 43965 + }, + { + "epoch": 0.3834400237218957, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 43966 + }, + { + "epoch": 0.3834487450070642, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 43967 + }, + { + "epoch": 0.3834574662922328, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 43968 + }, + { + "epoch": 0.3834661875774014, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 43969 + }, + { + "epoch": 0.38347490886256996, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 43970 + }, + { + "epoch": 0.38348363014773856, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 43971 + }, + { + "epoch": 0.38349235143290716, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 43972 + }, + { + "epoch": 0.3835010727180757, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 43973 + }, + { + "epoch": 0.3835097940032443, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 43974 + }, + { + "epoch": 0.3835185152884129, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 43975 + }, + { + "epoch": 0.3835272365735815, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 43976 + }, + { + "epoch": 0.38353595785875005, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 43977 + }, + { + "epoch": 0.38354467914391865, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 43978 + }, + { + "epoch": 0.38355340042908725, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 43979 + }, + { + "epoch": 0.3835621217142558, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 43980 + }, + { + "epoch": 0.3835708429994244, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 43981 + }, + { + "epoch": 0.383579564284593, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 43982 + }, + { + "epoch": 0.38358828556976154, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 43983 + }, + { + "epoch": 0.38359700685493014, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 43984 + }, + { + "epoch": 0.38360572814009875, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 43985 + }, + { + "epoch": 0.3836144494252673, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 43986 + }, + { + "epoch": 0.3836231707104359, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 43987 + }, + { + "epoch": 0.3836318919956045, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 43988 + }, + { + "epoch": 0.38364061328077304, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 43989 + }, + { + "epoch": 0.38364933456594164, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 43990 + }, + { + "epoch": 0.38365805585111024, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 43991 + }, + { + "epoch": 0.3836667771362788, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 43992 + }, + { + "epoch": 0.3836754984214474, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 43993 + }, + { + "epoch": 0.383684219706616, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 43994 + }, + { + "epoch": 0.3836929409917845, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 43995 + }, + { + "epoch": 0.3837016622769531, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 43996 + }, + { + "epoch": 0.3837103835621217, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 43997 + }, + { + "epoch": 0.38371910484729027, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 43998 + }, + { + "epoch": 0.38372782613245887, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 43999 + }, + { + "epoch": 0.38373654741762747, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 44000 + }, + { + "epoch": 0.383745268702796, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 44001 + }, + { + "epoch": 0.3837539899879646, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 44002 + }, + { + "epoch": 0.3837627112731332, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 44003 + }, + { + "epoch": 0.3837714325583018, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 44004 + }, + { + "epoch": 0.38378015384347036, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 44005 + }, + { + "epoch": 0.38378887512863896, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 44006 + }, + { + "epoch": 0.38379759641380756, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 44007 + }, + { + "epoch": 0.3838063176989761, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 44008 + }, + { + "epoch": 0.3838150389841447, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 44009 + }, + { + "epoch": 0.3838237602693133, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 44010 + }, + { + "epoch": 0.38383248155448185, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 44011 + }, + { + "epoch": 0.38384120283965045, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 44012 + }, + { + "epoch": 0.38384992412481905, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 44013 + }, + { + "epoch": 0.3838586454099876, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 44014 + }, + { + "epoch": 0.3838673666951562, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 44015 + }, + { + "epoch": 0.3838760879803248, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 44016 + }, + { + "epoch": 0.38388480926549334, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 44017 + }, + { + "epoch": 0.38389353055066194, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 44018 + }, + { + "epoch": 0.38390225183583054, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 44019 + }, + { + "epoch": 0.3839109731209991, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 44020 + }, + { + "epoch": 0.3839196944061677, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 44021 + }, + { + "epoch": 0.3839284156913363, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 44022 + }, + { + "epoch": 0.38393713697650483, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 44023 + }, + { + "epoch": 0.38394585826167343, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 44024 + }, + { + "epoch": 0.38395457954684203, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 44025 + }, + { + "epoch": 0.3839633008320106, + "grad_norm": 0.07275390625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 44026 + }, + { + "epoch": 0.3839720221171792, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 44027 + }, + { + "epoch": 0.3839807434023478, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 44028 + }, + { + "epoch": 0.3839894646875164, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 44029 + }, + { + "epoch": 0.3839981859726849, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 44030 + }, + { + "epoch": 0.3840069072578535, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 44031 + }, + { + "epoch": 0.3840156285430221, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 44032 + }, + { + "epoch": 0.38402434982819067, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 44033 + }, + { + "epoch": 0.38403307111335927, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 44034 + }, + { + "epoch": 0.38404179239852787, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 44035 + }, + { + "epoch": 0.3840505136836964, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 44036 + }, + { + "epoch": 0.384059234968865, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 44037 + }, + { + "epoch": 0.3840679562540336, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 44038 + }, + { + "epoch": 0.38407667753920216, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 44039 + }, + { + "epoch": 0.38408539882437076, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 44040 + }, + { + "epoch": 0.38409412010953936, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 44041 + }, + { + "epoch": 0.3841028413947079, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 44042 + }, + { + "epoch": 0.3841115626798765, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 44043 + }, + { + "epoch": 0.3841202839650451, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 44044 + }, + { + "epoch": 0.38412900525021365, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 44045 + }, + { + "epoch": 0.38413772653538225, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 44046 + }, + { + "epoch": 0.38414644782055085, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 44047 + }, + { + "epoch": 0.3841551691057194, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 44048 + }, + { + "epoch": 0.384163890390888, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 44049 + }, + { + "epoch": 0.3841726116760566, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 44050 + }, + { + "epoch": 0.38418133296122514, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 44051 + }, + { + "epoch": 0.38419005424639374, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 0.9783, + "step": 44052 + }, + { + "epoch": 0.38419877553156234, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.985, + "step": 44053 + }, + { + "epoch": 0.3842074968167309, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 44054 + }, + { + "epoch": 0.3842162181018995, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 44055 + }, + { + "epoch": 0.3842249393870681, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 44056 + }, + { + "epoch": 0.3842336606722367, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 44057 + }, + { + "epoch": 0.38424238195740523, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 44058 + }, + { + "epoch": 0.38425110324257383, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 44059 + }, + { + "epoch": 0.38425982452774243, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 44060 + }, + { + "epoch": 0.384268545812911, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 44061 + }, + { + "epoch": 0.3842772670980796, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 44062 + }, + { + "epoch": 0.3842859883832482, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 44063 + }, + { + "epoch": 0.3842947096684167, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 44064 + }, + { + "epoch": 0.3843034309535853, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 44065 + }, + { + "epoch": 0.3843121522387539, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 44066 + }, + { + "epoch": 0.38432087352392247, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 44067 + }, + { + "epoch": 0.38432959480909107, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 44068 + }, + { + "epoch": 0.38433831609425967, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 44069 + }, + { + "epoch": 0.3843470373794282, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 44070 + }, + { + "epoch": 0.3843557586645968, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 44071 + }, + { + "epoch": 0.3843644799497654, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 44072 + }, + { + "epoch": 0.38437320123493396, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 44073 + }, + { + "epoch": 0.38438192252010256, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 44074 + }, + { + "epoch": 0.38439064380527116, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 44075 + }, + { + "epoch": 0.3843993650904397, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 44076 + }, + { + "epoch": 0.3844080863756083, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 44077 + }, + { + "epoch": 0.3844168076607769, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 44078 + }, + { + "epoch": 0.38442552894594545, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 44079 + }, + { + "epoch": 0.38443425023111405, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 44080 + }, + { + "epoch": 0.38444297151628265, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 44081 + }, + { + "epoch": 0.3844516928014512, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.986, + "step": 44082 + }, + { + "epoch": 0.3844604140866198, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 44083 + }, + { + "epoch": 0.3844691353717884, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 44084 + }, + { + "epoch": 0.384477856656957, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 44085 + }, + { + "epoch": 0.38448657794212554, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 44086 + }, + { + "epoch": 0.38449529922729414, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 44087 + }, + { + "epoch": 0.38450402051246274, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 44088 + }, + { + "epoch": 0.3845127417976313, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 44089 + }, + { + "epoch": 0.3845214630827999, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 44090 + }, + { + "epoch": 0.3845301843679685, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9729, + "step": 44091 + }, + { + "epoch": 0.38453890565313703, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 44092 + }, + { + "epoch": 0.38454762693830563, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 44093 + }, + { + "epoch": 0.38455634822347423, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 44094 + }, + { + "epoch": 0.3845650695086428, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 44095 + }, + { + "epoch": 0.3845737907938114, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 44096 + }, + { + "epoch": 0.38458251207898, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 44097 + }, + { + "epoch": 0.3845912333641485, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 44098 + }, + { + "epoch": 0.3845999546493171, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.98, + "step": 44099 + }, + { + "epoch": 0.3846086759344857, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 44100 + }, + { + "epoch": 0.38461739721965427, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 44101 + }, + { + "epoch": 0.38462611850482287, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 44102 + }, + { + "epoch": 0.38463483978999147, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 44103 + }, + { + "epoch": 0.38464356107516, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 44104 + }, + { + "epoch": 0.3846522823603286, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 44105 + }, + { + "epoch": 0.3846610036454972, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 44106 + }, + { + "epoch": 0.38466972493066576, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 44107 + }, + { + "epoch": 0.38467844621583436, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 44108 + }, + { + "epoch": 0.38468716750100296, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 44109 + }, + { + "epoch": 0.3846958887861715, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 44110 + }, + { + "epoch": 0.3847046100713401, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 44111 + }, + { + "epoch": 0.3847133313565087, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 44112 + }, + { + "epoch": 0.3847220526416773, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 44113 + }, + { + "epoch": 0.38473077392684585, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 44114 + }, + { + "epoch": 0.38473949521201445, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 44115 + }, + { + "epoch": 0.38474821649718305, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 44116 + }, + { + "epoch": 0.3847569377823516, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 44117 + }, + { + "epoch": 0.3847656590675202, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 44118 + }, + { + "epoch": 0.3847743803526888, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 44119 + }, + { + "epoch": 0.38478310163785734, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 44120 + }, + { + "epoch": 0.38479182292302594, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9851, + "step": 44121 + }, + { + "epoch": 0.38480054420819454, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 44122 + }, + { + "epoch": 0.3848092654933631, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 44123 + }, + { + "epoch": 0.3848179867785317, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 44124 + }, + { + "epoch": 0.3848267080637003, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 44125 + }, + { + "epoch": 0.38483542934886883, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 44126 + }, + { + "epoch": 0.38484415063403743, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 44127 + }, + { + "epoch": 0.38485287191920603, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 44128 + }, + { + "epoch": 0.3848615932043746, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 44129 + }, + { + "epoch": 0.3848703144895432, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 44130 + }, + { + "epoch": 0.3848790357747118, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 44131 + }, + { + "epoch": 0.3848877570598803, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 44132 + }, + { + "epoch": 0.3848964783450489, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 44133 + }, + { + "epoch": 0.3849051996302175, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 44134 + }, + { + "epoch": 0.38491392091538607, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 44135 + }, + { + "epoch": 0.38492264220055467, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 44136 + }, + { + "epoch": 0.38493136348572327, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 44137 + }, + { + "epoch": 0.38494008477089187, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 44138 + }, + { + "epoch": 0.3849488060560604, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 44139 + }, + { + "epoch": 0.384957527341229, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 44140 + }, + { + "epoch": 0.3849662486263976, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 44141 + }, + { + "epoch": 0.38497496991156616, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 44142 + }, + { + "epoch": 0.38498369119673476, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 44143 + }, + { + "epoch": 0.38499241248190336, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 44144 + }, + { + "epoch": 0.3850011337670719, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9781, + "step": 44145 + }, + { + "epoch": 0.3850098550522405, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 44146 + }, + { + "epoch": 0.3850185763374091, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 44147 + }, + { + "epoch": 0.38502729762257765, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 44148 + }, + { + "epoch": 0.38503601890774625, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 44149 + }, + { + "epoch": 0.38504474019291485, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 44150 + }, + { + "epoch": 0.3850534614780834, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9842, + "step": 44151 + }, + { + "epoch": 0.385062182763252, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 44152 + }, + { + "epoch": 0.3850709040484206, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 44153 + }, + { + "epoch": 0.38507962533358914, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 44154 + }, + { + "epoch": 0.38508834661875774, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 44155 + }, + { + "epoch": 0.38509706790392634, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 44156 + }, + { + "epoch": 0.3851057891890949, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 44157 + }, + { + "epoch": 0.3851145104742635, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 44158 + }, + { + "epoch": 0.3851232317594321, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 44159 + }, + { + "epoch": 0.38513195304460063, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 44160 + }, + { + "epoch": 0.38514067432976923, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 44161 + }, + { + "epoch": 0.38514939561493783, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 44162 + }, + { + "epoch": 0.3851581169001064, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 44163 + }, + { + "epoch": 0.385166838185275, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 44164 + }, + { + "epoch": 0.3851755594704436, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 44165 + }, + { + "epoch": 0.3851842807556122, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 44166 + }, + { + "epoch": 0.3851930020407807, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 44167 + }, + { + "epoch": 0.3852017233259493, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 44168 + }, + { + "epoch": 0.3852104446111179, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 44169 + }, + { + "epoch": 0.38521916589628646, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 44170 + }, + { + "epoch": 0.38522788718145506, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 44171 + }, + { + "epoch": 0.38523660846662366, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 44172 + }, + { + "epoch": 0.3852453297517922, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9843, + "step": 44173 + }, + { + "epoch": 0.3852540510369608, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9772, + "step": 44174 + }, + { + "epoch": 0.3852627723221294, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 44175 + }, + { + "epoch": 0.38527149360729795, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 44176 + }, + { + "epoch": 0.38528021489246655, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 44177 + }, + { + "epoch": 0.38528893617763516, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 44178 + }, + { + "epoch": 0.3852976574628037, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 44179 + }, + { + "epoch": 0.3853063787479723, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 44180 + }, + { + "epoch": 0.3853151000331409, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 44181 + }, + { + "epoch": 0.38532382131830945, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 44182 + }, + { + "epoch": 0.38533254260347805, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9768, + "step": 44183 + }, + { + "epoch": 0.38534126388864665, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 44184 + }, + { + "epoch": 0.3853499851738152, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 44185 + }, + { + "epoch": 0.3853587064589838, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 44186 + }, + { + "epoch": 0.3853674277441524, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 44187 + }, + { + "epoch": 0.38537614902932094, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 44188 + }, + { + "epoch": 0.38538487031448954, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 44189 + }, + { + "epoch": 0.38539359159965814, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 44190 + }, + { + "epoch": 0.3854023128848267, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 44191 + }, + { + "epoch": 0.3854110341699953, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 44192 + }, + { + "epoch": 0.3854197554551639, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 44193 + }, + { + "epoch": 0.3854284767403325, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 44194 + }, + { + "epoch": 0.385437198025501, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 44195 + }, + { + "epoch": 0.3854459193106696, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9797, + "step": 44196 + }, + { + "epoch": 0.3854546405958382, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 44197 + }, + { + "epoch": 0.38546336188100677, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 44198 + }, + { + "epoch": 0.38547208316617537, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 44199 + }, + { + "epoch": 0.385480804451344, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 44200 + }, + { + "epoch": 0.3854895257365125, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 44201 + }, + { + "epoch": 0.3854982470216811, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 44202 + }, + { + "epoch": 0.3855069683068497, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 44203 + }, + { + "epoch": 0.38551568959201826, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 44204 + }, + { + "epoch": 0.38552441087718686, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 44205 + }, + { + "epoch": 0.38553313216235546, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 44206 + }, + { + "epoch": 0.385541853447524, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 44207 + }, + { + "epoch": 0.3855505747326926, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 44208 + }, + { + "epoch": 0.3855592960178612, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 44209 + }, + { + "epoch": 0.38556801730302975, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 44210 + }, + { + "epoch": 0.38557673858819835, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 44211 + }, + { + "epoch": 0.38558545987336695, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 44212 + }, + { + "epoch": 0.3855941811585355, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 44213 + }, + { + "epoch": 0.3856029024437041, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 44214 + }, + { + "epoch": 0.3856116237288727, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 44215 + }, + { + "epoch": 0.38562034501404124, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 44216 + }, + { + "epoch": 0.38562906629920984, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 44217 + }, + { + "epoch": 0.38563778758437844, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 44218 + }, + { + "epoch": 0.385646508869547, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 44219 + }, + { + "epoch": 0.3856552301547156, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 44220 + }, + { + "epoch": 0.3856639514398842, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 44221 + }, + { + "epoch": 0.3856726727250528, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 44222 + }, + { + "epoch": 0.38568139401022133, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 44223 + }, + { + "epoch": 0.38569011529538993, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 44224 + }, + { + "epoch": 0.38569883658055854, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 44225 + }, + { + "epoch": 0.3857075578657271, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 44226 + }, + { + "epoch": 0.3857162791508957, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 44227 + }, + { + "epoch": 0.3857250004360643, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 44228 + }, + { + "epoch": 0.3857337217212328, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 44229 + }, + { + "epoch": 0.3857424430064014, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 44230 + }, + { + "epoch": 0.38575116429157, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 44231 + }, + { + "epoch": 0.38575988557673857, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 44232 + }, + { + "epoch": 0.38576860686190717, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 44233 + }, + { + "epoch": 0.38577732814707577, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 44234 + }, + { + "epoch": 0.3857860494322443, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 44235 + }, + { + "epoch": 0.3857947707174129, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 44236 + }, + { + "epoch": 0.3858034920025815, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 44237 + }, + { + "epoch": 0.38581221328775006, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 44238 + }, + { + "epoch": 0.38582093457291866, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 44239 + }, + { + "epoch": 0.38582965585808726, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 44240 + }, + { + "epoch": 0.3858383771432558, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 44241 + }, + { + "epoch": 0.3858470984284244, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 44242 + }, + { + "epoch": 0.385855819713593, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 44243 + }, + { + "epoch": 0.38586454099876155, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9738, + "step": 44244 + }, + { + "epoch": 0.38587326228393015, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 44245 + }, + { + "epoch": 0.38588198356909875, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 44246 + }, + { + "epoch": 0.38589070485426735, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 44247 + }, + { + "epoch": 0.3858994261394359, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 44248 + }, + { + "epoch": 0.3859081474246045, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 44249 + }, + { + "epoch": 0.3859168687097731, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 44250 + }, + { + "epoch": 0.38592558999494164, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 44251 + }, + { + "epoch": 0.38593431128011024, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 44252 + }, + { + "epoch": 0.38594303256527884, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9793, + "step": 44253 + }, + { + "epoch": 0.3859517538504474, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 44254 + }, + { + "epoch": 0.385960475135616, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 44255 + }, + { + "epoch": 0.3859691964207846, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 44256 + }, + { + "epoch": 0.38597791770595313, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 44257 + }, + { + "epoch": 0.38598663899112173, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 0.985, + "step": 44258 + }, + { + "epoch": 0.38599536027629033, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 44259 + }, + { + "epoch": 0.3860040815614589, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 44260 + }, + { + "epoch": 0.3860128028466275, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 44261 + }, + { + "epoch": 0.3860215241317961, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 44262 + }, + { + "epoch": 0.3860302454169646, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 44263 + }, + { + "epoch": 0.3860389667021332, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 44264 + }, + { + "epoch": 0.3860476879873018, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 44265 + }, + { + "epoch": 0.38605640927247037, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 44266 + }, + { + "epoch": 0.38606513055763897, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 44267 + }, + { + "epoch": 0.38607385184280757, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 44268 + }, + { + "epoch": 0.3860825731279761, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 44269 + }, + { + "epoch": 0.3860912944131447, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 44270 + }, + { + "epoch": 0.3861000156983133, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 44271 + }, + { + "epoch": 0.38610873698348186, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 44272 + }, + { + "epoch": 0.38611745826865046, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 44273 + }, + { + "epoch": 0.38612617955381906, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 44274 + }, + { + "epoch": 0.38613490083898766, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 44275 + }, + { + "epoch": 0.3861436221241562, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 44276 + }, + { + "epoch": 0.3861523434093248, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 44277 + }, + { + "epoch": 0.3861610646944934, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 44278 + }, + { + "epoch": 0.38616978597966195, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 44279 + }, + { + "epoch": 0.38617850726483055, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 44280 + }, + { + "epoch": 0.38618722854999915, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 44281 + }, + { + "epoch": 0.3861959498351677, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 44282 + }, + { + "epoch": 0.3862046711203363, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 44283 + }, + { + "epoch": 0.3862133924055049, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 44284 + }, + { + "epoch": 0.38622211369067344, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 44285 + }, + { + "epoch": 0.38623083497584204, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 44286 + }, + { + "epoch": 0.38623955626101064, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 44287 + }, + { + "epoch": 0.3862482775461792, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 44288 + }, + { + "epoch": 0.3862569988313478, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 44289 + }, + { + "epoch": 0.3862657201165164, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 44290 + }, + { + "epoch": 0.38627444140168493, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 44291 + }, + { + "epoch": 0.38628316268685353, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 44292 + }, + { + "epoch": 0.38629188397202213, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 44293 + }, + { + "epoch": 0.3863006052571907, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 44294 + }, + { + "epoch": 0.3863093265423593, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 44295 + }, + { + "epoch": 0.3863180478275279, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 44296 + }, + { + "epoch": 0.3863267691126964, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9813, + "step": 44297 + }, + { + "epoch": 0.386335490397865, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 44298 + }, + { + "epoch": 0.3863442116830336, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 44299 + }, + { + "epoch": 0.38635293296820217, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 44300 + }, + { + "epoch": 0.38636165425337077, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 44301 + }, + { + "epoch": 0.38637037553853937, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 44302 + }, + { + "epoch": 0.38637909682370797, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 44303 + }, + { + "epoch": 0.3863878181088765, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 44304 + }, + { + "epoch": 0.3863965393940451, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 44305 + }, + { + "epoch": 0.3864052606792137, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 44306 + }, + { + "epoch": 0.38641398196438226, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 44307 + }, + { + "epoch": 0.38642270324955086, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 44308 + }, + { + "epoch": 0.38643142453471946, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 44309 + }, + { + "epoch": 0.386440145819888, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 44310 + }, + { + "epoch": 0.3864488671050566, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 44311 + }, + { + "epoch": 0.3864575883902252, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 44312 + }, + { + "epoch": 0.38646630967539375, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 44313 + }, + { + "epoch": 0.38647503096056235, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 44314 + }, + { + "epoch": 0.38648375224573095, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 44315 + }, + { + "epoch": 0.3864924735308995, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 44316 + }, + { + "epoch": 0.3865011948160681, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 44317 + }, + { + "epoch": 0.3865099161012367, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 44318 + }, + { + "epoch": 0.38651863738640524, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 44319 + }, + { + "epoch": 0.38652735867157384, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 44320 + }, + { + "epoch": 0.38653607995674244, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 44321 + }, + { + "epoch": 0.386544801241911, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 44322 + }, + { + "epoch": 0.3865535225270796, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 44323 + }, + { + "epoch": 0.3865622438122482, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 44324 + }, + { + "epoch": 0.38657096509741673, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 44325 + }, + { + "epoch": 0.38657968638258533, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 44326 + }, + { + "epoch": 0.38658840766775393, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 44327 + }, + { + "epoch": 0.3865971289529225, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 44328 + }, + { + "epoch": 0.3866058502380911, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 44329 + }, + { + "epoch": 0.3866145715232597, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 44330 + }, + { + "epoch": 0.3866232928084283, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 44331 + }, + { + "epoch": 0.3866320140935968, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 44332 + }, + { + "epoch": 0.3866407353787654, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 44333 + }, + { + "epoch": 0.386649456663934, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 44334 + }, + { + "epoch": 0.38665817794910257, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 44335 + }, + { + "epoch": 0.38666689923427117, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 44336 + }, + { + "epoch": 0.38667562051943977, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 44337 + }, + { + "epoch": 0.3866843418046083, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 44338 + }, + { + "epoch": 0.3866930630897769, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 44339 + }, + { + "epoch": 0.3867017843749455, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 44340 + }, + { + "epoch": 0.38671050566011406, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 44341 + }, + { + "epoch": 0.38671922694528266, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 44342 + }, + { + "epoch": 0.38672794823045126, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 44343 + }, + { + "epoch": 0.3867366695156198, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 44344 + }, + { + "epoch": 0.3867453908007884, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 44345 + }, + { + "epoch": 0.386754112085957, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 44346 + }, + { + "epoch": 0.38676283337112555, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 44347 + }, + { + "epoch": 0.38677155465629415, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 44348 + }, + { + "epoch": 0.38678027594146275, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 44349 + }, + { + "epoch": 0.3867889972266313, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 44350 + }, + { + "epoch": 0.3867977185117999, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 44351 + }, + { + "epoch": 0.3868064397969685, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 44352 + }, + { + "epoch": 0.38681516108213704, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 44353 + }, + { + "epoch": 0.38682388236730564, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 44354 + }, + { + "epoch": 0.38683260365247424, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 44355 + }, + { + "epoch": 0.38684132493764284, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 44356 + }, + { + "epoch": 0.3868500462228114, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 44357 + }, + { + "epoch": 0.38685876750798, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 44358 + }, + { + "epoch": 0.3868674887931486, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 44359 + }, + { + "epoch": 0.38687621007831713, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 44360 + }, + { + "epoch": 0.38688493136348573, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 44361 + }, + { + "epoch": 0.38689365264865433, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 44362 + }, + { + "epoch": 0.3869023739338229, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 44363 + }, + { + "epoch": 0.3869110952189915, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 44364 + }, + { + "epoch": 0.3869198165041601, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 44365 + }, + { + "epoch": 0.3869285377893286, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 44366 + }, + { + "epoch": 0.3869372590744972, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 44367 + }, + { + "epoch": 0.3869459803596658, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 44368 + }, + { + "epoch": 0.38695470164483436, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 44369 + }, + { + "epoch": 0.38696342293000296, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 44370 + }, + { + "epoch": 0.38697214421517157, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 44371 + }, + { + "epoch": 0.3869808655003401, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 44372 + }, + { + "epoch": 0.3869895867855087, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 44373 + }, + { + "epoch": 0.3869983080706773, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 44374 + }, + { + "epoch": 0.38700702935584586, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 44375 + }, + { + "epoch": 0.38701575064101446, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 44376 + }, + { + "epoch": 0.38702447192618306, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 44377 + }, + { + "epoch": 0.3870331932113516, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 44378 + }, + { + "epoch": 0.3870419144965202, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 44379 + }, + { + "epoch": 0.3870506357816888, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 44380 + }, + { + "epoch": 0.38705935706685735, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 44381 + }, + { + "epoch": 0.38706807835202595, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 44382 + }, + { + "epoch": 0.38707679963719455, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 44383 + }, + { + "epoch": 0.38708552092236315, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 44384 + }, + { + "epoch": 0.3870942422075317, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 44385 + }, + { + "epoch": 0.3871029634927003, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 44386 + }, + { + "epoch": 0.3871116847778689, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 44387 + }, + { + "epoch": 0.38712040606303744, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 44388 + }, + { + "epoch": 0.38712912734820604, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 44389 + }, + { + "epoch": 0.38713784863337464, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 44390 + }, + { + "epoch": 0.3871465699185432, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9786, + "step": 44391 + }, + { + "epoch": 0.3871552912037118, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 44392 + }, + { + "epoch": 0.3871640124888804, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 44393 + }, + { + "epoch": 0.3871727337740489, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 44394 + }, + { + "epoch": 0.3871814550592175, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 44395 + }, + { + "epoch": 0.3871901763443861, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 44396 + }, + { + "epoch": 0.3871988976295547, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 44397 + }, + { + "epoch": 0.3872076189147233, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 44398 + }, + { + "epoch": 0.3872163401998919, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 44399 + }, + { + "epoch": 0.3872250614850604, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 44400 + }, + { + "epoch": 0.387233782770229, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 44401 + }, + { + "epoch": 0.3872425040553976, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 44402 + }, + { + "epoch": 0.38725122534056616, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 44403 + }, + { + "epoch": 0.38725994662573476, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 44404 + }, + { + "epoch": 0.38726866791090336, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 44405 + }, + { + "epoch": 0.3872773891960719, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 44406 + }, + { + "epoch": 0.3872861104812405, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 44407 + }, + { + "epoch": 0.3872948317664091, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 44408 + }, + { + "epoch": 0.38730355305157765, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 44409 + }, + { + "epoch": 0.38731227433674625, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 44410 + }, + { + "epoch": 0.38732099562191485, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 44411 + }, + { + "epoch": 0.38732971690708345, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 44412 + }, + { + "epoch": 0.387338438192252, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 44413 + }, + { + "epoch": 0.3873471594774206, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 44414 + }, + { + "epoch": 0.3873558807625892, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 44415 + }, + { + "epoch": 0.38736460204775774, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 44416 + }, + { + "epoch": 0.38737332333292634, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 44417 + }, + { + "epoch": 0.38738204461809495, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 44418 + }, + { + "epoch": 0.3873907659032635, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 44419 + }, + { + "epoch": 0.3873994871884321, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 44420 + }, + { + "epoch": 0.3874082084736007, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 44421 + }, + { + "epoch": 0.38741692975876924, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 44422 + }, + { + "epoch": 0.38742565104393784, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 44423 + }, + { + "epoch": 0.38743437232910644, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 44424 + }, + { + "epoch": 0.387443093614275, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 44425 + }, + { + "epoch": 0.3874518148994436, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 44426 + }, + { + "epoch": 0.3874605361846122, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 44427 + }, + { + "epoch": 0.3874692574697807, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 44428 + }, + { + "epoch": 0.3874779787549493, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 44429 + }, + { + "epoch": 0.3874867000401179, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 44430 + }, + { + "epoch": 0.38749542132528647, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 44431 + }, + { + "epoch": 0.38750414261045507, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 44432 + }, + { + "epoch": 0.38751286389562367, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 44433 + }, + { + "epoch": 0.3875215851807922, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 44434 + }, + { + "epoch": 0.3875303064659608, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 44435 + }, + { + "epoch": 0.3875390277511294, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 44436 + }, + { + "epoch": 0.38754774903629796, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 44437 + }, + { + "epoch": 0.38755647032146656, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 44438 + }, + { + "epoch": 0.38756519160663516, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 44439 + }, + { + "epoch": 0.38757391289180376, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 44440 + }, + { + "epoch": 0.3875826341769723, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 44441 + }, + { + "epoch": 0.3875913554621409, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 44442 + }, + { + "epoch": 0.3876000767473095, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 44443 + }, + { + "epoch": 0.38760879803247805, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 44444 + }, + { + "epoch": 0.38761751931764665, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 44445 + }, + { + "epoch": 0.38762624060281525, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 44446 + }, + { + "epoch": 0.3876349618879838, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 44447 + }, + { + "epoch": 0.3876436831731524, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 44448 + }, + { + "epoch": 0.387652404458321, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 44449 + }, + { + "epoch": 0.38766112574348954, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 44450 + }, + { + "epoch": 0.38766984702865814, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 44451 + }, + { + "epoch": 0.38767856831382674, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 44452 + }, + { + "epoch": 0.3876872895989953, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 44453 + }, + { + "epoch": 0.3876960108841639, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 44454 + }, + { + "epoch": 0.3877047321693325, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 44455 + }, + { + "epoch": 0.38771345345450103, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 44456 + }, + { + "epoch": 0.38772217473966963, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 44457 + }, + { + "epoch": 0.38773089602483823, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 44458 + }, + { + "epoch": 0.3877396173100068, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 44459 + }, + { + "epoch": 0.3877483385951754, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 44460 + }, + { + "epoch": 0.387757059880344, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 44461 + }, + { + "epoch": 0.3877657811655125, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 44462 + }, + { + "epoch": 0.3877745024506811, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 44463 + }, + { + "epoch": 0.3877832237358497, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 44464 + }, + { + "epoch": 0.3877919450210183, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 44465 + }, + { + "epoch": 0.38780066630618687, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 44466 + }, + { + "epoch": 0.38780938759135547, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 44467 + }, + { + "epoch": 0.38781810887652407, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 44468 + }, + { + "epoch": 0.3878268301616926, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 44469 + }, + { + "epoch": 0.3878355514468612, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 44470 + }, + { + "epoch": 0.3878442727320298, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 44471 + }, + { + "epoch": 0.38785299401719836, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 44472 + }, + { + "epoch": 0.38786171530236696, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 44473 + }, + { + "epoch": 0.38787043658753556, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 44474 + }, + { + "epoch": 0.3878791578727041, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 44475 + }, + { + "epoch": 0.3878878791578727, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 44476 + }, + { + "epoch": 0.3878966004430413, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 44477 + }, + { + "epoch": 0.38790532172820985, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 44478 + }, + { + "epoch": 0.38791404301337845, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 44479 + }, + { + "epoch": 0.38792276429854705, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 44480 + }, + { + "epoch": 0.3879314855837156, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 44481 + }, + { + "epoch": 0.3879402068688842, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 44482 + }, + { + "epoch": 0.3879489281540528, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 44483 + }, + { + "epoch": 0.38795764943922134, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 44484 + }, + { + "epoch": 0.38796637072438994, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9849, + "step": 44485 + }, + { + "epoch": 0.38797509200955854, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 44486 + }, + { + "epoch": 0.3879838132947271, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 44487 + }, + { + "epoch": 0.3879925345798957, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 44488 + }, + { + "epoch": 0.3880012558650643, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 44489 + }, + { + "epoch": 0.38800997715023283, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 44490 + }, + { + "epoch": 0.38801869843540143, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 44491 + }, + { + "epoch": 0.38802741972057003, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 44492 + }, + { + "epoch": 0.38803614100573863, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 44493 + }, + { + "epoch": 0.3880448622909072, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 44494 + }, + { + "epoch": 0.3880535835760758, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 44495 + }, + { + "epoch": 0.3880623048612444, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 44496 + }, + { + "epoch": 0.3880710261464129, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 0.9655, + "step": 44497 + }, + { + "epoch": 0.3880797474315815, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 44498 + }, + { + "epoch": 0.3880884687167501, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 44499 + }, + { + "epoch": 0.38809719000191867, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 44500 + }, + { + "epoch": 0.38810591128708727, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 44501 + }, + { + "epoch": 0.38811463257225587, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 44502 + }, + { + "epoch": 0.3881233538574244, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 44503 + }, + { + "epoch": 0.388132075142593, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 44504 + }, + { + "epoch": 0.3881407964277616, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 44505 + }, + { + "epoch": 0.38814951771293016, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 44506 + }, + { + "epoch": 0.38815823899809876, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 44507 + }, + { + "epoch": 0.38816696028326736, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9836, + "step": 44508 + }, + { + "epoch": 0.3881756815684359, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 44509 + }, + { + "epoch": 0.3881844028536045, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 44510 + }, + { + "epoch": 0.3881931241387731, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 44511 + }, + { + "epoch": 0.38820184542394165, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 44512 + }, + { + "epoch": 0.38821056670911025, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 44513 + }, + { + "epoch": 0.38821928799427885, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 44514 + }, + { + "epoch": 0.3882280092794474, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 44515 + }, + { + "epoch": 0.388236730564616, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 44516 + }, + { + "epoch": 0.3882454518497846, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 44517 + }, + { + "epoch": 0.38825417313495314, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 44518 + }, + { + "epoch": 0.38826289442012174, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 44519 + }, + { + "epoch": 0.38827161570529034, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 44520 + }, + { + "epoch": 0.38828033699045894, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 44521 + }, + { + "epoch": 0.3882890582756275, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 44522 + }, + { + "epoch": 0.3882977795607961, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 44523 + }, + { + "epoch": 0.3883065008459647, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 44524 + }, + { + "epoch": 0.38831522213113323, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 44525 + }, + { + "epoch": 0.38832394341630183, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 44526 + }, + { + "epoch": 0.38833266470147043, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 44527 + }, + { + "epoch": 0.388341385986639, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9713, + "step": 44528 + }, + { + "epoch": 0.3883501072718076, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 44529 + }, + { + "epoch": 0.3883588285569762, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 44530 + }, + { + "epoch": 0.3883675498421447, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 44531 + }, + { + "epoch": 0.3883762711273133, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 44532 + }, + { + "epoch": 0.3883849924124819, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 44533 + }, + { + "epoch": 0.38839371369765047, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 44534 + }, + { + "epoch": 0.38840243498281907, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 44535 + }, + { + "epoch": 0.38841115626798767, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 44536 + }, + { + "epoch": 0.3884198775531562, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 44537 + }, + { + "epoch": 0.3884285988383248, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 44538 + }, + { + "epoch": 0.3884373201234934, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 44539 + }, + { + "epoch": 0.38844604140866196, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 44540 + }, + { + "epoch": 0.38845476269383056, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 44541 + }, + { + "epoch": 0.38846348397899916, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 44542 + }, + { + "epoch": 0.3884722052641677, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 44543 + }, + { + "epoch": 0.3884809265493363, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 44544 + }, + { + "epoch": 0.3884896478345049, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 44545 + }, + { + "epoch": 0.38849836911967345, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 44546 + }, + { + "epoch": 0.38850709040484205, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 44547 + }, + { + "epoch": 0.38851581169001065, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 44548 + }, + { + "epoch": 0.38852453297517925, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 44549 + }, + { + "epoch": 0.3885332542603478, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 44550 + }, + { + "epoch": 0.3885419755455164, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 44551 + }, + { + "epoch": 0.388550696830685, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 44552 + }, + { + "epoch": 0.38855941811585354, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 44553 + }, + { + "epoch": 0.38856813940102214, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 44554 + }, + { + "epoch": 0.38857686068619074, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 44555 + }, + { + "epoch": 0.3885855819713593, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 44556 + }, + { + "epoch": 0.3885943032565279, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 44557 + }, + { + "epoch": 0.3886030245416965, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 44558 + }, + { + "epoch": 0.38861174582686503, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 44559 + }, + { + "epoch": 0.38862046711203363, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 44560 + }, + { + "epoch": 0.38862918839720223, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 44561 + }, + { + "epoch": 0.3886379096823708, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 44562 + }, + { + "epoch": 0.3886466309675394, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 44563 + }, + { + "epoch": 0.388655352252708, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 44564 + }, + { + "epoch": 0.3886640735378765, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 44565 + }, + { + "epoch": 0.3886727948230451, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 44566 + }, + { + "epoch": 0.3886815161082137, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 44567 + }, + { + "epoch": 0.38869023739338227, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 44568 + }, + { + "epoch": 0.38869895867855087, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 44569 + }, + { + "epoch": 0.38870767996371947, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 44570 + }, + { + "epoch": 0.388716401248888, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 44571 + }, + { + "epoch": 0.3887251225340566, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 44572 + }, + { + "epoch": 0.3887338438192252, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 44573 + }, + { + "epoch": 0.38874256510439376, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 44574 + }, + { + "epoch": 0.38875128638956236, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 44575 + }, + { + "epoch": 0.38876000767473096, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 44576 + }, + { + "epoch": 0.38876872895989956, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 44577 + }, + { + "epoch": 0.3887774502450681, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 44578 + }, + { + "epoch": 0.3887861715302367, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 44579 + }, + { + "epoch": 0.3887948928154053, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 44580 + }, + { + "epoch": 0.38880361410057385, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 44581 + }, + { + "epoch": 0.38881233538574245, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 44582 + }, + { + "epoch": 0.38882105667091105, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 44583 + }, + { + "epoch": 0.3888297779560796, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 44584 + }, + { + "epoch": 0.3888384992412482, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 44585 + }, + { + "epoch": 0.3888472205264168, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 44586 + }, + { + "epoch": 0.38885594181158534, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 44587 + }, + { + "epoch": 0.38886466309675394, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 44588 + }, + { + "epoch": 0.38887338438192254, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 44589 + }, + { + "epoch": 0.3888821056670911, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 44590 + }, + { + "epoch": 0.3888908269522597, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 44591 + }, + { + "epoch": 0.3888995482374283, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 44592 + }, + { + "epoch": 0.38890826952259683, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 44593 + }, + { + "epoch": 0.38891699080776543, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 44594 + }, + { + "epoch": 0.38892571209293403, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 44595 + }, + { + "epoch": 0.3889344333781026, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 44596 + }, + { + "epoch": 0.3889431546632712, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 44597 + }, + { + "epoch": 0.3889518759484398, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 44598 + }, + { + "epoch": 0.3889605972336083, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 44599 + }, + { + "epoch": 0.3889693185187769, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 44600 + }, + { + "epoch": 0.3889780398039455, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 44601 + }, + { + "epoch": 0.3889867610891141, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 44602 + }, + { + "epoch": 0.38899548237428266, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 44603 + }, + { + "epoch": 0.38900420365945126, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 44604 + }, + { + "epoch": 0.38901292494461986, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 44605 + }, + { + "epoch": 0.3890216462297884, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 44606 + }, + { + "epoch": 0.389030367514957, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 44607 + }, + { + "epoch": 0.3890390888001256, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 44608 + }, + { + "epoch": 0.38904781008529415, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 44609 + }, + { + "epoch": 0.38905653137046275, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 44610 + }, + { + "epoch": 0.38906525265563136, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 44611 + }, + { + "epoch": 0.3890739739407999, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 44612 + }, + { + "epoch": 0.3890826952259685, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 44613 + }, + { + "epoch": 0.3890914165111371, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 44614 + }, + { + "epoch": 0.38910013779630565, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 44615 + }, + { + "epoch": 0.38910885908147425, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 44616 + }, + { + "epoch": 0.38911758036664285, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 44617 + }, + { + "epoch": 0.3891263016518114, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 44618 + }, + { + "epoch": 0.38913502293698, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 44619 + }, + { + "epoch": 0.3891437442221486, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 44620 + }, + { + "epoch": 0.38915246550731714, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 44621 + }, + { + "epoch": 0.38916118679248574, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 44622 + }, + { + "epoch": 0.38916990807765434, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 44623 + }, + { + "epoch": 0.3891786293628229, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 44624 + }, + { + "epoch": 0.3891873506479915, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 44625 + }, + { + "epoch": 0.3891960719331601, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 44626 + }, + { + "epoch": 0.3892047932183286, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 44627 + }, + { + "epoch": 0.3892135145034972, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 44628 + }, + { + "epoch": 0.3892222357886658, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 44629 + }, + { + "epoch": 0.3892309570738344, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 44630 + }, + { + "epoch": 0.38923967835900297, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 44631 + }, + { + "epoch": 0.38924839964417157, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 44632 + }, + { + "epoch": 0.3892571209293402, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 44633 + }, + { + "epoch": 0.3892658422145087, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 44634 + }, + { + "epoch": 0.3892745634996773, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 44635 + }, + { + "epoch": 0.3892832847848459, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 44636 + }, + { + "epoch": 0.38929200607001446, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 44637 + }, + { + "epoch": 0.38930072735518306, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 44638 + }, + { + "epoch": 0.38930944864035166, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 44639 + }, + { + "epoch": 0.3893181699255202, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 44640 + }, + { + "epoch": 0.3893268912106888, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 44641 + }, + { + "epoch": 0.3893356124958574, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 44642 + }, + { + "epoch": 0.38934433378102595, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 44643 + }, + { + "epoch": 0.38935305506619455, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 44644 + }, + { + "epoch": 0.38936177635136315, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 44645 + }, + { + "epoch": 0.3893704976365317, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 44646 + }, + { + "epoch": 0.3893792189217003, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 44647 + }, + { + "epoch": 0.3893879402068689, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 44648 + }, + { + "epoch": 0.38939666149203744, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 44649 + }, + { + "epoch": 0.38940538277720604, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 44650 + }, + { + "epoch": 0.38941410406237464, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 44651 + }, + { + "epoch": 0.3894228253475432, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 44652 + }, + { + "epoch": 0.3894315466327118, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 44653 + }, + { + "epoch": 0.3894402679178804, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 44654 + }, + { + "epoch": 0.38944898920304893, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 44655 + }, + { + "epoch": 0.38945771048821753, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 44656 + }, + { + "epoch": 0.38946643177338613, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 44657 + }, + { + "epoch": 0.38947515305855473, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 44658 + }, + { + "epoch": 0.3894838743437233, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 44659 + }, + { + "epoch": 0.3894925956288919, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 44660 + }, + { + "epoch": 0.3895013169140605, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 44661 + }, + { + "epoch": 0.389510038199229, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 44662 + }, + { + "epoch": 0.3895187594843976, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9803, + "step": 44663 + }, + { + "epoch": 0.3895274807695662, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 44664 + }, + { + "epoch": 0.38953620205473477, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 44665 + }, + { + "epoch": 0.38954492333990337, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 44666 + }, + { + "epoch": 0.38955364462507197, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 44667 + }, + { + "epoch": 0.3895623659102405, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 44668 + }, + { + "epoch": 0.3895710871954091, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 44669 + }, + { + "epoch": 0.3895798084805777, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 44670 + }, + { + "epoch": 0.38958852976574626, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 44671 + }, + { + "epoch": 0.38959725105091486, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 44672 + }, + { + "epoch": 0.38960597233608346, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 44673 + }, + { + "epoch": 0.389614693621252, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 44674 + }, + { + "epoch": 0.3896234149064206, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 44675 + }, + { + "epoch": 0.3896321361915892, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 44676 + }, + { + "epoch": 0.38964085747675775, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 44677 + }, + { + "epoch": 0.38964957876192635, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 44678 + }, + { + "epoch": 0.38965830004709495, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 44679 + }, + { + "epoch": 0.3896670213322635, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 44680 + }, + { + "epoch": 0.3896757426174321, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 44681 + }, + { + "epoch": 0.3896844639026007, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 44682 + }, + { + "epoch": 0.38969318518776924, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 44683 + }, + { + "epoch": 0.38970190647293784, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 44684 + }, + { + "epoch": 0.38971062775810644, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 44685 + }, + { + "epoch": 0.38971934904327504, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 44686 + }, + { + "epoch": 0.3897280703284436, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 44687 + }, + { + "epoch": 0.3897367916136122, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 44688 + }, + { + "epoch": 0.3897455128987808, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 44689 + }, + { + "epoch": 0.38975423418394933, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 44690 + }, + { + "epoch": 0.38976295546911793, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 44691 + }, + { + "epoch": 0.38977167675428653, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 44692 + }, + { + "epoch": 0.3897803980394551, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 44693 + }, + { + "epoch": 0.3897891193246237, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 44694 + }, + { + "epoch": 0.3897978406097923, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 44695 + }, + { + "epoch": 0.3898065618949608, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 44696 + }, + { + "epoch": 0.3898152831801294, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 44697 + }, + { + "epoch": 0.389824004465298, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 44698 + }, + { + "epoch": 0.38983272575046657, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 44699 + }, + { + "epoch": 0.38984144703563517, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 44700 + }, + { + "epoch": 0.38985016832080377, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 44701 + }, + { + "epoch": 0.3898588896059723, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 44702 + }, + { + "epoch": 0.3898676108911409, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 44703 + }, + { + "epoch": 0.3898763321763095, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 44704 + }, + { + "epoch": 0.38988505346147806, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 44705 + }, + { + "epoch": 0.38989377474664666, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 44706 + }, + { + "epoch": 0.38990249603181526, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 44707 + }, + { + "epoch": 0.3899112173169838, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 44708 + }, + { + "epoch": 0.3899199386021524, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 44709 + }, + { + "epoch": 0.389928659887321, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 44710 + }, + { + "epoch": 0.3899373811724896, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 44711 + }, + { + "epoch": 0.38994610245765815, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 44712 + }, + { + "epoch": 0.38995482374282675, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 44713 + }, + { + "epoch": 0.38996354502799535, + "grad_norm": 0.244140625, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 44714 + }, + { + "epoch": 0.3899722663131639, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 44715 + }, + { + "epoch": 0.3899809875983325, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 44716 + }, + { + "epoch": 0.3899897088835011, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 44717 + }, + { + "epoch": 0.38999843016866964, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 44718 + }, + { + "epoch": 0.39000715145383824, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 44719 + }, + { + "epoch": 0.39001587273900684, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 44720 + }, + { + "epoch": 0.3900245940241754, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 44721 + }, + { + "epoch": 0.390033315309344, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 44722 + }, + { + "epoch": 0.3900420365945126, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 44723 + }, + { + "epoch": 0.39005075787968113, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 44724 + }, + { + "epoch": 0.39005947916484973, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 44725 + }, + { + "epoch": 0.39006820045001833, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 44726 + }, + { + "epoch": 0.3900769217351869, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 44727 + }, + { + "epoch": 0.3900856430203555, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 44728 + }, + { + "epoch": 0.3900943643055241, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9727, + "step": 44729 + }, + { + "epoch": 0.3901030855906926, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 44730 + }, + { + "epoch": 0.3901118068758612, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 44731 + }, + { + "epoch": 0.3901205281610298, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 44732 + }, + { + "epoch": 0.39012924944619837, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 44733 + }, + { + "epoch": 0.39013797073136697, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 44734 + }, + { + "epoch": 0.39014669201653557, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 44735 + }, + { + "epoch": 0.3901554133017041, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 44736 + }, + { + "epoch": 0.3901641345868727, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 44737 + }, + { + "epoch": 0.3901728558720413, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 44738 + }, + { + "epoch": 0.3901815771572099, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 44739 + }, + { + "epoch": 0.39019029844237846, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 44740 + }, + { + "epoch": 0.39019901972754706, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 44741 + }, + { + "epoch": 0.39020774101271566, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 44742 + }, + { + "epoch": 0.3902164622978842, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 44743 + }, + { + "epoch": 0.3902251835830528, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 44744 + }, + { + "epoch": 0.3902339048682214, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 44745 + }, + { + "epoch": 0.39024262615338995, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 44746 + }, + { + "epoch": 0.39025134743855855, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 44747 + }, + { + "epoch": 0.39026006872372715, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 44748 + }, + { + "epoch": 0.3902687900088957, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 44749 + }, + { + "epoch": 0.3902775112940643, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 44750 + }, + { + "epoch": 0.3902862325792329, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 44751 + }, + { + "epoch": 0.39029495386440144, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9779, + "step": 44752 + }, + { + "epoch": 0.39030367514957004, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 44753 + }, + { + "epoch": 0.39031239643473864, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 44754 + }, + { + "epoch": 0.3903211177199072, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 44755 + }, + { + "epoch": 0.3903298390050758, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 44756 + }, + { + "epoch": 0.3903385602902444, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 44757 + }, + { + "epoch": 0.39034728157541293, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 44758 + }, + { + "epoch": 0.39035600286058153, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 44759 + }, + { + "epoch": 0.39036472414575013, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 44760 + }, + { + "epoch": 0.3903734454309187, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 44761 + }, + { + "epoch": 0.3903821667160873, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 44762 + }, + { + "epoch": 0.3903908880012559, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9852, + "step": 44763 + }, + { + "epoch": 0.3903996092864244, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 44764 + }, + { + "epoch": 0.390408330571593, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 44765 + }, + { + "epoch": 0.3904170518567616, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 44766 + }, + { + "epoch": 0.3904257731419302, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 44767 + }, + { + "epoch": 0.39043449442709877, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 44768 + }, + { + "epoch": 0.39044321571226737, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 44769 + }, + { + "epoch": 0.39045193699743597, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 44770 + }, + { + "epoch": 0.3904606582826045, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 44771 + }, + { + "epoch": 0.3904693795677731, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 44772 + }, + { + "epoch": 0.3904781008529417, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 44773 + }, + { + "epoch": 0.39048682213811026, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 44774 + }, + { + "epoch": 0.39049554342327886, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 44775 + }, + { + "epoch": 0.39050426470844746, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 44776 + }, + { + "epoch": 0.390512985993616, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 44777 + }, + { + "epoch": 0.3905217072787846, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 44778 + }, + { + "epoch": 0.3905304285639532, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 44779 + }, + { + "epoch": 0.39053914984912175, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 44780 + }, + { + "epoch": 0.39054787113429035, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 44781 + }, + { + "epoch": 0.39055659241945895, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 44782 + }, + { + "epoch": 0.3905653137046275, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 44783 + }, + { + "epoch": 0.3905740349897961, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 44784 + }, + { + "epoch": 0.3905827562749647, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 44785 + }, + { + "epoch": 0.39059147756013324, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 44786 + }, + { + "epoch": 0.39060019884530184, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 44787 + }, + { + "epoch": 0.39060892013047044, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 44788 + }, + { + "epoch": 0.390617641415639, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 44789 + }, + { + "epoch": 0.3906263627008076, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 44790 + }, + { + "epoch": 0.3906350839859762, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 44791 + }, + { + "epoch": 0.39064380527114473, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 44792 + }, + { + "epoch": 0.39065252655631333, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 44793 + }, + { + "epoch": 0.39066124784148193, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 44794 + }, + { + "epoch": 0.39066996912665053, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 44795 + }, + { + "epoch": 0.3906786904118191, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 44796 + }, + { + "epoch": 0.3906874116969877, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 44797 + }, + { + "epoch": 0.3906961329821563, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 44798 + }, + { + "epoch": 0.3907048542673248, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 44799 + }, + { + "epoch": 0.3907135755524934, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 44800 + }, + { + "epoch": 0.390722296837662, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 44801 + }, + { + "epoch": 0.39073101812283056, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 44802 + }, + { + "epoch": 0.39073973940799916, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 44803 + }, + { + "epoch": 0.39074846069316777, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 44804 + }, + { + "epoch": 0.3907571819783363, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 44805 + }, + { + "epoch": 0.3907659032635049, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 44806 + }, + { + "epoch": 0.3907746245486735, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 44807 + }, + { + "epoch": 0.39078334583384206, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 44808 + }, + { + "epoch": 0.39079206711901066, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 44809 + }, + { + "epoch": 0.39080078840417926, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 44810 + }, + { + "epoch": 0.3908095096893478, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 44811 + }, + { + "epoch": 0.3908182309745164, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 44812 + }, + { + "epoch": 0.390826952259685, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 44813 + }, + { + "epoch": 0.39083567354485355, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 44814 + }, + { + "epoch": 0.39084439483002215, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 44815 + }, + { + "epoch": 0.39085311611519075, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 44816 + }, + { + "epoch": 0.3908618374003593, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 44817 + }, + { + "epoch": 0.3908705586855279, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 44818 + }, + { + "epoch": 0.3908792799706965, + "grad_norm": 0.2177734375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 44819 + }, + { + "epoch": 0.3908880012558651, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 44820 + }, + { + "epoch": 0.39089672254103364, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 44821 + }, + { + "epoch": 0.39090544382620224, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 44822 + }, + { + "epoch": 0.39091416511137084, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 44823 + }, + { + "epoch": 0.3909228863965394, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 44824 + }, + { + "epoch": 0.390931607681708, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 44825 + }, + { + "epoch": 0.3909403289668766, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 44826 + }, + { + "epoch": 0.3909490502520451, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 44827 + }, + { + "epoch": 0.3909577715372137, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 44828 + }, + { + "epoch": 0.3909664928223823, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 44829 + }, + { + "epoch": 0.3909752141075509, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 44830 + }, + { + "epoch": 0.3909839353927195, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 44831 + }, + { + "epoch": 0.3909926566778881, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 44832 + }, + { + "epoch": 0.3910013779630566, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 44833 + }, + { + "epoch": 0.3910100992482252, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 44834 + }, + { + "epoch": 0.3910188205333938, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 44835 + }, + { + "epoch": 0.39102754181856236, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 44836 + }, + { + "epoch": 0.39103626310373096, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 44837 + }, + { + "epoch": 0.39104498438889956, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 44838 + }, + { + "epoch": 0.3910537056740681, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 44839 + }, + { + "epoch": 0.3910624269592367, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 44840 + }, + { + "epoch": 0.3910711482444053, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 44841 + }, + { + "epoch": 0.39107986952957385, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 44842 + }, + { + "epoch": 0.39108859081474245, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 44843 + }, + { + "epoch": 0.39109731209991105, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 44844 + }, + { + "epoch": 0.3911060333850796, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 44845 + }, + { + "epoch": 0.3911147546702482, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 44846 + }, + { + "epoch": 0.3911234759554168, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 44847 + }, + { + "epoch": 0.3911321972405854, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 44848 + }, + { + "epoch": 0.39114091852575394, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0301, + "step": 44849 + }, + { + "epoch": 0.39114963981092254, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 44850 + }, + { + "epoch": 0.39115836109609115, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 44851 + }, + { + "epoch": 0.3911670823812597, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 44852 + }, + { + "epoch": 0.3911758036664283, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 44853 + }, + { + "epoch": 0.3911845249515969, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 44854 + }, + { + "epoch": 0.39119324623676544, + "grad_norm": 0.2890625, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 44855 + }, + { + "epoch": 0.39120196752193404, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.982, + "step": 44856 + }, + { + "epoch": 0.39121068880710264, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 44857 + }, + { + "epoch": 0.3912194100922712, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 44858 + }, + { + "epoch": 0.3912281313774398, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 44859 + }, + { + "epoch": 0.3912368526626084, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 44860 + }, + { + "epoch": 0.3912455739477769, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 44861 + }, + { + "epoch": 0.3912542952329455, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 44862 + }, + { + "epoch": 0.3912630165181141, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 44863 + }, + { + "epoch": 0.39127173780328267, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 44864 + }, + { + "epoch": 0.39128045908845127, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 44865 + }, + { + "epoch": 0.39128918037361987, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 44866 + }, + { + "epoch": 0.3912979016587884, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 44867 + }, + { + "epoch": 0.391306622943957, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 44868 + }, + { + "epoch": 0.3913153442291256, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 44869 + }, + { + "epoch": 0.39132406551429416, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 44870 + }, + { + "epoch": 0.39133278679946276, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 44871 + }, + { + "epoch": 0.39134150808463136, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 44872 + }, + { + "epoch": 0.3913502293697999, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 44873 + }, + { + "epoch": 0.3913589506549685, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 44874 + }, + { + "epoch": 0.3913676719401371, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 44875 + }, + { + "epoch": 0.3913763932253057, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 44876 + }, + { + "epoch": 0.39138511451047425, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 44877 + }, + { + "epoch": 0.39139383579564285, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 44878 + }, + { + "epoch": 0.39140255708081145, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 44879 + }, + { + "epoch": 0.39141127836598, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 44880 + }, + { + "epoch": 0.3914199996511486, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 44881 + }, + { + "epoch": 0.3914287209363172, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 44882 + }, + { + "epoch": 0.39143744222148574, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 44883 + }, + { + "epoch": 0.39144616350665434, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 44884 + }, + { + "epoch": 0.39145488479182294, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 44885 + }, + { + "epoch": 0.3914636060769915, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 44886 + }, + { + "epoch": 0.3914723273621601, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 44887 + }, + { + "epoch": 0.3914810486473287, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 44888 + }, + { + "epoch": 0.39148976993249723, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 44889 + }, + { + "epoch": 0.39149849121766583, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 44890 + }, + { + "epoch": 0.39150721250283443, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 44891 + }, + { + "epoch": 0.391515933788003, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 44892 + }, + { + "epoch": 0.3915246550731716, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 44893 + }, + { + "epoch": 0.3915333763583402, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 44894 + }, + { + "epoch": 0.3915420976435087, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 44895 + }, + { + "epoch": 0.3915508189286773, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 44896 + }, + { + "epoch": 0.3915595402138459, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.9717, + "step": 44897 + }, + { + "epoch": 0.39156826149901447, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 44898 + }, + { + "epoch": 0.39157698278418307, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 44899 + }, + { + "epoch": 0.39158570406935167, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 44900 + }, + { + "epoch": 0.3915944253545202, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 44901 + }, + { + "epoch": 0.3916031466396888, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 44902 + }, + { + "epoch": 0.3916118679248574, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 44903 + }, + { + "epoch": 0.391620589210026, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 44904 + }, + { + "epoch": 0.39162931049519456, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 44905 + }, + { + "epoch": 0.39163803178036316, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 44906 + }, + { + "epoch": 0.39164675306553176, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 44907 + }, + { + "epoch": 0.3916554743507003, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 44908 + }, + { + "epoch": 0.3916641956358689, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 44909 + }, + { + "epoch": 0.3916729169210375, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 44910 + }, + { + "epoch": 0.39168163820620605, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 44911 + }, + { + "epoch": 0.39169035949137465, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 44912 + }, + { + "epoch": 0.39169908077654325, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 44913 + }, + { + "epoch": 0.3917078020617118, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 44914 + }, + { + "epoch": 0.3917165233468804, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 44915 + }, + { + "epoch": 0.391725244632049, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 44916 + }, + { + "epoch": 0.39173396591721754, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 44917 + }, + { + "epoch": 0.39174268720238614, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9689, + "step": 44918 + }, + { + "epoch": 0.39175140848755474, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 44919 + }, + { + "epoch": 0.3917601297727233, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 44920 + }, + { + "epoch": 0.3917688510578919, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 44921 + }, + { + "epoch": 0.3917775723430605, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 44922 + }, + { + "epoch": 0.39178629362822903, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 44923 + }, + { + "epoch": 0.39179501491339763, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9849, + "step": 44924 + }, + { + "epoch": 0.39180373619856623, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 44925 + }, + { + "epoch": 0.3918124574837348, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 44926 + }, + { + "epoch": 0.3918211787689034, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 44927 + }, + { + "epoch": 0.391829900054072, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 44928 + }, + { + "epoch": 0.3918386213392406, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 44929 + }, + { + "epoch": 0.3918473426244091, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 44930 + }, + { + "epoch": 0.3918560639095777, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9717, + "step": 44931 + }, + { + "epoch": 0.3918647851947463, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 44932 + }, + { + "epoch": 0.39187350647991487, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 44933 + }, + { + "epoch": 0.39188222776508347, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 44934 + }, + { + "epoch": 0.39189094905025207, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 44935 + }, + { + "epoch": 0.3918996703354206, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 44936 + }, + { + "epoch": 0.3919083916205892, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 44937 + }, + { + "epoch": 0.3919171129057578, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 44938 + }, + { + "epoch": 0.39192583419092636, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 44939 + }, + { + "epoch": 0.39193455547609496, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 44940 + }, + { + "epoch": 0.39194327676126356, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 44941 + }, + { + "epoch": 0.3919519980464321, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 44942 + }, + { + "epoch": 0.3919607193316007, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 44943 + }, + { + "epoch": 0.3919694406167693, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 44944 + }, + { + "epoch": 0.39197816190193785, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 44945 + }, + { + "epoch": 0.39198688318710645, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 44946 + }, + { + "epoch": 0.39199560447227505, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 44947 + }, + { + "epoch": 0.3920043257574436, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 44948 + }, + { + "epoch": 0.3920130470426122, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 44949 + }, + { + "epoch": 0.3920217683277808, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9842, + "step": 44950 + }, + { + "epoch": 0.39203048961294934, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 44951 + }, + { + "epoch": 0.39203921089811794, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 44952 + }, + { + "epoch": 0.39204793218328654, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 44953 + }, + { + "epoch": 0.3920566534684551, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 44954 + }, + { + "epoch": 0.3920653747536237, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 44955 + }, + { + "epoch": 0.3920740960387923, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9818, + "step": 44956 + }, + { + "epoch": 0.3920828173239609, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 44957 + }, + { + "epoch": 0.39209153860912943, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 44958 + }, + { + "epoch": 0.39210025989429803, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 44959 + }, + { + "epoch": 0.39210898117946663, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 44960 + }, + { + "epoch": 0.3921177024646352, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 44961 + }, + { + "epoch": 0.3921264237498038, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 44962 + }, + { + "epoch": 0.3921351450349724, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 44963 + }, + { + "epoch": 0.3921438663201409, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 44964 + }, + { + "epoch": 0.3921525876053095, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 44965 + }, + { + "epoch": 0.3921613088904781, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 44966 + }, + { + "epoch": 0.39217003017564667, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 44967 + }, + { + "epoch": 0.39217875146081527, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 44968 + }, + { + "epoch": 0.39218747274598387, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 44969 + }, + { + "epoch": 0.3921961940311524, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 44970 + }, + { + "epoch": 0.392204915316321, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 44971 + }, + { + "epoch": 0.3922136366014896, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 44972 + }, + { + "epoch": 0.39222235788665816, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 44973 + }, + { + "epoch": 0.39223107917182676, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 44974 + }, + { + "epoch": 0.39223980045699536, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 44975 + }, + { + "epoch": 0.3922485217421639, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 44976 + }, + { + "epoch": 0.3922572430273325, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 44977 + }, + { + "epoch": 0.3922659643125011, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 44978 + }, + { + "epoch": 0.39227468559766965, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 44979 + }, + { + "epoch": 0.39228340688283825, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 44980 + }, + { + "epoch": 0.39229212816800685, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 44981 + }, + { + "epoch": 0.3923008494531754, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 44982 + }, + { + "epoch": 0.392309570738344, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 44983 + }, + { + "epoch": 0.3923182920235126, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 44984 + }, + { + "epoch": 0.3923270133086812, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 44985 + }, + { + "epoch": 0.39233573459384974, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9821, + "step": 44986 + }, + { + "epoch": 0.39234445587901834, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 44987 + }, + { + "epoch": 0.39235317716418694, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 44988 + }, + { + "epoch": 0.3923618984493555, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 44989 + }, + { + "epoch": 0.3923706197345241, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 44990 + }, + { + "epoch": 0.3923793410196927, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 44991 + }, + { + "epoch": 0.39238806230486123, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 44992 + }, + { + "epoch": 0.39239678359002983, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 44993 + }, + { + "epoch": 0.39240550487519843, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 44994 + }, + { + "epoch": 0.392414226160367, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 44995 + }, + { + "epoch": 0.3924229474455356, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9833, + "step": 44996 + }, + { + "epoch": 0.3924316687307042, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 44997 + }, + { + "epoch": 0.3924403900158727, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 44998 + }, + { + "epoch": 0.3924491113010413, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 0.9829, + "step": 44999 + }, + { + "epoch": 0.3924578325862099, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 45000 + }, + { + "epoch": 0.39246655387137847, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 45001 + }, + { + "epoch": 0.39247527515654707, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 45002 + }, + { + "epoch": 0.39248399644171567, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 45003 + }, + { + "epoch": 0.3924927177268842, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 45004 + }, + { + "epoch": 0.3925014390120528, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 45005 + }, + { + "epoch": 0.3925101602972214, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 45006 + }, + { + "epoch": 0.39251888158238996, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 45007 + }, + { + "epoch": 0.39252760286755856, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 45008 + }, + { + "epoch": 0.39253632415272716, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 45009 + }, + { + "epoch": 0.3925450454378957, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 45010 + }, + { + "epoch": 0.3925537667230643, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 45011 + }, + { + "epoch": 0.3925624880082329, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 45012 + }, + { + "epoch": 0.3925712092934015, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 45013 + }, + { + "epoch": 0.39257993057857005, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 45014 + }, + { + "epoch": 0.39258865186373865, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 45015 + }, + { + "epoch": 0.39259737314890725, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 45016 + }, + { + "epoch": 0.3926060944340758, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 45017 + }, + { + "epoch": 0.3926148157192444, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 45018 + }, + { + "epoch": 0.392623537004413, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 45019 + }, + { + "epoch": 0.39263225828958154, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 45020 + }, + { + "epoch": 0.39264097957475014, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 45021 + }, + { + "epoch": 0.39264970085991874, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 45022 + }, + { + "epoch": 0.3926584221450873, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 45023 + }, + { + "epoch": 0.3926671434302559, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 45024 + }, + { + "epoch": 0.3926758647154245, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 45025 + }, + { + "epoch": 0.39268458600059303, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 45026 + }, + { + "epoch": 0.39269330728576163, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 45027 + }, + { + "epoch": 0.39270202857093023, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 45028 + }, + { + "epoch": 0.3927107498560988, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 45029 + }, + { + "epoch": 0.3927194711412674, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 45030 + }, + { + "epoch": 0.392728192426436, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 45031 + }, + { + "epoch": 0.3927369137116045, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 45032 + }, + { + "epoch": 0.3927456349967731, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 45033 + }, + { + "epoch": 0.3927543562819417, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 45034 + }, + { + "epoch": 0.39276307756711026, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 45035 + }, + { + "epoch": 0.39277179885227886, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 45036 + }, + { + "epoch": 0.39278052013744746, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 45037 + }, + { + "epoch": 0.39278924142261606, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 45038 + }, + { + "epoch": 0.3927979627077846, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 45039 + }, + { + "epoch": 0.3928066839929532, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 45040 + }, + { + "epoch": 0.3928154052781218, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 45041 + }, + { + "epoch": 0.39282412656329035, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 45042 + }, + { + "epoch": 0.39283284784845895, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 45043 + }, + { + "epoch": 0.39284156913362756, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 45044 + }, + { + "epoch": 0.3928502904187961, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 45045 + }, + { + "epoch": 0.3928590117039647, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 45046 + }, + { + "epoch": 0.3928677329891333, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 45047 + }, + { + "epoch": 0.39287645427430185, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 45048 + }, + { + "epoch": 0.39288517555947045, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 45049 + }, + { + "epoch": 0.39289389684463905, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 45050 + }, + { + "epoch": 0.3929026181298076, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 45051 + }, + { + "epoch": 0.3929113394149762, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 45052 + }, + { + "epoch": 0.3929200607001448, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 45053 + }, + { + "epoch": 0.39292878198531334, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 45054 + }, + { + "epoch": 0.39293750327048194, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 45055 + }, + { + "epoch": 0.39294622455565054, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 45056 + }, + { + "epoch": 0.3929549458408191, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 45057 + }, + { + "epoch": 0.3929636671259877, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9864, + "step": 45058 + }, + { + "epoch": 0.3929723884111563, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 45059 + }, + { + "epoch": 0.3929811096963248, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 45060 + }, + { + "epoch": 0.3929898309814934, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 45061 + }, + { + "epoch": 0.392998552266662, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 45062 + }, + { + "epoch": 0.39300727355183057, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 45063 + }, + { + "epoch": 0.39301599483699917, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 45064 + }, + { + "epoch": 0.39302471612216777, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 45065 + }, + { + "epoch": 0.3930334374073364, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 45066 + }, + { + "epoch": 0.3930421586925049, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 45067 + }, + { + "epoch": 0.3930508799776735, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 45068 + }, + { + "epoch": 0.3930596012628421, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 45069 + }, + { + "epoch": 0.39306832254801066, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 45070 + }, + { + "epoch": 0.39307704383317926, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 45071 + }, + { + "epoch": 0.39308576511834786, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 45072 + }, + { + "epoch": 0.3930944864035164, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 45073 + }, + { + "epoch": 0.393103207688685, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 45074 + }, + { + "epoch": 0.3931119289738536, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 45075 + }, + { + "epoch": 0.39312065025902215, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 45076 + }, + { + "epoch": 0.39312937154419075, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 45077 + }, + { + "epoch": 0.39313809282935935, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 45078 + }, + { + "epoch": 0.3931468141145279, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 45079 + }, + { + "epoch": 0.3931555353996965, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 45080 + }, + { + "epoch": 0.3931642566848651, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 45081 + }, + { + "epoch": 0.39317297797003364, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 45082 + }, + { + "epoch": 0.39318169925520224, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 45083 + }, + { + "epoch": 0.39319042054037084, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 45084 + }, + { + "epoch": 0.3931991418255394, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 45085 + }, + { + "epoch": 0.393207863110708, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 45086 + }, + { + "epoch": 0.3932165843958766, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 45087 + }, + { + "epoch": 0.39322530568104513, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 45088 + }, + { + "epoch": 0.39323402696621373, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 45089 + }, + { + "epoch": 0.39324274825138233, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 45090 + }, + { + "epoch": 0.3932514695365509, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 45091 + }, + { + "epoch": 0.3932601908217195, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 45092 + }, + { + "epoch": 0.3932689121068881, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 45093 + }, + { + "epoch": 0.3932776333920567, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 45094 + }, + { + "epoch": 0.3932863546772252, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 45095 + }, + { + "epoch": 0.3932950759623938, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 45096 + }, + { + "epoch": 0.3933037972475624, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 45097 + }, + { + "epoch": 0.39331251853273097, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 45098 + }, + { + "epoch": 0.39332123981789957, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 45099 + }, + { + "epoch": 0.39332996110306817, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 45100 + }, + { + "epoch": 0.3933386823882367, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 45101 + }, + { + "epoch": 0.3933474036734053, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 45102 + }, + { + "epoch": 0.3933561249585739, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 45103 + }, + { + "epoch": 0.39336484624374246, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 45104 + }, + { + "epoch": 0.39337356752891106, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 45105 + }, + { + "epoch": 0.39338228881407966, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 45106 + }, + { + "epoch": 0.3933910100992482, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 45107 + }, + { + "epoch": 0.3933997313844168, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 45108 + }, + { + "epoch": 0.3934084526695854, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 45109 + }, + { + "epoch": 0.39341717395475395, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 45110 + }, + { + "epoch": 0.39342589523992255, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9775, + "step": 45111 + }, + { + "epoch": 0.39343461652509115, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 45112 + }, + { + "epoch": 0.3934433378102597, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 45113 + }, + { + "epoch": 0.3934520590954283, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 45114 + }, + { + "epoch": 0.3934607803805969, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 45115 + }, + { + "epoch": 0.39346950166576544, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 45116 + }, + { + "epoch": 0.39347822295093404, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 45117 + }, + { + "epoch": 0.39348694423610264, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 45118 + }, + { + "epoch": 0.3934956655212712, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 45119 + }, + { + "epoch": 0.3935043868064398, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 45120 + }, + { + "epoch": 0.3935131080916084, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 45121 + }, + { + "epoch": 0.393521829376777, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 45122 + }, + { + "epoch": 0.39353055066194553, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 45123 + }, + { + "epoch": 0.39353927194711413, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 45124 + }, + { + "epoch": 0.39354799323228273, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 45125 + }, + { + "epoch": 0.3935567145174513, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 45126 + }, + { + "epoch": 0.3935654358026199, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 45127 + }, + { + "epoch": 0.3935741570877885, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 45128 + }, + { + "epoch": 0.393582878372957, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 45129 + }, + { + "epoch": 0.3935915996581256, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 45130 + }, + { + "epoch": 0.3936003209432942, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 45131 + }, + { + "epoch": 0.39360904222846277, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 45132 + }, + { + "epoch": 0.39361776351363137, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 45133 + }, + { + "epoch": 0.39362648479879997, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 45134 + }, + { + "epoch": 0.3936352060839685, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 45135 + }, + { + "epoch": 0.3936439273691371, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 45136 + }, + { + "epoch": 0.3936526486543057, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 45137 + }, + { + "epoch": 0.39366136993947426, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 45138 + }, + { + "epoch": 0.39367009122464286, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 45139 + }, + { + "epoch": 0.39367881250981146, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 45140 + }, + { + "epoch": 0.39368753379498, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 45141 + }, + { + "epoch": 0.3936962550801486, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 45142 + }, + { + "epoch": 0.3937049763653172, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 45143 + }, + { + "epoch": 0.39371369765048575, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 45144 + }, + { + "epoch": 0.39372241893565435, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 45145 + }, + { + "epoch": 0.39373114022082295, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 45146 + }, + { + "epoch": 0.3937398615059915, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005, + "loss": 0.9793, + "step": 45147 + }, + { + "epoch": 0.3937485827911601, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 45148 + }, + { + "epoch": 0.3937573040763287, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 45149 + }, + { + "epoch": 0.3937660253614973, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 45150 + }, + { + "epoch": 0.39377474664666584, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 45151 + }, + { + "epoch": 0.39378346793183444, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 45152 + }, + { + "epoch": 0.39379218921700304, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 45153 + }, + { + "epoch": 0.3938009105021716, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 45154 + }, + { + "epoch": 0.3938096317873402, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 45155 + }, + { + "epoch": 0.3938183530725088, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 45156 + }, + { + "epoch": 0.39382707435767733, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 45157 + }, + { + "epoch": 0.39383579564284593, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 45158 + }, + { + "epoch": 0.39384451692801453, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 45159 + }, + { + "epoch": 0.3938532382131831, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 45160 + }, + { + "epoch": 0.3938619594983517, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 45161 + }, + { + "epoch": 0.3938706807835203, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 45162 + }, + { + "epoch": 0.3938794020686888, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 45163 + }, + { + "epoch": 0.3938881233538574, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 45164 + }, + { + "epoch": 0.393896844639026, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 45165 + }, + { + "epoch": 0.39390556592419457, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 45166 + }, + { + "epoch": 0.39391428720936317, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 45167 + }, + { + "epoch": 0.39392300849453177, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 45168 + }, + { + "epoch": 0.3939317297797003, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 45169 + }, + { + "epoch": 0.3939404510648689, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 45170 + }, + { + "epoch": 0.3939491723500375, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 45171 + }, + { + "epoch": 0.39395789363520606, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 45172 + }, + { + "epoch": 0.39396661492037466, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 45173 + }, + { + "epoch": 0.39397533620554326, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 45174 + }, + { + "epoch": 0.39398405749071186, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 45175 + }, + { + "epoch": 0.3939927787758804, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 45176 + }, + { + "epoch": 0.394001500061049, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 45177 + }, + { + "epoch": 0.3940102213462176, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 45178 + }, + { + "epoch": 0.39401894263138615, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 45179 + }, + { + "epoch": 0.39402766391655475, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 45180 + }, + { + "epoch": 0.39403638520172335, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 45181 + }, + { + "epoch": 0.3940451064868919, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 45182 + }, + { + "epoch": 0.3940538277720605, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 45183 + }, + { + "epoch": 0.3940625490572291, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 45184 + }, + { + "epoch": 0.39407127034239764, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 45185 + }, + { + "epoch": 0.39407999162756624, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 45186 + }, + { + "epoch": 0.39408871291273484, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 45187 + }, + { + "epoch": 0.3940974341979034, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 45188 + }, + { + "epoch": 0.394106155483072, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 45189 + }, + { + "epoch": 0.3941148767682406, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 45190 + }, + { + "epoch": 0.39412359805340913, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 45191 + }, + { + "epoch": 0.39413231933857773, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 45192 + }, + { + "epoch": 0.39414104062374633, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 45193 + }, + { + "epoch": 0.3941497619089149, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 45194 + }, + { + "epoch": 0.3941584831940835, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 45195 + }, + { + "epoch": 0.3941672044792521, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 45196 + }, + { + "epoch": 0.3941759257644206, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 45197 + }, + { + "epoch": 0.3941846470495892, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 45198 + }, + { + "epoch": 0.3941933683347578, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 45199 + }, + { + "epoch": 0.39420208961992637, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 45200 + }, + { + "epoch": 0.39421081090509497, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 45201 + }, + { + "epoch": 0.39421953219026357, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 45202 + }, + { + "epoch": 0.39422825347543217, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 45203 + }, + { + "epoch": 0.3942369747606007, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 45204 + }, + { + "epoch": 0.3942456960457693, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 45205 + }, + { + "epoch": 0.3942544173309379, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 45206 + }, + { + "epoch": 0.39426313861610646, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 45207 + }, + { + "epoch": 0.39427185990127506, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 45208 + }, + { + "epoch": 0.39428058118644366, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 45209 + }, + { + "epoch": 0.3942893024716122, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 45210 + }, + { + "epoch": 0.3942980237567808, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 45211 + }, + { + "epoch": 0.3943067450419494, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 45212 + }, + { + "epoch": 0.39431546632711795, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 45213 + }, + { + "epoch": 0.39432418761228655, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 45214 + }, + { + "epoch": 0.39433290889745515, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 45215 + }, + { + "epoch": 0.3943416301826237, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 45216 + }, + { + "epoch": 0.3943503514677923, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 45217 + }, + { + "epoch": 0.3943590727529609, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 45218 + }, + { + "epoch": 0.39436779403812944, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 45219 + }, + { + "epoch": 0.39437651532329804, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 45220 + }, + { + "epoch": 0.39438523660846664, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 45221 + }, + { + "epoch": 0.3943939578936352, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 45222 + }, + { + "epoch": 0.3944026791788038, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 45223 + }, + { + "epoch": 0.3944114004639724, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 45224 + }, + { + "epoch": 0.39442012174914093, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 45225 + }, + { + "epoch": 0.39442884303430953, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 45226 + }, + { + "epoch": 0.39443756431947813, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 45227 + }, + { + "epoch": 0.3944462856046467, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 45228 + }, + { + "epoch": 0.3944550068898153, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 45229 + }, + { + "epoch": 0.3944637281749839, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 45230 + }, + { + "epoch": 0.3944724494601525, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 45231 + }, + { + "epoch": 0.394481170745321, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 45232 + }, + { + "epoch": 0.3944898920304896, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 45233 + }, + { + "epoch": 0.3944986133156582, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 45234 + }, + { + "epoch": 0.39450733460082676, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 45235 + }, + { + "epoch": 0.39451605588599536, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 45236 + }, + { + "epoch": 0.39452477717116397, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 45237 + }, + { + "epoch": 0.3945334984563325, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 45238 + }, + { + "epoch": 0.3945422197415011, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 45239 + }, + { + "epoch": 0.3945509410266697, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 45240 + }, + { + "epoch": 0.39455966231183826, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 45241 + }, + { + "epoch": 0.39456838359700686, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 45242 + }, + { + "epoch": 0.39457710488217546, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 45243 + }, + { + "epoch": 0.394585826167344, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 45244 + }, + { + "epoch": 0.3945945474525126, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 45245 + }, + { + "epoch": 0.3946032687376812, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 45246 + }, + { + "epoch": 0.39461199002284975, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 45247 + }, + { + "epoch": 0.39462071130801835, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 45248 + }, + { + "epoch": 0.39462943259318695, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 45249 + }, + { + "epoch": 0.3946381538783555, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 45250 + }, + { + "epoch": 0.3946468751635241, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 45251 + }, + { + "epoch": 0.3946555964486927, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 45252 + }, + { + "epoch": 0.39466431773386124, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 45253 + }, + { + "epoch": 0.39467303901902984, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 45254 + }, + { + "epoch": 0.39468176030419844, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 45255 + }, + { + "epoch": 0.394690481589367, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 45256 + }, + { + "epoch": 0.3946992028745356, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 45257 + }, + { + "epoch": 0.3947079241597042, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 45258 + }, + { + "epoch": 0.3947166454448728, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 45259 + }, + { + "epoch": 0.3947253667300413, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 45260 + }, + { + "epoch": 0.3947340880152099, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 45261 + }, + { + "epoch": 0.3947428093003785, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 45262 + }, + { + "epoch": 0.3947515305855471, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 45263 + }, + { + "epoch": 0.3947602518707157, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 45264 + }, + { + "epoch": 0.3947689731558843, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 45265 + }, + { + "epoch": 0.3947776944410528, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 45266 + }, + { + "epoch": 0.3947864157262214, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 45267 + }, + { + "epoch": 0.39479513701139, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9862, + "step": 45268 + }, + { + "epoch": 0.39480385829655856, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 45269 + }, + { + "epoch": 0.39481257958172716, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 45270 + }, + { + "epoch": 0.39482130086689576, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 45271 + }, + { + "epoch": 0.3948300221520643, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 45272 + }, + { + "epoch": 0.3948387434372329, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 45273 + }, + { + "epoch": 0.3948474647224015, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 45274 + }, + { + "epoch": 0.39485618600757005, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 45275 + }, + { + "epoch": 0.39486490729273865, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 45276 + }, + { + "epoch": 0.39487362857790725, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 45277 + }, + { + "epoch": 0.3948823498630758, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 45278 + }, + { + "epoch": 0.3948910711482444, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 45279 + }, + { + "epoch": 0.394899792433413, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 45280 + }, + { + "epoch": 0.39490851371858154, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 45281 + }, + { + "epoch": 0.39491723500375014, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 45282 + }, + { + "epoch": 0.39492595628891874, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 45283 + }, + { + "epoch": 0.39493467757408734, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 45284 + }, + { + "epoch": 0.3949433988592559, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 45285 + }, + { + "epoch": 0.3949521201444245, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 45286 + }, + { + "epoch": 0.3949608414295931, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 45287 + }, + { + "epoch": 0.39496956271476164, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 45288 + }, + { + "epoch": 0.39497828399993024, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 45289 + }, + { + "epoch": 0.39498700528509884, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 45290 + }, + { + "epoch": 0.3949957265702674, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 45291 + }, + { + "epoch": 0.395004447855436, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 45292 + }, + { + "epoch": 0.3950131691406046, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 45293 + }, + { + "epoch": 0.3950218904257731, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 45294 + }, + { + "epoch": 0.3950306117109417, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 45295 + }, + { + "epoch": 0.3950393329961103, + "grad_norm": 0.2578125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 45296 + }, + { + "epoch": 0.39504805428127887, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 45297 + }, + { + "epoch": 0.39505677556644747, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 45298 + }, + { + "epoch": 0.39506549685161607, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 45299 + }, + { + "epoch": 0.3950742181367846, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 45300 + }, + { + "epoch": 0.3950829394219532, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 45301 + }, + { + "epoch": 0.3950916607071218, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 45302 + }, + { + "epoch": 0.39510038199229036, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 45303 + }, + { + "epoch": 0.39510910327745896, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 45304 + }, + { + "epoch": 0.39511782456262756, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 45305 + }, + { + "epoch": 0.3951265458477961, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 45306 + }, + { + "epoch": 0.3951352671329647, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 45307 + }, + { + "epoch": 0.3951439884181333, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 45308 + }, + { + "epoch": 0.39515270970330185, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 45309 + }, + { + "epoch": 0.39516143098847045, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 45310 + }, + { + "epoch": 0.39517015227363905, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 45311 + }, + { + "epoch": 0.39517887355880765, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 45312 + }, + { + "epoch": 0.3951875948439762, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 45313 + }, + { + "epoch": 0.3951963161291448, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 45314 + }, + { + "epoch": 0.3952050374143134, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 45315 + }, + { + "epoch": 0.39521375869948194, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 45316 + }, + { + "epoch": 0.39522247998465054, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 45317 + }, + { + "epoch": 0.39523120126981914, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 45318 + }, + { + "epoch": 0.3952399225549877, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 45319 + }, + { + "epoch": 0.3952486438401563, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 45320 + }, + { + "epoch": 0.3952573651253249, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 45321 + }, + { + "epoch": 0.39526608641049343, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 45322 + }, + { + "epoch": 0.39527480769566203, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 45323 + }, + { + "epoch": 0.39528352898083063, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 45324 + }, + { + "epoch": 0.3952922502659992, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 45325 + }, + { + "epoch": 0.3953009715511678, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 45326 + }, + { + "epoch": 0.3953096928363364, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 45327 + }, + { + "epoch": 0.3953184141215049, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 45328 + }, + { + "epoch": 0.3953271354066735, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 45329 + }, + { + "epoch": 0.3953358566918421, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 45330 + }, + { + "epoch": 0.39534457797701067, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 45331 + }, + { + "epoch": 0.39535329926217927, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 45332 + }, + { + "epoch": 0.39536202054734787, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 45333 + }, + { + "epoch": 0.3953707418325164, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 45334 + }, + { + "epoch": 0.395379463117685, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 45335 + }, + { + "epoch": 0.3953881844028536, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 45336 + }, + { + "epoch": 0.39539690568802216, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 45337 + }, + { + "epoch": 0.39540562697319076, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 45338 + }, + { + "epoch": 0.39541434825835936, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 45339 + }, + { + "epoch": 0.39542306954352796, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 45340 + }, + { + "epoch": 0.3954317908286965, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 45341 + }, + { + "epoch": 0.3954405121138651, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 45342 + }, + { + "epoch": 0.3954492333990337, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 45343 + }, + { + "epoch": 0.39545795468420225, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9805, + "step": 45344 + }, + { + "epoch": 0.39546667596937085, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 45345 + }, + { + "epoch": 0.39547539725453945, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 45346 + }, + { + "epoch": 0.395484118539708, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 45347 + }, + { + "epoch": 0.3954928398248766, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 45348 + }, + { + "epoch": 0.3955015611100452, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 45349 + }, + { + "epoch": 0.39551028239521374, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 45350 + }, + { + "epoch": 0.39551900368038234, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 45351 + }, + { + "epoch": 0.39552772496555094, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 45352 + }, + { + "epoch": 0.3955364462507195, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 45353 + }, + { + "epoch": 0.3955451675358881, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 45354 + }, + { + "epoch": 0.3955538888210567, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 45355 + }, + { + "epoch": 0.39556261010622523, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.98, + "step": 45356 + }, + { + "epoch": 0.39557133139139383, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 45357 + }, + { + "epoch": 0.39558005267656243, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 45358 + }, + { + "epoch": 0.395588773961731, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 45359 + }, + { + "epoch": 0.3955974952468996, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0326, + "step": 45360 + }, + { + "epoch": 0.3956062165320682, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 45361 + }, + { + "epoch": 0.3956149378172367, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 45362 + }, + { + "epoch": 0.3956236591024053, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 45363 + }, + { + "epoch": 0.3956323803875739, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 45364 + }, + { + "epoch": 0.39564110167274247, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 45365 + }, + { + "epoch": 0.39564982295791107, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 45366 + }, + { + "epoch": 0.39565854424307967, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 45367 + }, + { + "epoch": 0.39566726552824827, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 45368 + }, + { + "epoch": 0.3956759868134168, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 45369 + }, + { + "epoch": 0.3956847080985854, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 45370 + }, + { + "epoch": 0.395693429383754, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 45371 + }, + { + "epoch": 0.39570215066892256, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 45372 + }, + { + "epoch": 0.39571087195409116, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 45373 + }, + { + "epoch": 0.39571959323925976, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 45374 + }, + { + "epoch": 0.3957283145244283, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 45375 + }, + { + "epoch": 0.3957370358095969, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 45376 + }, + { + "epoch": 0.3957457570947655, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 45377 + }, + { + "epoch": 0.39575447837993405, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 45378 + }, + { + "epoch": 0.39576319966510265, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 45379 + }, + { + "epoch": 0.39577192095027125, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 45380 + }, + { + "epoch": 0.3957806422354398, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 45381 + }, + { + "epoch": 0.3957893635206084, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 45382 + }, + { + "epoch": 0.395798084805777, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 45383 + }, + { + "epoch": 0.39580680609094554, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 45384 + }, + { + "epoch": 0.39581552737611414, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 45385 + }, + { + "epoch": 0.39582424866128274, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 45386 + }, + { + "epoch": 0.3958329699464513, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 45387 + }, + { + "epoch": 0.3958416912316199, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 45388 + }, + { + "epoch": 0.3958504125167885, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 45389 + }, + { + "epoch": 0.39585913380195703, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 45390 + }, + { + "epoch": 0.39586785508712563, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 45391 + }, + { + "epoch": 0.39587657637229423, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 45392 + }, + { + "epoch": 0.39588529765746283, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 45393 + }, + { + "epoch": 0.3958940189426314, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 45394 + }, + { + "epoch": 0.3959027402278, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 45395 + }, + { + "epoch": 0.3959114615129686, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 45396 + }, + { + "epoch": 0.3959201827981371, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 45397 + }, + { + "epoch": 0.3959289040833057, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 45398 + }, + { + "epoch": 0.3959376253684743, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 45399 + }, + { + "epoch": 0.39594634665364287, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 45400 + }, + { + "epoch": 0.39595506793881147, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 45401 + }, + { + "epoch": 0.39596378922398007, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 45402 + }, + { + "epoch": 0.3959725105091486, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 45403 + }, + { + "epoch": 0.3959812317943172, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 45404 + }, + { + "epoch": 0.3959899530794858, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 45405 + }, + { + "epoch": 0.39599867436465436, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 45406 + }, + { + "epoch": 0.39600739564982296, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 45407 + }, + { + "epoch": 0.39601611693499156, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 45408 + }, + { + "epoch": 0.3960248382201601, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 45409 + }, + { + "epoch": 0.3960335595053287, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 45410 + }, + { + "epoch": 0.3960422807904973, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 45411 + }, + { + "epoch": 0.39605100207566585, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 45412 + }, + { + "epoch": 0.39605972336083445, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 45413 + }, + { + "epoch": 0.39606844464600305, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 45414 + }, + { + "epoch": 0.3960771659311716, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 45415 + }, + { + "epoch": 0.3960858872163402, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 45416 + }, + { + "epoch": 0.3960946085015088, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 45417 + }, + { + "epoch": 0.39610332978667734, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 45418 + }, + { + "epoch": 0.39611205107184594, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 45419 + }, + { + "epoch": 0.39612077235701454, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 45420 + }, + { + "epoch": 0.39612949364218314, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 45421 + }, + { + "epoch": 0.3961382149273517, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 45422 + }, + { + "epoch": 0.3961469362125203, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 45423 + }, + { + "epoch": 0.3961556574976889, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 45424 + }, + { + "epoch": 0.39616437878285743, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 45425 + }, + { + "epoch": 0.39617310006802603, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 45426 + }, + { + "epoch": 0.39618182135319463, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 45427 + }, + { + "epoch": 0.3961905426383632, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 45428 + }, + { + "epoch": 0.3961992639235318, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 45429 + }, + { + "epoch": 0.3962079852087004, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 45430 + }, + { + "epoch": 0.3962167064938689, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 45431 + }, + { + "epoch": 0.3962254277790375, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 45432 + }, + { + "epoch": 0.3962341490642061, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 45433 + }, + { + "epoch": 0.39624287034937467, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 45434 + }, + { + "epoch": 0.39625159163454327, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 45435 + }, + { + "epoch": 0.39626031291971187, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 45436 + }, + { + "epoch": 0.3962690342048804, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 45437 + }, + { + "epoch": 0.396277755490049, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 45438 + }, + { + "epoch": 0.3962864767752176, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 45439 + }, + { + "epoch": 0.39629519806038616, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 45440 + }, + { + "epoch": 0.39630391934555476, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 45441 + }, + { + "epoch": 0.39631264063072336, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9729, + "step": 45442 + }, + { + "epoch": 0.3963213619158919, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 45443 + }, + { + "epoch": 0.3963300832010605, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 45444 + }, + { + "epoch": 0.3963388044862291, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 45445 + }, + { + "epoch": 0.39634752577139765, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 45446 + }, + { + "epoch": 0.39635624705656625, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 45447 + }, + { + "epoch": 0.39636496834173485, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 45448 + }, + { + "epoch": 0.39637368962690345, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 45449 + }, + { + "epoch": 0.396382410912072, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 45450 + }, + { + "epoch": 0.3963911321972406, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 45451 + }, + { + "epoch": 0.3963998534824092, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 45452 + }, + { + "epoch": 0.39640857476757774, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 45453 + }, + { + "epoch": 0.39641729605274634, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 45454 + }, + { + "epoch": 0.39642601733791494, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 45455 + }, + { + "epoch": 0.3964347386230835, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 45456 + }, + { + "epoch": 0.3964434599082521, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 45457 + }, + { + "epoch": 0.3964521811934207, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 45458 + }, + { + "epoch": 0.3964609024785892, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 45459 + }, + { + "epoch": 0.39646962376375783, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 45460 + }, + { + "epoch": 0.39647834504892643, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 45461 + }, + { + "epoch": 0.396487066334095, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 45462 + }, + { + "epoch": 0.3964957876192636, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 45463 + }, + { + "epoch": 0.3965045089044322, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 45464 + }, + { + "epoch": 0.3965132301896007, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 45465 + }, + { + "epoch": 0.3965219514747693, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 45466 + }, + { + "epoch": 0.3965306727599379, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 45467 + }, + { + "epoch": 0.39653939404510646, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 45468 + }, + { + "epoch": 0.39654811533027506, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9809, + "step": 45469 + }, + { + "epoch": 0.39655683661544366, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.968, + "step": 45470 + }, + { + "epoch": 0.3965655579006122, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 45471 + }, + { + "epoch": 0.3965742791857808, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 45472 + }, + { + "epoch": 0.3965830004709494, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 45473 + }, + { + "epoch": 0.39659172175611795, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 45474 + }, + { + "epoch": 0.39660044304128655, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 45475 + }, + { + "epoch": 0.39660916432645515, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 45476 + }, + { + "epoch": 0.39661788561162375, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 45477 + }, + { + "epoch": 0.3966266068967923, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 45478 + }, + { + "epoch": 0.3966353281819609, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 45479 + }, + { + "epoch": 0.3966440494671295, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 45480 + }, + { + "epoch": 0.39665277075229805, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 45481 + }, + { + "epoch": 0.39666149203746665, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 45482 + }, + { + "epoch": 0.39667021332263525, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 45483 + }, + { + "epoch": 0.3966789346078038, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 45484 + }, + { + "epoch": 0.3966876558929724, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 45485 + }, + { + "epoch": 0.396696377178141, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 45486 + }, + { + "epoch": 0.39670509846330954, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 45487 + }, + { + "epoch": 0.39671381974847814, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 45488 + }, + { + "epoch": 0.39672254103364674, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 45489 + }, + { + "epoch": 0.3967312623188153, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 45490 + }, + { + "epoch": 0.3967399836039839, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 45491 + }, + { + "epoch": 0.3967487048891525, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 45492 + }, + { + "epoch": 0.396757426174321, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 45493 + }, + { + "epoch": 0.3967661474594896, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 45494 + }, + { + "epoch": 0.3967748687446582, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 45495 + }, + { + "epoch": 0.39678359002982677, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 45496 + }, + { + "epoch": 0.39679231131499537, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 45497 + }, + { + "epoch": 0.39680103260016397, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 45498 + }, + { + "epoch": 0.3968097538853325, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 45499 + }, + { + "epoch": 0.3968184751705011, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 45500 + }, + { + "epoch": 0.3968271964556697, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 45501 + }, + { + "epoch": 0.3968359177408383, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 45502 + }, + { + "epoch": 0.39684463902600686, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 45503 + }, + { + "epoch": 0.39685336031117546, + "grad_norm": 0.330078125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 45504 + }, + { + "epoch": 0.39686208159634406, + "grad_norm": 0.32421875, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 45505 + }, + { + "epoch": 0.3968708028815126, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 45506 + }, + { + "epoch": 0.3968795241666812, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 45507 + }, + { + "epoch": 0.3968882454518498, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 45508 + }, + { + "epoch": 0.39689696673701835, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 45509 + }, + { + "epoch": 0.39690568802218695, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 45510 + }, + { + "epoch": 0.39691440930735555, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 45511 + }, + { + "epoch": 0.3969231305925241, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 45512 + }, + { + "epoch": 0.3969318518776927, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 45513 + }, + { + "epoch": 0.3969405731628613, + "grad_norm": 0.2255859375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 45514 + }, + { + "epoch": 0.39694929444802984, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 45515 + }, + { + "epoch": 0.39695801573319844, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 45516 + }, + { + "epoch": 0.39696673701836704, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 45517 + }, + { + "epoch": 0.3969754583035356, + "grad_norm": 0.29296875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 45518 + }, + { + "epoch": 0.3969841795887042, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 45519 + }, + { + "epoch": 0.3969929008738728, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 45520 + }, + { + "epoch": 0.39700162215904133, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 45521 + }, + { + "epoch": 0.39701034344420993, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 45522 + }, + { + "epoch": 0.39701906472937853, + "grad_norm": 0.298828125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 45523 + }, + { + "epoch": 0.3970277860145471, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 45524 + }, + { + "epoch": 0.3970365072997157, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 45525 + }, + { + "epoch": 0.3970452285848843, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 0.9831, + "step": 45526 + }, + { + "epoch": 0.3970539498700528, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 45527 + }, + { + "epoch": 0.3970626711552214, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 45528 + }, + { + "epoch": 0.39707139244039, + "grad_norm": 0.396484375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 45529 + }, + { + "epoch": 0.3970801137255586, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 45530 + }, + { + "epoch": 0.39708883501072717, + "grad_norm": 0.43359375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 45531 + }, + { + "epoch": 0.39709755629589577, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 45532 + }, + { + "epoch": 0.39710627758106437, + "grad_norm": 0.5234375, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 45533 + }, + { + "epoch": 0.3971149988662329, + "grad_norm": 0.33984375, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 45534 + }, + { + "epoch": 0.3971237201514015, + "grad_norm": 0.62890625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 45535 + }, + { + "epoch": 0.3971324414365701, + "grad_norm": 0.408203125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 45536 + }, + { + "epoch": 0.39714116272173866, + "grad_norm": 0.5, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 45537 + }, + { + "epoch": 0.39714988400690726, + "grad_norm": 0.546875, + "learning_rate": 0.0005, + "loss": 0.9743, + "step": 45538 + }, + { + "epoch": 0.39715860529207586, + "grad_norm": 0.59375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 45539 + }, + { + "epoch": 0.3971673265772444, + "grad_norm": 0.546875, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 45540 + }, + { + "epoch": 0.397176047862413, + "grad_norm": 0.71875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 45541 + }, + { + "epoch": 0.3971847691475816, + "grad_norm": 0.5859375, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 45542 + }, + { + "epoch": 0.39719349043275015, + "grad_norm": 0.59375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 45543 + }, + { + "epoch": 0.39720221171791875, + "grad_norm": 0.6640625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 45544 + }, + { + "epoch": 0.39721093300308735, + "grad_norm": 0.35546875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 45545 + }, + { + "epoch": 0.3972196542882559, + "grad_norm": 0.5390625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 45546 + }, + { + "epoch": 0.3972283755734245, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 45547 + }, + { + "epoch": 0.3972370968585931, + "grad_norm": 0.498046875, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 45548 + }, + { + "epoch": 0.39724581814376164, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 45549 + }, + { + "epoch": 0.39725453942893024, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 45550 + }, + { + "epoch": 0.39726326071409884, + "grad_norm": 0.36328125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 45551 + }, + { + "epoch": 0.3972719819992674, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 45552 + }, + { + "epoch": 0.397280703284436, + "grad_norm": 0.494140625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 45553 + }, + { + "epoch": 0.3972894245696046, + "grad_norm": 0.3125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 45554 + }, + { + "epoch": 0.39729814585477313, + "grad_norm": 0.5, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 45555 + }, + { + "epoch": 0.39730686713994173, + "grad_norm": 0.384765625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 45556 + }, + { + "epoch": 0.39731558842511033, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 45557 + }, + { + "epoch": 0.39732430971027893, + "grad_norm": 0.365234375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 45558 + }, + { + "epoch": 0.3973330309954475, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 45559 + }, + { + "epoch": 0.3973417522806161, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 1.0298, + "step": 45560 + }, + { + "epoch": 0.3973504735657847, + "grad_norm": 0.283203125, + "learning_rate": 0.0005, + "loss": 0.9863, + "step": 45561 + }, + { + "epoch": 0.3973591948509532, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 45562 + }, + { + "epoch": 0.3973679161361218, + "grad_norm": 0.2373046875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 45563 + }, + { + "epoch": 0.3973766374212904, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 45564 + }, + { + "epoch": 0.39738535870645897, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 45565 + }, + { + "epoch": 0.39739407999162757, + "grad_norm": 0.33203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 45566 + }, + { + "epoch": 0.39740280127679617, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 45567 + }, + { + "epoch": 0.3974115225619647, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 45568 + }, + { + "epoch": 0.3974202438471333, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 45569 + }, + { + "epoch": 0.3974289651323019, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 45570 + }, + { + "epoch": 0.39743768641747046, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 45571 + }, + { + "epoch": 0.39744640770263906, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 45572 + }, + { + "epoch": 0.39745512898780766, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 45573 + }, + { + "epoch": 0.3974638502729762, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 45574 + }, + { + "epoch": 0.3974725715581448, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 45575 + }, + { + "epoch": 0.3974812928433134, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 45576 + }, + { + "epoch": 0.39749001412848195, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 45577 + }, + { + "epoch": 0.39749873541365055, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 45578 + }, + { + "epoch": 0.39750745669881915, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 45579 + }, + { + "epoch": 0.3975161779839877, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 45580 + }, + { + "epoch": 0.3975248992691563, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9798, + "step": 45581 + }, + { + "epoch": 0.3975336205543249, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 45582 + }, + { + "epoch": 0.39754234183949344, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 45583 + }, + { + "epoch": 0.39755106312466204, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 45584 + }, + { + "epoch": 0.39755978440983064, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 45585 + }, + { + "epoch": 0.39756850569499924, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 45586 + }, + { + "epoch": 0.3975772269801678, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 45587 + }, + { + "epoch": 0.3975859482653364, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 45588 + }, + { + "epoch": 0.397594669550505, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 45589 + }, + { + "epoch": 0.39760339083567353, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 45590 + }, + { + "epoch": 0.39761211212084213, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 45591 + }, + { + "epoch": 0.39762083340601073, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 45592 + }, + { + "epoch": 0.3976295546911793, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 45593 + }, + { + "epoch": 0.3976382759763479, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0313, + "step": 45594 + }, + { + "epoch": 0.3976469972615165, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 45595 + }, + { + "epoch": 0.397655718546685, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 45596 + }, + { + "epoch": 0.3976644398318536, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 45597 + }, + { + "epoch": 0.3976731611170222, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 45598 + }, + { + "epoch": 0.39768188240219077, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 45599 + }, + { + "epoch": 0.39769060368735937, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 45600 + }, + { + "epoch": 0.39769932497252797, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 45601 + }, + { + "epoch": 0.3977080462576965, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 45602 + }, + { + "epoch": 0.3977167675428651, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 45603 + }, + { + "epoch": 0.3977254888280337, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 45604 + }, + { + "epoch": 0.39773421011320226, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 45605 + }, + { + "epoch": 0.39774293139837086, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 45606 + }, + { + "epoch": 0.39775165268353946, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 45607 + }, + { + "epoch": 0.397760373968708, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 45608 + }, + { + "epoch": 0.3977690952538766, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 45609 + }, + { + "epoch": 0.3977778165390452, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 45610 + }, + { + "epoch": 0.39778653782421375, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 45611 + }, + { + "epoch": 0.39779525910938235, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 45612 + }, + { + "epoch": 0.39780398039455095, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 45613 + }, + { + "epoch": 0.39781270167971955, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 45614 + }, + { + "epoch": 0.3978214229648881, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 45615 + }, + { + "epoch": 0.3978301442500567, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.031, + "step": 45616 + }, + { + "epoch": 0.3978388655352253, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 45617 + }, + { + "epoch": 0.39784758682039384, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 45618 + }, + { + "epoch": 0.39785630810556244, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 45619 + }, + { + "epoch": 0.39786502939073104, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 45620 + }, + { + "epoch": 0.3978737506758996, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 45621 + }, + { + "epoch": 0.3978824719610682, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 45622 + }, + { + "epoch": 0.3978911932462368, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 45623 + }, + { + "epoch": 0.39789991453140533, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 45624 + }, + { + "epoch": 0.39790863581657393, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 45625 + }, + { + "epoch": 0.39791735710174253, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 45626 + }, + { + "epoch": 0.3979260783869111, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 45627 + }, + { + "epoch": 0.3979347996720797, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 45628 + }, + { + "epoch": 0.3979435209572483, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 45629 + }, + { + "epoch": 0.3979522422424168, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 45630 + }, + { + "epoch": 0.3979609635275854, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 45631 + }, + { + "epoch": 0.397969684812754, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 45632 + }, + { + "epoch": 0.39797840609792257, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 45633 + }, + { + "epoch": 0.39798712738309117, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 45634 + }, + { + "epoch": 0.39799584866825977, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 45635 + }, + { + "epoch": 0.3980045699534283, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 45636 + }, + { + "epoch": 0.3980132912385969, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 45637 + }, + { + "epoch": 0.3980220125237655, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 45638 + }, + { + "epoch": 0.3980307338089341, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 45639 + }, + { + "epoch": 0.39803945509410266, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 45640 + }, + { + "epoch": 0.39804817637927126, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 45641 + }, + { + "epoch": 0.39805689766443986, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 45642 + }, + { + "epoch": 0.3980656189496084, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 45643 + }, + { + "epoch": 0.398074340234777, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 45644 + }, + { + "epoch": 0.3980830615199456, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 45645 + }, + { + "epoch": 0.39809178280511415, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 45646 + }, + { + "epoch": 0.39810050409028275, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 45647 + }, + { + "epoch": 0.39810922537545135, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 45648 + }, + { + "epoch": 0.3981179466606199, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 45649 + }, + { + "epoch": 0.3981266679457885, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 45650 + }, + { + "epoch": 0.3981353892309571, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 45651 + }, + { + "epoch": 0.39814411051612564, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 45652 + }, + { + "epoch": 0.39815283180129424, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 45653 + }, + { + "epoch": 0.39816155308646284, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 45654 + }, + { + "epoch": 0.3981702743716314, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 45655 + }, + { + "epoch": 0.3981789956568, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0335, + "step": 45656 + }, + { + "epoch": 0.3981877169419686, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 45657 + }, + { + "epoch": 0.39819643822713713, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 45658 + }, + { + "epoch": 0.39820515951230573, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 45659 + }, + { + "epoch": 0.39821388079747433, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 45660 + }, + { + "epoch": 0.3982226020826429, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 45661 + }, + { + "epoch": 0.3982313233678115, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 45662 + }, + { + "epoch": 0.3982400446529801, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 45663 + }, + { + "epoch": 0.3982487659381486, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 45664 + }, + { + "epoch": 0.3982574872233172, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 45665 + }, + { + "epoch": 0.3982662085084858, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 45666 + }, + { + "epoch": 0.3982749297936544, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 45667 + }, + { + "epoch": 0.39828365107882296, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 45668 + }, + { + "epoch": 0.39829237236399156, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 45669 + }, + { + "epoch": 0.39830109364916017, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 45670 + }, + { + "epoch": 0.3983098149343287, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 45671 + }, + { + "epoch": 0.3983185362194973, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 45672 + }, + { + "epoch": 0.3983272575046659, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 45673 + }, + { + "epoch": 0.39833597878983446, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 45674 + }, + { + "epoch": 0.39834470007500306, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 45675 + }, + { + "epoch": 0.39835342136017166, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 45676 + }, + { + "epoch": 0.3983621426453402, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 45677 + }, + { + "epoch": 0.3983708639305088, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 45678 + }, + { + "epoch": 0.3983795852156774, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 45679 + }, + { + "epoch": 0.39838830650084595, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 45680 + }, + { + "epoch": 0.39839702778601455, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.976, + "step": 45681 + }, + { + "epoch": 0.39840574907118315, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 45682 + }, + { + "epoch": 0.3984144703563517, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 45683 + }, + { + "epoch": 0.3984231916415203, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 45684 + }, + { + "epoch": 0.3984319129266889, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 45685 + }, + { + "epoch": 0.39844063421185744, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 45686 + }, + { + "epoch": 0.39844935549702604, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 45687 + }, + { + "epoch": 0.39845807678219464, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 45688 + }, + { + "epoch": 0.3984667980673632, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 45689 + }, + { + "epoch": 0.3984755193525318, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 45690 + }, + { + "epoch": 0.3984842406377004, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 45691 + }, + { + "epoch": 0.3984929619228689, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 45692 + }, + { + "epoch": 0.3985016832080375, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 45693 + }, + { + "epoch": 0.3985104044932061, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 45694 + }, + { + "epoch": 0.3985191257783747, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 0.9861, + "step": 45695 + }, + { + "epoch": 0.3985278470635433, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 45696 + }, + { + "epoch": 0.3985365683487119, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 45697 + }, + { + "epoch": 0.3985452896338805, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 45698 + }, + { + "epoch": 0.398554010919049, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 45699 + }, + { + "epoch": 0.3985627322042176, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9862, + "step": 45700 + }, + { + "epoch": 0.3985714534893862, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 45701 + }, + { + "epoch": 0.39858017477455476, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 45702 + }, + { + "epoch": 0.39858889605972336, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 45703 + }, + { + "epoch": 0.39859761734489196, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 45704 + }, + { + "epoch": 0.3986063386300605, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 45705 + }, + { + "epoch": 0.3986150599152291, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 45706 + }, + { + "epoch": 0.3986237812003977, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 45707 + }, + { + "epoch": 0.39863250248556625, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 45708 + }, + { + "epoch": 0.39864122377073485, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 45709 + }, + { + "epoch": 0.39864994505590345, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 45710 + }, + { + "epoch": 0.398658666341072, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 45711 + }, + { + "epoch": 0.3986673876262406, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 45712 + }, + { + "epoch": 0.3986761089114092, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 45713 + }, + { + "epoch": 0.39868483019657774, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 45714 + }, + { + "epoch": 0.39869355148174634, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 45715 + }, + { + "epoch": 0.39870227276691494, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 45716 + }, + { + "epoch": 0.3987109940520835, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 45717 + }, + { + "epoch": 0.3987197153372521, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 45718 + }, + { + "epoch": 0.3987284366224207, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 45719 + }, + { + "epoch": 0.39873715790758923, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 45720 + }, + { + "epoch": 0.39874587919275783, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 45721 + }, + { + "epoch": 0.39875460047792644, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 45722 + }, + { + "epoch": 0.39876332176309504, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 45723 + }, + { + "epoch": 0.3987720430482636, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 45724 + }, + { + "epoch": 0.3987807643334322, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 45725 + }, + { + "epoch": 0.3987894856186008, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 45726 + }, + { + "epoch": 0.3987982069037693, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 45727 + }, + { + "epoch": 0.3988069281889379, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 45728 + }, + { + "epoch": 0.3988156494741065, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 45729 + }, + { + "epoch": 0.39882437075927507, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 45730 + }, + { + "epoch": 0.39883309204444367, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 45731 + }, + { + "epoch": 0.39884181332961227, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 45732 + }, + { + "epoch": 0.3988505346147808, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 45733 + }, + { + "epoch": 0.3988592558999494, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 45734 + }, + { + "epoch": 0.398867977185118, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 45735 + }, + { + "epoch": 0.39887669847028656, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 45736 + }, + { + "epoch": 0.39888541975545516, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 45737 + }, + { + "epoch": 0.39889414104062376, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 45738 + }, + { + "epoch": 0.3989028623257923, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 45739 + }, + { + "epoch": 0.3989115836109609, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 45740 + }, + { + "epoch": 0.3989203048961295, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 45741 + }, + { + "epoch": 0.39892902618129805, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9785, + "step": 45742 + }, + { + "epoch": 0.39893774746646665, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 45743 + }, + { + "epoch": 0.39894646875163525, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 45744 + }, + { + "epoch": 0.3989551900368038, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 45745 + }, + { + "epoch": 0.3989639113219724, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 45746 + }, + { + "epoch": 0.398972632607141, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 45747 + }, + { + "epoch": 0.3989813538923096, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 45748 + }, + { + "epoch": 0.39899007517747814, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 45749 + }, + { + "epoch": 0.39899879646264674, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 45750 + }, + { + "epoch": 0.39900751774781534, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 45751 + }, + { + "epoch": 0.3990162390329839, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 45752 + }, + { + "epoch": 0.3990249603181525, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 45753 + }, + { + "epoch": 0.3990336816033211, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 45754 + }, + { + "epoch": 0.39904240288848963, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 45755 + }, + { + "epoch": 0.39905112417365823, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 45756 + }, + { + "epoch": 0.39905984545882683, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 45757 + }, + { + "epoch": 0.3990685667439954, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 45758 + }, + { + "epoch": 0.399077288029164, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 45759 + }, + { + "epoch": 0.3990860093143326, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 45760 + }, + { + "epoch": 0.3990947305995011, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 45761 + }, + { + "epoch": 0.3991034518846697, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 45762 + }, + { + "epoch": 0.3991121731698383, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 45763 + }, + { + "epoch": 0.39912089445500687, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 45764 + }, + { + "epoch": 0.39912961574017547, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 45765 + }, + { + "epoch": 0.39913833702534407, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 45766 + }, + { + "epoch": 0.3991470583105126, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 45767 + }, + { + "epoch": 0.3991557795956812, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 45768 + }, + { + "epoch": 0.3991645008808498, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 45769 + }, + { + "epoch": 0.39917322216601836, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 45770 + }, + { + "epoch": 0.39918194345118696, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 45771 + }, + { + "epoch": 0.39919066473635556, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 45772 + }, + { + "epoch": 0.3991993860215241, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 45773 + }, + { + "epoch": 0.3992081073066927, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 45774 + }, + { + "epoch": 0.3992168285918613, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 45775 + }, + { + "epoch": 0.3992255498770299, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 45776 + }, + { + "epoch": 0.39923427116219845, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 45777 + }, + { + "epoch": 0.39924299244736705, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 45778 + }, + { + "epoch": 0.39925171373253565, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 45779 + }, + { + "epoch": 0.3992604350177042, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 45780 + }, + { + "epoch": 0.3992691563028728, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 45781 + }, + { + "epoch": 0.3992778775880414, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 45782 + }, + { + "epoch": 0.39928659887320994, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 45783 + }, + { + "epoch": 0.39929532015837854, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 45784 + }, + { + "epoch": 0.39930404144354714, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 45785 + }, + { + "epoch": 0.3993127627287157, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 45786 + }, + { + "epoch": 0.3993214840138843, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 45787 + }, + { + "epoch": 0.3993302052990529, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 45788 + }, + { + "epoch": 0.39933892658422143, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 45789 + }, + { + "epoch": 0.39934764786939003, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 45790 + }, + { + "epoch": 0.39935636915455863, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 45791 + }, + { + "epoch": 0.3993650904397272, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 45792 + }, + { + "epoch": 0.3993738117248958, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 45793 + }, + { + "epoch": 0.3993825330100644, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 45794 + }, + { + "epoch": 0.3993912542952329, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 45795 + }, + { + "epoch": 0.3993999755804015, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 45796 + }, + { + "epoch": 0.3994086968655701, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 45797 + }, + { + "epoch": 0.39941741815073867, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 45798 + }, + { + "epoch": 0.39942613943590727, + "grad_norm": 0.072265625, + "learning_rate": 0.0005, + "loss": 0.9819, + "step": 45799 + }, + { + "epoch": 0.39943486072107587, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 45800 + }, + { + "epoch": 0.3994435820062444, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 45801 + }, + { + "epoch": 0.399452303291413, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 45802 + }, + { + "epoch": 0.3994610245765816, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 45803 + }, + { + "epoch": 0.3994697458617502, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 45804 + }, + { + "epoch": 0.39947846714691876, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 45805 + }, + { + "epoch": 0.39948718843208736, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 45806 + }, + { + "epoch": 0.39949590971725596, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 45807 + }, + { + "epoch": 0.3995046310024245, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 45808 + }, + { + "epoch": 0.3995133522875931, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 45809 + }, + { + "epoch": 0.3995220735727617, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 45810 + }, + { + "epoch": 0.39953079485793025, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 45811 + }, + { + "epoch": 0.39953951614309885, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 45812 + }, + { + "epoch": 0.39954823742826745, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 45813 + }, + { + "epoch": 0.399556958713436, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 45814 + }, + { + "epoch": 0.3995656799986046, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 45815 + }, + { + "epoch": 0.3995744012837732, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 45816 + }, + { + "epoch": 0.39958312256894174, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 45817 + }, + { + "epoch": 0.39959184385411034, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 45818 + }, + { + "epoch": 0.39960056513927894, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 45819 + }, + { + "epoch": 0.3996092864244475, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 45820 + }, + { + "epoch": 0.3996180077096161, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 45821 + }, + { + "epoch": 0.3996267289947847, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 45822 + }, + { + "epoch": 0.39963545027995323, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 45823 + }, + { + "epoch": 0.39964417156512183, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 45824 + }, + { + "epoch": 0.39965289285029043, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 45825 + }, + { + "epoch": 0.399661614135459, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 45826 + }, + { + "epoch": 0.3996703354206276, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 45827 + }, + { + "epoch": 0.3996790567057962, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 45828 + }, + { + "epoch": 0.3996877779909647, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 45829 + }, + { + "epoch": 0.3996964992761333, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 45830 + }, + { + "epoch": 0.3997052205613019, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 45831 + }, + { + "epoch": 0.3997139418464705, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 45832 + }, + { + "epoch": 0.39972266313163907, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 45833 + }, + { + "epoch": 0.39973138441680767, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 45834 + }, + { + "epoch": 0.39974010570197627, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 45835 + }, + { + "epoch": 0.3997488269871448, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 45836 + }, + { + "epoch": 0.3997575482723134, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 45837 + }, + { + "epoch": 0.399766269557482, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 45838 + }, + { + "epoch": 0.39977499084265056, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 45839 + }, + { + "epoch": 0.39978371212781916, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 45840 + }, + { + "epoch": 0.39979243341298776, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 45841 + }, + { + "epoch": 0.3998011546981563, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 45842 + }, + { + "epoch": 0.3998098759833249, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 45843 + }, + { + "epoch": 0.3998185972684935, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 45844 + }, + { + "epoch": 0.39982731855366205, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 45845 + }, + { + "epoch": 0.39983603983883065, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 45846 + }, + { + "epoch": 0.39984476112399925, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 45847 + }, + { + "epoch": 0.3998534824091678, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 45848 + }, + { + "epoch": 0.3998622036943364, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 45849 + }, + { + "epoch": 0.399870924979505, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 45850 + }, + { + "epoch": 0.39987964626467354, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 45851 + }, + { + "epoch": 0.39988836754984214, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 45852 + }, + { + "epoch": 0.39989708883501074, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 45853 + }, + { + "epoch": 0.3999058101201793, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 45854 + }, + { + "epoch": 0.3999145314053479, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 45855 + }, + { + "epoch": 0.3999232526905165, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 45856 + }, + { + "epoch": 0.3999319739756851, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 45857 + }, + { + "epoch": 0.39994069526085363, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 45858 + }, + { + "epoch": 0.39994941654602223, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 45859 + }, + { + "epoch": 0.39995813783119083, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 45860 + }, + { + "epoch": 0.3999668591163594, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 45861 + }, + { + "epoch": 0.399975580401528, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 45862 + }, + { + "epoch": 0.3999843016866966, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 45863 + }, + { + "epoch": 0.3999930229718651, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 45864 + }, + { + "epoch": 0.4000017442570337, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 45865 + }, + { + "epoch": 0.4000104655422023, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 45866 + }, + { + "epoch": 0.40001918682737087, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 45867 + }, + { + "epoch": 0.40002790811253947, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 45868 + }, + { + "epoch": 0.40003662939770807, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 45869 + }, + { + "epoch": 0.4000453506828766, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 45870 + }, + { + "epoch": 0.4000540719680452, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 45871 + }, + { + "epoch": 0.4000627932532138, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 45872 + }, + { + "epoch": 0.40007151453838236, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 45873 + }, + { + "epoch": 0.40008023582355096, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 45874 + }, + { + "epoch": 0.40008895710871956, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 45875 + }, + { + "epoch": 0.4000976783938881, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 45876 + }, + { + "epoch": 0.4001063996790567, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 45877 + }, + { + "epoch": 0.4001151209642253, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 45878 + }, + { + "epoch": 0.40012384224939385, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 45879 + }, + { + "epoch": 0.40013256353456245, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 45880 + }, + { + "epoch": 0.40014128481973105, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 45881 + }, + { + "epoch": 0.4001500061048996, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 45882 + }, + { + "epoch": 0.4001587273900682, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 45883 + }, + { + "epoch": 0.4001674486752368, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 45884 + }, + { + "epoch": 0.4001761699604054, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 45885 + }, + { + "epoch": 0.40018489124557394, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 45886 + }, + { + "epoch": 0.40019361253074254, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 45887 + }, + { + "epoch": 0.40020233381591114, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 45888 + }, + { + "epoch": 0.4002110551010797, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 45889 + }, + { + "epoch": 0.4002197763862483, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 45890 + }, + { + "epoch": 0.4002284976714169, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 45891 + }, + { + "epoch": 0.4002372189565854, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 45892 + }, + { + "epoch": 0.40024594024175403, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 45893 + }, + { + "epoch": 0.40025466152692263, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 45894 + }, + { + "epoch": 0.4002633828120912, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 45895 + }, + { + "epoch": 0.4002721040972598, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 45896 + }, + { + "epoch": 0.4002808253824284, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0279, + "step": 45897 + }, + { + "epoch": 0.4002895466675969, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 45898 + }, + { + "epoch": 0.4002982679527655, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 45899 + }, + { + "epoch": 0.4003069892379341, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 45900 + }, + { + "epoch": 0.40031571052310266, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 45901 + }, + { + "epoch": 0.40032443180827126, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 45902 + }, + { + "epoch": 0.40033315309343986, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 45903 + }, + { + "epoch": 0.4003418743786084, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 45904 + }, + { + "epoch": 0.400350595663777, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 45905 + }, + { + "epoch": 0.4003593169489456, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 45906 + }, + { + "epoch": 0.40036803823411415, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 45907 + }, + { + "epoch": 0.40037675951928275, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 45908 + }, + { + "epoch": 0.40038548080445135, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 45909 + }, + { + "epoch": 0.4003942020896199, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 45910 + }, + { + "epoch": 0.4004029233747885, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 45911 + }, + { + "epoch": 0.4004116446599571, + "grad_norm": 0.310546875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 45912 + }, + { + "epoch": 0.4004203659451257, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 45913 + }, + { + "epoch": 0.40042908723029425, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 45914 + }, + { + "epoch": 0.40043780851546285, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 45915 + }, + { + "epoch": 0.40044652980063145, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 45916 + }, + { + "epoch": 0.4004552510858, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 45917 + }, + { + "epoch": 0.4004639723709686, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 45918 + }, + { + "epoch": 0.4004726936561372, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 45919 + }, + { + "epoch": 0.40048141494130574, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 45920 + }, + { + "epoch": 0.40049013622647434, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 45921 + }, + { + "epoch": 0.40049885751164294, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 45922 + }, + { + "epoch": 0.4005075787968115, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 45923 + }, + { + "epoch": 0.4005163000819801, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 45924 + }, + { + "epoch": 0.4005250213671487, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 45925 + }, + { + "epoch": 0.4005337426523172, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 45926 + }, + { + "epoch": 0.4005424639374858, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 45927 + }, + { + "epoch": 0.4005511852226544, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 45928 + }, + { + "epoch": 0.40055990650782297, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 45929 + }, + { + "epoch": 0.40056862779299157, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 45930 + }, + { + "epoch": 0.40057734907816017, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 45931 + }, + { + "epoch": 0.4005860703633287, + "grad_norm": 0.27734375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 45932 + }, + { + "epoch": 0.4005947916484973, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 45933 + }, + { + "epoch": 0.4006035129336659, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 45934 + }, + { + "epoch": 0.40061223421883446, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 45935 + }, + { + "epoch": 0.40062095550400306, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 45936 + }, + { + "epoch": 0.40062967678917166, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 45937 + }, + { + "epoch": 0.4006383980743402, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 45938 + }, + { + "epoch": 0.4006471193595088, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 45939 + }, + { + "epoch": 0.4006558406446774, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 45940 + }, + { + "epoch": 0.400664561929846, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 45941 + }, + { + "epoch": 0.40067328321501455, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 45942 + }, + { + "epoch": 0.40068200450018315, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 45943 + }, + { + "epoch": 0.40069072578535175, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 45944 + }, + { + "epoch": 0.4006994470705203, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 45945 + }, + { + "epoch": 0.4007081683556889, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 45946 + }, + { + "epoch": 0.4007168896408575, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 45947 + }, + { + "epoch": 0.40072561092602604, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 45948 + }, + { + "epoch": 0.40073433221119464, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 45949 + }, + { + "epoch": 0.40074305349636324, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 45950 + }, + { + "epoch": 0.4007517747815318, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 45951 + }, + { + "epoch": 0.4007604960667004, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 45952 + }, + { + "epoch": 0.400769217351869, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 45953 + }, + { + "epoch": 0.40077793863703753, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 45954 + }, + { + "epoch": 0.40078665992220613, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 45955 + }, + { + "epoch": 0.40079538120737473, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 45956 + }, + { + "epoch": 0.4008041024925433, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 45957 + }, + { + "epoch": 0.4008128237777119, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 45958 + }, + { + "epoch": 0.4008215450628805, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 45959 + }, + { + "epoch": 0.400830266348049, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 45960 + }, + { + "epoch": 0.4008389876332176, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 45961 + }, + { + "epoch": 0.4008477089183862, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 45962 + }, + { + "epoch": 0.40085643020355477, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 45963 + }, + { + "epoch": 0.40086515148872337, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 45964 + }, + { + "epoch": 0.40087387277389197, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 45965 + }, + { + "epoch": 0.40088259405906057, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 45966 + }, + { + "epoch": 0.4008913153442291, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 45967 + }, + { + "epoch": 0.4009000366293977, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 45968 + }, + { + "epoch": 0.4009087579145663, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 45969 + }, + { + "epoch": 0.40091747919973486, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 45970 + }, + { + "epoch": 0.40092620048490346, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 45971 + }, + { + "epoch": 0.40093492177007206, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 45972 + }, + { + "epoch": 0.4009436430552406, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 45973 + }, + { + "epoch": 0.4009523643404092, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 45974 + }, + { + "epoch": 0.4009610856255778, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 45975 + }, + { + "epoch": 0.40096980691074635, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 45976 + }, + { + "epoch": 0.40097852819591495, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 45977 + }, + { + "epoch": 0.40098724948108355, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 45978 + }, + { + "epoch": 0.4009959707662521, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 45979 + }, + { + "epoch": 0.4010046920514207, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 45980 + }, + { + "epoch": 0.4010134133365893, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 45981 + }, + { + "epoch": 0.40102213462175784, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 45982 + }, + { + "epoch": 0.40103085590692644, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 45983 + }, + { + "epoch": 0.40103957719209504, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 45984 + }, + { + "epoch": 0.4010482984772636, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 45985 + }, + { + "epoch": 0.4010570197624322, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 45986 + }, + { + "epoch": 0.4010657410476008, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 45987 + }, + { + "epoch": 0.40107446233276933, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 45988 + }, + { + "epoch": 0.40108318361793793, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 45989 + }, + { + "epoch": 0.40109190490310653, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 45990 + }, + { + "epoch": 0.4011006261882751, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 45991 + }, + { + "epoch": 0.4011093474734437, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 45992 + }, + { + "epoch": 0.4011180687586123, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 45993 + }, + { + "epoch": 0.4011267900437809, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 45994 + }, + { + "epoch": 0.4011355113289494, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 45995 + }, + { + "epoch": 0.401144232614118, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 45996 + }, + { + "epoch": 0.4011529538992866, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 45997 + }, + { + "epoch": 0.40116167518445517, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 45998 + }, + { + "epoch": 0.40117039646962377, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 45999 + }, + { + "epoch": 0.40117911775479237, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 46000 + }, + { + "epoch": 0.4011878390399609, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 46001 + }, + { + "epoch": 0.4011965603251295, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 46002 + }, + { + "epoch": 0.4012052816102981, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 46003 + }, + { + "epoch": 0.40121400289546666, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 46004 + }, + { + "epoch": 0.40122272418063526, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 46005 + }, + { + "epoch": 0.40123144546580386, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9851, + "step": 46006 + }, + { + "epoch": 0.4012401667509724, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 46007 + }, + { + "epoch": 0.401248888036141, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 46008 + }, + { + "epoch": 0.4012576093213096, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 46009 + }, + { + "epoch": 0.40126633060647815, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 46010 + }, + { + "epoch": 0.40127505189164675, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 46011 + }, + { + "epoch": 0.40128377317681535, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 46012 + }, + { + "epoch": 0.4012924944619839, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 46013 + }, + { + "epoch": 0.4013012157471525, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 46014 + }, + { + "epoch": 0.4013099370323211, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 46015 + }, + { + "epoch": 0.40131865831748964, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 46016 + }, + { + "epoch": 0.40132737960265824, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 46017 + }, + { + "epoch": 0.40133610088782684, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 46018 + }, + { + "epoch": 0.4013448221729954, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 46019 + }, + { + "epoch": 0.401353543458164, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 46020 + }, + { + "epoch": 0.4013622647433326, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 46021 + }, + { + "epoch": 0.4013709860285012, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 46022 + }, + { + "epoch": 0.40137970731366973, + "grad_norm": 0.28515625, + "learning_rate": 0.0005, + "loss": 1.0316, + "step": 46023 + }, + { + "epoch": 0.40138842859883833, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 46024 + }, + { + "epoch": 0.40139714988400693, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 0.9733, + "step": 46025 + }, + { + "epoch": 0.4014058711691755, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 46026 + }, + { + "epoch": 0.4014145924543441, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 46027 + }, + { + "epoch": 0.4014233137395127, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 46028 + }, + { + "epoch": 0.4014320350246812, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 46029 + }, + { + "epoch": 0.4014407563098498, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 46030 + }, + { + "epoch": 0.4014494775950184, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 46031 + }, + { + "epoch": 0.40145819888018697, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 46032 + }, + { + "epoch": 0.40146692016535557, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 46033 + }, + { + "epoch": 0.40147564145052417, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 46034 + }, + { + "epoch": 0.4014843627356927, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9791, + "step": 46035 + }, + { + "epoch": 0.4014930840208613, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 46036 + }, + { + "epoch": 0.4015018053060299, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 46037 + }, + { + "epoch": 0.40151052659119846, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 46038 + }, + { + "epoch": 0.40151924787636706, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 46039 + }, + { + "epoch": 0.40152796916153566, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 46040 + }, + { + "epoch": 0.4015366904467042, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 46041 + }, + { + "epoch": 0.4015454117318728, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 46042 + }, + { + "epoch": 0.4015541330170414, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 46043 + }, + { + "epoch": 0.40156285430220995, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 46044 + }, + { + "epoch": 0.40157157558737855, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 46045 + }, + { + "epoch": 0.40158029687254715, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 46046 + }, + { + "epoch": 0.4015890181577157, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 46047 + }, + { + "epoch": 0.4015977394428843, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 46048 + }, + { + "epoch": 0.4016064607280529, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 46049 + }, + { + "epoch": 0.4016151820132215, + "grad_norm": 0.3359375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 46050 + }, + { + "epoch": 0.40162390329839004, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 46051 + }, + { + "epoch": 0.40163262458355864, + "grad_norm": 0.34375, + "learning_rate": 0.0005, + "loss": 0.9813, + "step": 46052 + }, + { + "epoch": 0.40164134586872724, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 46053 + }, + { + "epoch": 0.4016500671538958, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 46054 + }, + { + "epoch": 0.4016587884390644, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 46055 + }, + { + "epoch": 0.401667509724233, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 46056 + }, + { + "epoch": 0.40167623100940153, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 46057 + }, + { + "epoch": 0.40168495229457013, + "grad_norm": 0.265625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 46058 + }, + { + "epoch": 0.40169367357973873, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 46059 + }, + { + "epoch": 0.4017023948649073, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 46060 + }, + { + "epoch": 0.4017111161500759, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 46061 + }, + { + "epoch": 0.4017198374352445, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 46062 + }, + { + "epoch": 0.401728558720413, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 46063 + }, + { + "epoch": 0.4017372800055816, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 46064 + }, + { + "epoch": 0.4017460012907502, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 46065 + }, + { + "epoch": 0.40175472257591877, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 46066 + }, + { + "epoch": 0.40176344386108737, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 46067 + }, + { + "epoch": 0.40177216514625597, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 46068 + }, + { + "epoch": 0.4017808864314245, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.974, + "step": 46069 + }, + { + "epoch": 0.4017896077165931, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 0.9759, + "step": 46070 + }, + { + "epoch": 0.4017983290017617, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 46071 + }, + { + "epoch": 0.40180705028693026, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 46072 + }, + { + "epoch": 0.40181577157209886, + "grad_norm": 0.30859375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 46073 + }, + { + "epoch": 0.40182449285726746, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 46074 + }, + { + "epoch": 0.40183321414243606, + "grad_norm": 0.3515625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 46075 + }, + { + "epoch": 0.4018419354276046, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 46076 + }, + { + "epoch": 0.4018506567127732, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 46077 + }, + { + "epoch": 0.4018593779979418, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 46078 + }, + { + "epoch": 0.40186809928311035, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 46079 + }, + { + "epoch": 0.40187682056827895, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 46080 + }, + { + "epoch": 0.40188554185344755, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 46081 + }, + { + "epoch": 0.4018942631386161, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 46082 + }, + { + "epoch": 0.4019029844237847, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 46083 + }, + { + "epoch": 0.4019117057089533, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 46084 + }, + { + "epoch": 0.40192042699412184, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 46085 + }, + { + "epoch": 0.40192914827929044, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 46086 + }, + { + "epoch": 0.40193786956445904, + "grad_norm": 0.271484375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 46087 + }, + { + "epoch": 0.4019465908496276, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 46088 + }, + { + "epoch": 0.4019553121347962, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 46089 + }, + { + "epoch": 0.4019640334199648, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 46090 + }, + { + "epoch": 0.40197275470513333, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 46091 + }, + { + "epoch": 0.40198147599030193, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 46092 + }, + { + "epoch": 0.40199019727547053, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 46093 + }, + { + "epoch": 0.4019989185606391, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 46094 + }, + { + "epoch": 0.4020076398458077, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 46095 + }, + { + "epoch": 0.4020163611309763, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 46096 + }, + { + "epoch": 0.4020250824161448, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 46097 + }, + { + "epoch": 0.4020338037013134, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 46098 + }, + { + "epoch": 0.402042524986482, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 46099 + }, + { + "epoch": 0.40205124627165056, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 46100 + }, + { + "epoch": 0.40205996755681916, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 46101 + }, + { + "epoch": 0.40206868884198776, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 46102 + }, + { + "epoch": 0.40207741012715636, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 46103 + }, + { + "epoch": 0.4020861314123249, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 46104 + }, + { + "epoch": 0.4020948526974935, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 46105 + }, + { + "epoch": 0.4021035739826621, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 46106 + }, + { + "epoch": 0.40211229526783066, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 46107 + }, + { + "epoch": 0.40212101655299926, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 46108 + }, + { + "epoch": 0.40212973783816786, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 46109 + }, + { + "epoch": 0.4021384591233364, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 46110 + }, + { + "epoch": 0.402147180408505, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 46111 + }, + { + "epoch": 0.4021559016936736, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 46112 + }, + { + "epoch": 0.40216462297884215, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 46113 + }, + { + "epoch": 0.40217334426401075, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 46114 + }, + { + "epoch": 0.40218206554917935, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 46115 + }, + { + "epoch": 0.4021907868343479, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 46116 + }, + { + "epoch": 0.4021995081195165, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 46117 + }, + { + "epoch": 0.4022082294046851, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 46118 + }, + { + "epoch": 0.40221695068985364, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 46119 + }, + { + "epoch": 0.40222567197502224, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 46120 + }, + { + "epoch": 0.40223439326019084, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 46121 + }, + { + "epoch": 0.4022431145453594, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 46122 + }, + { + "epoch": 0.402251835830528, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 46123 + }, + { + "epoch": 0.4022605571156966, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 46124 + }, + { + "epoch": 0.4022692784008651, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 46125 + }, + { + "epoch": 0.4022779996860337, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 46126 + }, + { + "epoch": 0.4022867209712023, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 46127 + }, + { + "epoch": 0.40229544225637087, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 46128 + }, + { + "epoch": 0.4023041635415395, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 46129 + }, + { + "epoch": 0.4023128848267081, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 46130 + }, + { + "epoch": 0.4023216061118767, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 46131 + }, + { + "epoch": 0.4023303273970452, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 46132 + }, + { + "epoch": 0.4023390486822138, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 46133 + }, + { + "epoch": 0.4023477699673824, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 46134 + }, + { + "epoch": 0.40235649125255096, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 46135 + }, + { + "epoch": 0.40236521253771956, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 46136 + }, + { + "epoch": 0.40237393382288816, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 46137 + }, + { + "epoch": 0.4023826551080567, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 46138 + }, + { + "epoch": 0.4023913763932253, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 46139 + }, + { + "epoch": 0.4024000976783939, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 46140 + }, + { + "epoch": 0.40240881896356245, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 46141 + }, + { + "epoch": 0.40241754024873105, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 46142 + }, + { + "epoch": 0.40242626153389965, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 46143 + }, + { + "epoch": 0.4024349828190682, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 46144 + }, + { + "epoch": 0.4024437041042368, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9811, + "step": 46145 + }, + { + "epoch": 0.4024524253894054, + "grad_norm": 0.2099609375, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 46146 + }, + { + "epoch": 0.40246114667457394, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 46147 + }, + { + "epoch": 0.40246986795974254, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 46148 + }, + { + "epoch": 0.40247858924491114, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 46149 + }, + { + "epoch": 0.4024873105300797, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 46150 + }, + { + "epoch": 0.4024960318152483, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 46151 + }, + { + "epoch": 0.4025047531004169, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 46152 + }, + { + "epoch": 0.40251347438558543, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 46153 + }, + { + "epoch": 0.40252219567075403, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 46154 + }, + { + "epoch": 0.40253091695592264, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 46155 + }, + { + "epoch": 0.4025396382410912, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 46156 + }, + { + "epoch": 0.4025483595262598, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 46157 + }, + { + "epoch": 0.4025570808114284, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 46158 + }, + { + "epoch": 0.402565802096597, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 46159 + }, + { + "epoch": 0.4025745233817655, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 46160 + }, + { + "epoch": 0.4025832446669341, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 46161 + }, + { + "epoch": 0.4025919659521027, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 46162 + }, + { + "epoch": 0.40260068723727127, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 46163 + }, + { + "epoch": 0.40260940852243987, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 46164 + }, + { + "epoch": 0.40261812980760847, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 46165 + }, + { + "epoch": 0.402626851092777, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9812, + "step": 46166 + }, + { + "epoch": 0.4026355723779456, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 46167 + }, + { + "epoch": 0.4026442936631142, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.974, + "step": 46168 + }, + { + "epoch": 0.40265301494828276, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 46169 + }, + { + "epoch": 0.40266173623345136, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 46170 + }, + { + "epoch": 0.40267045751861996, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 46171 + }, + { + "epoch": 0.4026791788037885, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 46172 + }, + { + "epoch": 0.4026879000889571, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 46173 + }, + { + "epoch": 0.4026966213741257, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 46174 + }, + { + "epoch": 0.40270534265929425, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 46175 + }, + { + "epoch": 0.40271406394446285, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 46176 + }, + { + "epoch": 0.40272278522963145, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 46177 + }, + { + "epoch": 0.4027315065148, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 46178 + }, + { + "epoch": 0.4027402277999686, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 46179 + }, + { + "epoch": 0.4027489490851372, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 46180 + }, + { + "epoch": 0.40275767037030574, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 46181 + }, + { + "epoch": 0.40276639165547434, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 46182 + }, + { + "epoch": 0.40277511294064294, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 46183 + }, + { + "epoch": 0.4027838342258115, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 46184 + }, + { + "epoch": 0.4027925555109801, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 46185 + }, + { + "epoch": 0.4028012767961487, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 46186 + }, + { + "epoch": 0.4028099980813173, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 46187 + }, + { + "epoch": 0.40281871936648583, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 46188 + }, + { + "epoch": 0.40282744065165443, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 46189 + }, + { + "epoch": 0.40283616193682303, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 46190 + }, + { + "epoch": 0.4028448832219916, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 46191 + }, + { + "epoch": 0.4028536045071602, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 46192 + }, + { + "epoch": 0.4028623257923288, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 46193 + }, + { + "epoch": 0.4028710470774973, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 46194 + }, + { + "epoch": 0.4028797683626659, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 46195 + }, + { + "epoch": 0.4028884896478345, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 46196 + }, + { + "epoch": 0.40289721093300307, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 46197 + }, + { + "epoch": 0.40290593221817167, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 46198 + }, + { + "epoch": 0.40291465350334027, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 46199 + }, + { + "epoch": 0.4029233747885088, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9764, + "step": 46200 + }, + { + "epoch": 0.4029320960736774, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 46201 + }, + { + "epoch": 0.402940817358846, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 46202 + }, + { + "epoch": 0.40294953864401456, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 46203 + }, + { + "epoch": 0.40295825992918316, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 46204 + }, + { + "epoch": 0.40296698121435176, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 46205 + }, + { + "epoch": 0.4029757024995203, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 46206 + }, + { + "epoch": 0.4029844237846889, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 46207 + }, + { + "epoch": 0.4029931450698575, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 46208 + }, + { + "epoch": 0.40300186635502605, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 46209 + }, + { + "epoch": 0.40301058764019465, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 46210 + }, + { + "epoch": 0.40301930892536325, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 46211 + }, + { + "epoch": 0.40302803021053185, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 46212 + }, + { + "epoch": 0.4030367514957004, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 46213 + }, + { + "epoch": 0.403045472780869, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 46214 + }, + { + "epoch": 0.4030541940660376, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 46215 + }, + { + "epoch": 0.40306291535120614, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 46216 + }, + { + "epoch": 0.40307163663637474, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 46217 + }, + { + "epoch": 0.40308035792154334, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 46218 + }, + { + "epoch": 0.4030890792067119, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 46219 + }, + { + "epoch": 0.4030978004918805, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 46220 + }, + { + "epoch": 0.4031065217770491, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 46221 + }, + { + "epoch": 0.40311524306221763, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 46222 + }, + { + "epoch": 0.40312396434738623, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 46223 + }, + { + "epoch": 0.40313268563255483, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 46224 + }, + { + "epoch": 0.4031414069177234, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 46225 + }, + { + "epoch": 0.403150128202892, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 46226 + }, + { + "epoch": 0.4031588494880606, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 46227 + }, + { + "epoch": 0.4031675707732291, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 46228 + }, + { + "epoch": 0.4031762920583977, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 46229 + }, + { + "epoch": 0.4031850133435663, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 46230 + }, + { + "epoch": 0.40319373462873487, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 46231 + }, + { + "epoch": 0.40320245591390347, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 46232 + }, + { + "epoch": 0.40321117719907207, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 46233 + }, + { + "epoch": 0.4032198984842406, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 46234 + }, + { + "epoch": 0.4032286197694092, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 46235 + }, + { + "epoch": 0.4032373410545778, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 46236 + }, + { + "epoch": 0.40324606233974636, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 46237 + }, + { + "epoch": 0.40325478362491496, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 46238 + }, + { + "epoch": 0.40326350491008356, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 46239 + }, + { + "epoch": 0.40327222619525216, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 46240 + }, + { + "epoch": 0.4032809474804207, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 46241 + }, + { + "epoch": 0.4032896687655893, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 46242 + }, + { + "epoch": 0.4032983900507579, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 46243 + }, + { + "epoch": 0.40330711133592645, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 46244 + }, + { + "epoch": 0.40331583262109505, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 46245 + }, + { + "epoch": 0.40332455390626365, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 46246 + }, + { + "epoch": 0.4033332751914322, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 46247 + }, + { + "epoch": 0.4033419964766008, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 46248 + }, + { + "epoch": 0.4033507177617694, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 46249 + }, + { + "epoch": 0.40335943904693794, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 46250 + }, + { + "epoch": 0.40336816033210654, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 46251 + }, + { + "epoch": 0.40337688161727514, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 46252 + }, + { + "epoch": 0.4033856029024437, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 46253 + }, + { + "epoch": 0.4033943241876123, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 46254 + }, + { + "epoch": 0.4034030454727809, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 46255 + }, + { + "epoch": 0.40341176675794943, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 46256 + }, + { + "epoch": 0.40342048804311803, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 46257 + }, + { + "epoch": 0.40342920932828663, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 46258 + }, + { + "epoch": 0.4034379306134552, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 46259 + }, + { + "epoch": 0.4034466518986238, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 46260 + }, + { + "epoch": 0.4034553731837924, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 46261 + }, + { + "epoch": 0.4034640944689609, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 46262 + }, + { + "epoch": 0.4034728157541295, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 46263 + }, + { + "epoch": 0.4034815370392981, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 46264 + }, + { + "epoch": 0.40349025832446667, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 46265 + }, + { + "epoch": 0.40349897960963527, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 46266 + }, + { + "epoch": 0.40350770089480387, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 46267 + }, + { + "epoch": 0.40351642217997247, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 46268 + }, + { + "epoch": 0.403525143465141, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 46269 + }, + { + "epoch": 0.4035338647503096, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 46270 + }, + { + "epoch": 0.4035425860354782, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 46271 + }, + { + "epoch": 0.40355130732064676, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 46272 + }, + { + "epoch": 0.40356002860581536, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 46273 + }, + { + "epoch": 0.40356874989098396, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9828, + "step": 46274 + }, + { + "epoch": 0.4035774711761525, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 46275 + }, + { + "epoch": 0.4035861924613211, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 46276 + }, + { + "epoch": 0.4035949137464897, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 46277 + }, + { + "epoch": 0.40360363503165825, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 46278 + }, + { + "epoch": 0.40361235631682685, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 46279 + }, + { + "epoch": 0.40362107760199545, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 46280 + }, + { + "epoch": 0.403629798887164, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 46281 + }, + { + "epoch": 0.4036385201723326, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 46282 + }, + { + "epoch": 0.4036472414575012, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 46283 + }, + { + "epoch": 0.40365596274266974, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 46284 + }, + { + "epoch": 0.40366468402783834, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 46285 + }, + { + "epoch": 0.40367340531300694, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 46286 + }, + { + "epoch": 0.4036821265981755, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 46287 + }, + { + "epoch": 0.4036908478833441, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 46288 + }, + { + "epoch": 0.4036995691685127, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 46289 + }, + { + "epoch": 0.40370829045368123, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 46290 + }, + { + "epoch": 0.40371701173884983, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 46291 + }, + { + "epoch": 0.40372573302401843, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 46292 + }, + { + "epoch": 0.403734454309187, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 46293 + }, + { + "epoch": 0.4037431755943556, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 46294 + }, + { + "epoch": 0.4037518968795242, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 46295 + }, + { + "epoch": 0.4037606181646928, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 46296 + }, + { + "epoch": 0.4037693394498613, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 46297 + }, + { + "epoch": 0.4037780607350299, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0305, + "step": 46298 + }, + { + "epoch": 0.4037867820201985, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 46299 + }, + { + "epoch": 0.40379550330536707, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 46300 + }, + { + "epoch": 0.40380422459053567, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 46301 + }, + { + "epoch": 0.40381294587570427, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 46302 + }, + { + "epoch": 0.4038216671608728, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 46303 + }, + { + "epoch": 0.4038303884460414, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 46304 + }, + { + "epoch": 0.40383910973121, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 46305 + }, + { + "epoch": 0.40384783101637856, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 46306 + }, + { + "epoch": 0.40385655230154716, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.985, + "step": 46307 + }, + { + "epoch": 0.40386527358671576, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 46308 + }, + { + "epoch": 0.4038739948718843, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 46309 + }, + { + "epoch": 0.4038827161570529, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 46310 + }, + { + "epoch": 0.4038914374422215, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 46311 + }, + { + "epoch": 0.40390015872739005, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 46312 + }, + { + "epoch": 0.40390888001255865, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 46313 + }, + { + "epoch": 0.40391760129772725, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 46314 + }, + { + "epoch": 0.4039263225828958, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 46315 + }, + { + "epoch": 0.4039350438680644, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 46316 + }, + { + "epoch": 0.403943765153233, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 46317 + }, + { + "epoch": 0.40395248643840154, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 46318 + }, + { + "epoch": 0.40396120772357014, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 46319 + }, + { + "epoch": 0.40396992900873874, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 46320 + }, + { + "epoch": 0.40397865029390734, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 46321 + }, + { + "epoch": 0.4039873715790759, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 46322 + }, + { + "epoch": 0.4039960928642445, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 46323 + }, + { + "epoch": 0.4040048141494131, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 46324 + }, + { + "epoch": 0.4040135354345816, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 46325 + }, + { + "epoch": 0.40402225671975023, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 46326 + }, + { + "epoch": 0.40403097800491883, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 46327 + }, + { + "epoch": 0.4040396992900874, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 46328 + }, + { + "epoch": 0.404048420575256, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 46329 + }, + { + "epoch": 0.4040571418604246, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 46330 + }, + { + "epoch": 0.4040658631455931, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 46331 + }, + { + "epoch": 0.4040745844307617, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 46332 + }, + { + "epoch": 0.4040833057159303, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 46333 + }, + { + "epoch": 0.40409202700109886, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 46334 + }, + { + "epoch": 0.40410074828626746, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 46335 + }, + { + "epoch": 0.40410946957143606, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 46336 + }, + { + "epoch": 0.4041181908566046, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 46337 + }, + { + "epoch": 0.4041269121417732, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 46338 + }, + { + "epoch": 0.4041356334269418, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 46339 + }, + { + "epoch": 0.40414435471211035, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 46340 + }, + { + "epoch": 0.40415307599727895, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 46341 + }, + { + "epoch": 0.40416179728244755, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 46342 + }, + { + "epoch": 0.4041705185676161, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 46343 + }, + { + "epoch": 0.4041792398527847, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 46344 + }, + { + "epoch": 0.4041879611379533, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 46345 + }, + { + "epoch": 0.40419668242312184, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 46346 + }, + { + "epoch": 0.40420540370829044, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 46347 + }, + { + "epoch": 0.40421412499345905, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 46348 + }, + { + "epoch": 0.40422284627862765, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 46349 + }, + { + "epoch": 0.4042315675637962, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 46350 + }, + { + "epoch": 0.4042402888489648, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 46351 + }, + { + "epoch": 0.4042490101341334, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 46352 + }, + { + "epoch": 0.40425773141930194, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 46353 + }, + { + "epoch": 0.40426645270447054, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 46354 + }, + { + "epoch": 0.40427517398963914, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 46355 + }, + { + "epoch": 0.4042838952748077, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 46356 + }, + { + "epoch": 0.4042926165599763, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 46357 + }, + { + "epoch": 0.4043013378451449, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 46358 + }, + { + "epoch": 0.4043100591303134, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 46359 + }, + { + "epoch": 0.404318780415482, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 46360 + }, + { + "epoch": 0.4043275017006506, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 46361 + }, + { + "epoch": 0.40433622298581917, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 46362 + }, + { + "epoch": 0.40434494427098777, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 46363 + }, + { + "epoch": 0.40435366555615637, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 46364 + }, + { + "epoch": 0.4043623868413249, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 46365 + }, + { + "epoch": 0.4043711081264935, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 46366 + }, + { + "epoch": 0.4043798294116621, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 46367 + }, + { + "epoch": 0.40438855069683066, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 46368 + }, + { + "epoch": 0.40439727198199926, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 46369 + }, + { + "epoch": 0.40440599326716786, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 46370 + }, + { + "epoch": 0.4044147145523364, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 46371 + }, + { + "epoch": 0.404423435837505, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 46372 + }, + { + "epoch": 0.4044321571226736, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0231, + "step": 46373 + }, + { + "epoch": 0.40444087840784215, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 46374 + }, + { + "epoch": 0.40444959969301075, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 46375 + }, + { + "epoch": 0.40445832097817935, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 46376 + }, + { + "epoch": 0.40446704226334795, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 46377 + }, + { + "epoch": 0.4044757635485165, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 46378 + }, + { + "epoch": 0.4044844848336851, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 46379 + }, + { + "epoch": 0.4044932061188537, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 46380 + }, + { + "epoch": 0.40450192740402224, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0318, + "step": 46381 + }, + { + "epoch": 0.40451064868919084, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 46382 + }, + { + "epoch": 0.40451936997435944, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 46383 + }, + { + "epoch": 0.404528091259528, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 46384 + }, + { + "epoch": 0.4045368125446966, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 46385 + }, + { + "epoch": 0.4045455338298652, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 46386 + }, + { + "epoch": 0.40455425511503373, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 46387 + }, + { + "epoch": 0.40456297640020233, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 46388 + }, + { + "epoch": 0.40457169768537093, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 46389 + }, + { + "epoch": 0.4045804189705395, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 46390 + }, + { + "epoch": 0.4045891402557081, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 46391 + }, + { + "epoch": 0.4045978615408767, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 46392 + }, + { + "epoch": 0.4046065828260452, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 46393 + }, + { + "epoch": 0.4046153041112138, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 46394 + }, + { + "epoch": 0.4046240253963824, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 46395 + }, + { + "epoch": 0.40463274668155097, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 46396 + }, + { + "epoch": 0.40464146796671957, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 46397 + }, + { + "epoch": 0.40465018925188817, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 46398 + }, + { + "epoch": 0.4046589105370567, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 46399 + }, + { + "epoch": 0.4046676318222253, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 46400 + }, + { + "epoch": 0.4046763531073939, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 46401 + }, + { + "epoch": 0.40468507439256246, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 46402 + }, + { + "epoch": 0.40469379567773106, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 46403 + }, + { + "epoch": 0.40470251696289966, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 46404 + }, + { + "epoch": 0.40471123824806826, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 46405 + }, + { + "epoch": 0.4047199595332368, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 46406 + }, + { + "epoch": 0.4047286808184054, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 46407 + }, + { + "epoch": 0.404737402103574, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 46408 + }, + { + "epoch": 0.40474612338874255, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 46409 + }, + { + "epoch": 0.40475484467391115, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 46410 + }, + { + "epoch": 0.40476356595907975, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 46411 + }, + { + "epoch": 0.4047722872442483, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 46412 + }, + { + "epoch": 0.4047810085294169, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 46413 + }, + { + "epoch": 0.4047897298145855, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 46414 + }, + { + "epoch": 0.40479845109975404, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0306, + "step": 46415 + }, + { + "epoch": 0.40480717238492264, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 46416 + }, + { + "epoch": 0.40481589367009124, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 46417 + }, + { + "epoch": 0.4048246149552598, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 46418 + }, + { + "epoch": 0.4048333362404284, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 46419 + }, + { + "epoch": 0.404842057525597, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 46420 + }, + { + "epoch": 0.40485077881076553, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 46421 + }, + { + "epoch": 0.40485950009593413, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 46422 + }, + { + "epoch": 0.40486822138110273, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 46423 + }, + { + "epoch": 0.4048769426662713, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 46424 + }, + { + "epoch": 0.4048856639514399, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 46425 + }, + { + "epoch": 0.4048943852366085, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 46426 + }, + { + "epoch": 0.404903106521777, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 46427 + }, + { + "epoch": 0.4049118278069456, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 46428 + }, + { + "epoch": 0.4049205490921142, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 46429 + }, + { + "epoch": 0.4049292703772828, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 46430 + }, + { + "epoch": 0.40493799166245137, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 46431 + }, + { + "epoch": 0.40494671294761997, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 46432 + }, + { + "epoch": 0.40495543423278857, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 46433 + }, + { + "epoch": 0.4049641555179571, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 46434 + }, + { + "epoch": 0.4049728768031257, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 46435 + }, + { + "epoch": 0.4049815980882943, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 46436 + }, + { + "epoch": 0.40499031937346286, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 46437 + }, + { + "epoch": 0.40499904065863146, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 46438 + }, + { + "epoch": 0.40500776194380006, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 46439 + }, + { + "epoch": 0.4050164832289686, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 46440 + }, + { + "epoch": 0.4050252045141372, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 46441 + }, + { + "epoch": 0.4050339257993058, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 46442 + }, + { + "epoch": 0.40504264708447435, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 46443 + }, + { + "epoch": 0.40505136836964295, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 46444 + }, + { + "epoch": 0.40506008965481155, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 46445 + }, + { + "epoch": 0.4050688109399801, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 46446 + }, + { + "epoch": 0.4050775322251487, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 46447 + }, + { + "epoch": 0.4050862535103173, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 46448 + }, + { + "epoch": 0.40509497479548584, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 46449 + }, + { + "epoch": 0.40510369608065444, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 46450 + }, + { + "epoch": 0.40511241736582304, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 46451 + }, + { + "epoch": 0.4051211386509916, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 46452 + }, + { + "epoch": 0.4051298599361602, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 46453 + }, + { + "epoch": 0.4051385812213288, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 46454 + }, + { + "epoch": 0.40514730250649733, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 46455 + }, + { + "epoch": 0.40515602379166593, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 46456 + }, + { + "epoch": 0.40516474507683453, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 46457 + }, + { + "epoch": 0.40517346636200313, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 46458 + }, + { + "epoch": 0.4051821876471717, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 46459 + }, + { + "epoch": 0.4051909089323403, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 46460 + }, + { + "epoch": 0.4051996302175089, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 46461 + }, + { + "epoch": 0.4052083515026774, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 46462 + }, + { + "epoch": 0.405217072787846, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 46463 + }, + { + "epoch": 0.4052257940730146, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 46464 + }, + { + "epoch": 0.40523451535818317, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 46465 + }, + { + "epoch": 0.40524323664335177, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 46466 + }, + { + "epoch": 0.40525195792852037, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 46467 + }, + { + "epoch": 0.4052606792136889, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 46468 + }, + { + "epoch": 0.4052694004988575, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 46469 + }, + { + "epoch": 0.4052781217840261, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 46470 + }, + { + "epoch": 0.40528684306919466, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 46471 + }, + { + "epoch": 0.40529556435436326, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 46472 + }, + { + "epoch": 0.40530428563953186, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 46473 + }, + { + "epoch": 0.4053130069247004, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 46474 + }, + { + "epoch": 0.405321728209869, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 46475 + }, + { + "epoch": 0.4053304494950376, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 46476 + }, + { + "epoch": 0.40533917078020615, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 46477 + }, + { + "epoch": 0.40534789206537475, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 46478 + }, + { + "epoch": 0.40535661335054335, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 46479 + }, + { + "epoch": 0.4053653346357119, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 46480 + }, + { + "epoch": 0.4053740559208805, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 46481 + }, + { + "epoch": 0.4053827772060491, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 46482 + }, + { + "epoch": 0.40539149849121764, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 46483 + }, + { + "epoch": 0.40540021977638624, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 46484 + }, + { + "epoch": 0.40540894106155484, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 46485 + }, + { + "epoch": 0.40541766234672344, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 46486 + }, + { + "epoch": 0.405426383631892, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 46487 + }, + { + "epoch": 0.4054351049170606, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 46488 + }, + { + "epoch": 0.4054438262022292, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 46489 + }, + { + "epoch": 0.40545254748739773, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 46490 + }, + { + "epoch": 0.40546126877256633, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 46491 + }, + { + "epoch": 0.40546999005773493, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 46492 + }, + { + "epoch": 0.4054787113429035, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 46493 + }, + { + "epoch": 0.4054874326280721, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 46494 + }, + { + "epoch": 0.4054961539132407, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 46495 + }, + { + "epoch": 0.4055048751984092, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 46496 + }, + { + "epoch": 0.4055135964835778, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 46497 + }, + { + "epoch": 0.4055223177687464, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 46498 + }, + { + "epoch": 0.40553103905391497, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 46499 + }, + { + "epoch": 0.40553976033908357, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 46500 + }, + { + "epoch": 0.40554848162425217, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 46501 + }, + { + "epoch": 0.4055572029094207, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 46502 + }, + { + "epoch": 0.4055659241945893, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 46503 + }, + { + "epoch": 0.4055746454797579, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 46504 + }, + { + "epoch": 0.40558336676492646, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 46505 + }, + { + "epoch": 0.40559208805009506, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 46506 + }, + { + "epoch": 0.40560080933526366, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 46507 + }, + { + "epoch": 0.4056095306204322, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 46508 + }, + { + "epoch": 0.4056182519056008, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 46509 + }, + { + "epoch": 0.4056269731907694, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 46510 + }, + { + "epoch": 0.40563569447593795, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 46511 + }, + { + "epoch": 0.40564441576110655, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 46512 + }, + { + "epoch": 0.40565313704627515, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 46513 + }, + { + "epoch": 0.40566185833144375, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 46514 + }, + { + "epoch": 0.4056705796166123, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 46515 + }, + { + "epoch": 0.4056793009017809, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 46516 + }, + { + "epoch": 0.4056880221869495, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 46517 + }, + { + "epoch": 0.40569674347211804, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 46518 + }, + { + "epoch": 0.40570546475728664, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 46519 + }, + { + "epoch": 0.40571418604245524, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 46520 + }, + { + "epoch": 0.4057229073276238, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 46521 + }, + { + "epoch": 0.4057316286127924, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 46522 + }, + { + "epoch": 0.405740349897961, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 46523 + }, + { + "epoch": 0.40574907118312953, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 46524 + }, + { + "epoch": 0.40575779246829813, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 46525 + }, + { + "epoch": 0.40576651375346673, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 46526 + }, + { + "epoch": 0.4057752350386353, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 46527 + }, + { + "epoch": 0.4057839563238039, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 46528 + }, + { + "epoch": 0.4057926776089725, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 46529 + }, + { + "epoch": 0.405801398894141, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 46530 + }, + { + "epoch": 0.4058101201793096, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 46531 + }, + { + "epoch": 0.4058188414644782, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 46532 + }, + { + "epoch": 0.40582756274964676, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 46533 + }, + { + "epoch": 0.40583628403481536, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 46534 + }, + { + "epoch": 0.40584500531998396, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 46535 + }, + { + "epoch": 0.4058537266051525, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 46536 + }, + { + "epoch": 0.4058624478903211, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 46537 + }, + { + "epoch": 0.4058711691754897, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 46538 + }, + { + "epoch": 0.4058798904606583, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 46539 + }, + { + "epoch": 0.40588861174582685, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 46540 + }, + { + "epoch": 0.40589733303099546, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.986, + "step": 46541 + }, + { + "epoch": 0.40590605431616406, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 46542 + }, + { + "epoch": 0.4059147756013326, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 46543 + }, + { + "epoch": 0.4059234968865012, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 46544 + }, + { + "epoch": 0.4059322181716698, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 46545 + }, + { + "epoch": 0.40594093945683835, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 46546 + }, + { + "epoch": 0.40594966074200695, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 46547 + }, + { + "epoch": 0.40595838202717555, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 46548 + }, + { + "epoch": 0.4059671033123441, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 46549 + }, + { + "epoch": 0.4059758245975127, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 46550 + }, + { + "epoch": 0.4059845458826813, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 46551 + }, + { + "epoch": 0.40599326716784984, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 46552 + }, + { + "epoch": 0.40600198845301844, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.977, + "step": 46553 + }, + { + "epoch": 0.40601070973818704, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 46554 + }, + { + "epoch": 0.4060194310233556, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 46555 + }, + { + "epoch": 0.4060281523085242, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 46556 + }, + { + "epoch": 0.4060368735936928, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 46557 + }, + { + "epoch": 0.4060455948788613, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 46558 + }, + { + "epoch": 0.4060543161640299, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 46559 + }, + { + "epoch": 0.4060630374491985, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 46560 + }, + { + "epoch": 0.40607175873436707, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 46561 + }, + { + "epoch": 0.4060804800195357, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 46562 + }, + { + "epoch": 0.4060892013047043, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 46563 + }, + { + "epoch": 0.4060979225898728, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 46564 + }, + { + "epoch": 0.4061066438750414, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 46565 + }, + { + "epoch": 0.40611536516021, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 46566 + }, + { + "epoch": 0.4061240864453786, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9853, + "step": 46567 + }, + { + "epoch": 0.40613280773054716, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 46568 + }, + { + "epoch": 0.40614152901571576, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 46569 + }, + { + "epoch": 0.40615025030088436, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 46570 + }, + { + "epoch": 0.4061589715860529, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 46571 + }, + { + "epoch": 0.4061676928712215, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 46572 + }, + { + "epoch": 0.4061764141563901, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 46573 + }, + { + "epoch": 0.40618513544155865, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 46574 + }, + { + "epoch": 0.40619385672672725, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 46575 + }, + { + "epoch": 0.40620257801189585, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 46576 + }, + { + "epoch": 0.4062112992970644, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 46577 + }, + { + "epoch": 0.406220020582233, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 46578 + }, + { + "epoch": 0.4062287418674016, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 46579 + }, + { + "epoch": 0.40623746315257014, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 46580 + }, + { + "epoch": 0.40624618443773874, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 46581 + }, + { + "epoch": 0.40625490572290734, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 46582 + }, + { + "epoch": 0.4062636270080759, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 46583 + }, + { + "epoch": 0.4062723482932445, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 46584 + }, + { + "epoch": 0.4062810695784131, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 46585 + }, + { + "epoch": 0.40628979086358163, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 46586 + }, + { + "epoch": 0.40629851214875023, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 46587 + }, + { + "epoch": 0.40630723343391884, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 46588 + }, + { + "epoch": 0.4063159547190874, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 46589 + }, + { + "epoch": 0.406324676004256, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 46590 + }, + { + "epoch": 0.4063333972894246, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 46591 + }, + { + "epoch": 0.4063421185745931, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 46592 + }, + { + "epoch": 0.4063508398597617, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 46593 + }, + { + "epoch": 0.4063595611449303, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 46594 + }, + { + "epoch": 0.4063682824300989, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 46595 + }, + { + "epoch": 0.40637700371526747, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 46596 + }, + { + "epoch": 0.40638572500043607, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 46597 + }, + { + "epoch": 0.40639444628560467, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 46598 + }, + { + "epoch": 0.4064031675707732, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 46599 + }, + { + "epoch": 0.4064118888559418, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 46600 + }, + { + "epoch": 0.4064206101411104, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 46601 + }, + { + "epoch": 0.40642933142627896, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 46602 + }, + { + "epoch": 0.40643805271144756, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 46603 + }, + { + "epoch": 0.40644677399661616, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 46604 + }, + { + "epoch": 0.4064554952817847, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 46605 + }, + { + "epoch": 0.4064642165669533, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 46606 + }, + { + "epoch": 0.4064729378521219, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 46607 + }, + { + "epoch": 0.40648165913729045, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 46608 + }, + { + "epoch": 0.40649038042245905, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 46609 + }, + { + "epoch": 0.40649910170762765, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 46610 + }, + { + "epoch": 0.4065078229927962, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 46611 + }, + { + "epoch": 0.4065165442779648, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 46612 + }, + { + "epoch": 0.4065252655631334, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 46613 + }, + { + "epoch": 0.40653398684830194, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 46614 + }, + { + "epoch": 0.40654270813347054, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 46615 + }, + { + "epoch": 0.40655142941863914, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 46616 + }, + { + "epoch": 0.4065601507038077, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 46617 + }, + { + "epoch": 0.4065688719889763, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 46618 + }, + { + "epoch": 0.4065775932741449, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9819, + "step": 46619 + }, + { + "epoch": 0.40658631455931343, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 46620 + }, + { + "epoch": 0.40659503584448203, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 46621 + }, + { + "epoch": 0.40660375712965063, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 46622 + }, + { + "epoch": 0.40661247841481923, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 46623 + }, + { + "epoch": 0.4066211996999878, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 46624 + }, + { + "epoch": 0.4066299209851564, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 46625 + }, + { + "epoch": 0.406638642270325, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 46626 + }, + { + "epoch": 0.4066473635554935, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9758, + "step": 46627 + }, + { + "epoch": 0.4066560848406621, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 46628 + }, + { + "epoch": 0.4066648061258307, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 46629 + }, + { + "epoch": 0.40667352741099927, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 46630 + }, + { + "epoch": 0.40668224869616787, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 46631 + }, + { + "epoch": 0.40669096998133647, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 46632 + }, + { + "epoch": 0.406699691266505, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 46633 + }, + { + "epoch": 0.4067084125516736, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 46634 + }, + { + "epoch": 0.4067171338368422, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 46635 + }, + { + "epoch": 0.40672585512201076, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 46636 + }, + { + "epoch": 0.40673457640717936, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 46637 + }, + { + "epoch": 0.40674329769234796, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 46638 + }, + { + "epoch": 0.4067520189775165, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 46639 + }, + { + "epoch": 0.4067607402626851, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 46640 + }, + { + "epoch": 0.4067694615478537, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 46641 + }, + { + "epoch": 0.40677818283302225, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 46642 + }, + { + "epoch": 0.40678690411819085, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 46643 + }, + { + "epoch": 0.40679562540335945, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 46644 + }, + { + "epoch": 0.406804346688528, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 46645 + }, + { + "epoch": 0.4068130679736966, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 46646 + }, + { + "epoch": 0.4068217892588652, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 46647 + }, + { + "epoch": 0.4068305105440338, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 46648 + }, + { + "epoch": 0.40683923182920234, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 46649 + }, + { + "epoch": 0.40684795311437094, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 46650 + }, + { + "epoch": 0.40685667439953954, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 46651 + }, + { + "epoch": 0.4068653956847081, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 46652 + }, + { + "epoch": 0.4068741169698767, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 46653 + }, + { + "epoch": 0.4068828382550453, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 46654 + }, + { + "epoch": 0.40689155954021383, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0317, + "step": 46655 + }, + { + "epoch": 0.40690028082538243, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9829, + "step": 46656 + }, + { + "epoch": 0.40690900211055103, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 46657 + }, + { + "epoch": 0.4069177233957196, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 46658 + }, + { + "epoch": 0.4069264446808882, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 46659 + }, + { + "epoch": 0.4069351659660568, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 46660 + }, + { + "epoch": 0.4069438872512253, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 46661 + }, + { + "epoch": 0.4069526085363939, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 46662 + }, + { + "epoch": 0.4069613298215625, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 46663 + }, + { + "epoch": 0.40697005110673107, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9798, + "step": 46664 + }, + { + "epoch": 0.40697877239189967, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9802, + "step": 46665 + }, + { + "epoch": 0.40698749367706827, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 46666 + }, + { + "epoch": 0.4069962149622368, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 46667 + }, + { + "epoch": 0.4070049362474054, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 46668 + }, + { + "epoch": 0.407013657532574, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 46669 + }, + { + "epoch": 0.40702237881774256, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 46670 + }, + { + "epoch": 0.40703110010291116, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 46671 + }, + { + "epoch": 0.40703982138807976, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 46672 + }, + { + "epoch": 0.4070485426732483, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 46673 + }, + { + "epoch": 0.4070572639584169, + "grad_norm": 0.07080078125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 46674 + }, + { + "epoch": 0.4070659852435855, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 46675 + }, + { + "epoch": 0.4070747065287541, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 46676 + }, + { + "epoch": 0.40708342781392265, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 46677 + }, + { + "epoch": 0.40709214909909125, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 46678 + }, + { + "epoch": 0.40710087038425985, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 46679 + }, + { + "epoch": 0.4071095916694284, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 46680 + }, + { + "epoch": 0.407118312954597, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 46681 + }, + { + "epoch": 0.4071270342397656, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 46682 + }, + { + "epoch": 0.40713575552493414, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 46683 + }, + { + "epoch": 0.40714447681010274, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 46684 + }, + { + "epoch": 0.40715319809527134, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 46685 + }, + { + "epoch": 0.4071619193804399, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9841, + "step": 46686 + }, + { + "epoch": 0.4071706406656085, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 46687 + }, + { + "epoch": 0.4071793619507771, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 46688 + }, + { + "epoch": 0.40718808323594563, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 46689 + }, + { + "epoch": 0.40719680452111423, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 46690 + }, + { + "epoch": 0.40720552580628283, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 46691 + }, + { + "epoch": 0.4072142470914514, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 46692 + }, + { + "epoch": 0.40722296837662, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9813, + "step": 46693 + }, + { + "epoch": 0.4072316896617886, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 46694 + }, + { + "epoch": 0.4072404109469571, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 46695 + }, + { + "epoch": 0.4072491322321257, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 46696 + }, + { + "epoch": 0.4072578535172943, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 46697 + }, + { + "epoch": 0.40726657480246287, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 46698 + }, + { + "epoch": 0.40727529608763147, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0308, + "step": 46699 + }, + { + "epoch": 0.40728401737280007, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 46700 + }, + { + "epoch": 0.4072927386579686, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 46701 + }, + { + "epoch": 0.4073014599431372, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 46702 + }, + { + "epoch": 0.4073101812283058, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 46703 + }, + { + "epoch": 0.4073189025134744, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 46704 + }, + { + "epoch": 0.40732762379864296, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 46705 + }, + { + "epoch": 0.40733634508381156, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 46706 + }, + { + "epoch": 0.40734506636898016, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 46707 + }, + { + "epoch": 0.4073537876541487, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 46708 + }, + { + "epoch": 0.4073625089393173, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 46709 + }, + { + "epoch": 0.4073712302244859, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 46710 + }, + { + "epoch": 0.40737995150965445, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 46711 + }, + { + "epoch": 0.40738867279482305, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 46712 + }, + { + "epoch": 0.40739739407999165, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 46713 + }, + { + "epoch": 0.4074061153651602, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 46714 + }, + { + "epoch": 0.4074148366503288, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 46715 + }, + { + "epoch": 0.4074235579354974, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 46716 + }, + { + "epoch": 0.40743227922066594, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 46717 + }, + { + "epoch": 0.40744100050583454, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9841, + "step": 46718 + }, + { + "epoch": 0.40744972179100314, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 46719 + }, + { + "epoch": 0.4074584430761717, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 46720 + }, + { + "epoch": 0.4074671643613403, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 46721 + }, + { + "epoch": 0.4074758856465089, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 46722 + }, + { + "epoch": 0.40748460693167743, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 46723 + }, + { + "epoch": 0.40749332821684603, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 46724 + }, + { + "epoch": 0.40750204950201463, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 46725 + }, + { + "epoch": 0.4075107707871832, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 46726 + }, + { + "epoch": 0.4075194920723518, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 46727 + }, + { + "epoch": 0.4075282133575204, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 46728 + }, + { + "epoch": 0.4075369346426889, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 46729 + }, + { + "epoch": 0.4075456559278575, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 46730 + }, + { + "epoch": 0.4075543772130261, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 46731 + }, + { + "epoch": 0.4075630984981947, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 46732 + }, + { + "epoch": 0.40757181978336326, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 46733 + }, + { + "epoch": 0.40758054106853187, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 46734 + }, + { + "epoch": 0.40758926235370047, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 46735 + }, + { + "epoch": 0.407597983638869, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 46736 + }, + { + "epoch": 0.4076067049240376, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 46737 + }, + { + "epoch": 0.4076154262092062, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 46738 + }, + { + "epoch": 0.40762414749437476, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 46739 + }, + { + "epoch": 0.40763286877954336, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 46740 + }, + { + "epoch": 0.40764159006471196, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 46741 + }, + { + "epoch": 0.4076503113498805, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 46742 + }, + { + "epoch": 0.4076590326350491, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 46743 + }, + { + "epoch": 0.4076677539202177, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 46744 + }, + { + "epoch": 0.40767647520538625, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 46745 + }, + { + "epoch": 0.40768519649055485, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 46746 + }, + { + "epoch": 0.40769391777572345, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 46747 + }, + { + "epoch": 0.407702639060892, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 46748 + }, + { + "epoch": 0.4077113603460606, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 46749 + }, + { + "epoch": 0.4077200816312292, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 46750 + }, + { + "epoch": 0.40772880291639774, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 46751 + }, + { + "epoch": 0.40773752420156634, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 46752 + }, + { + "epoch": 0.40774624548673494, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 46753 + }, + { + "epoch": 0.4077549667719035, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 46754 + }, + { + "epoch": 0.4077636880570721, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 46755 + }, + { + "epoch": 0.4077724093422407, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 46756 + }, + { + "epoch": 0.4077811306274092, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 46757 + }, + { + "epoch": 0.4077898519125778, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 46758 + }, + { + "epoch": 0.40779857319774643, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 46759 + }, + { + "epoch": 0.40780729448291503, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 46760 + }, + { + "epoch": 0.4078160157680836, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 46761 + }, + { + "epoch": 0.4078247370532522, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9848, + "step": 46762 + }, + { + "epoch": 0.4078334583384208, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 46763 + }, + { + "epoch": 0.4078421796235893, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9857, + "step": 46764 + }, + { + "epoch": 0.4078509009087579, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 46765 + }, + { + "epoch": 0.4078596221939265, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 46766 + }, + { + "epoch": 0.40786834347909506, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 46767 + }, + { + "epoch": 0.40787706476426366, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 46768 + }, + { + "epoch": 0.40788578604943226, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 46769 + }, + { + "epoch": 0.4078945073346008, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 46770 + }, + { + "epoch": 0.4079032286197694, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 46771 + }, + { + "epoch": 0.407911949904938, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 46772 + }, + { + "epoch": 0.40792067119010655, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 46773 + }, + { + "epoch": 0.40792939247527515, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 46774 + }, + { + "epoch": 0.40793811376044375, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 46775 + }, + { + "epoch": 0.4079468350456123, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 46776 + }, + { + "epoch": 0.4079555563307809, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 46777 + }, + { + "epoch": 0.4079642776159495, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9782, + "step": 46778 + }, + { + "epoch": 0.40797299890111804, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 46779 + }, + { + "epoch": 0.40798172018628664, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 46780 + }, + { + "epoch": 0.40799044147145525, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 46781 + }, + { + "epoch": 0.4079991627566238, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 46782 + }, + { + "epoch": 0.4080078840417924, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 46783 + }, + { + "epoch": 0.408016605326961, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 46784 + }, + { + "epoch": 0.4080253266121296, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 46785 + }, + { + "epoch": 0.40803404789729814, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 46786 + }, + { + "epoch": 0.40804276918246674, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 46787 + }, + { + "epoch": 0.40805149046763534, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 46788 + }, + { + "epoch": 0.4080602117528039, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 46789 + }, + { + "epoch": 0.4080689330379725, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 46790 + }, + { + "epoch": 0.4080776543231411, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 46791 + }, + { + "epoch": 0.4080863756083096, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 46792 + }, + { + "epoch": 0.4080950968934782, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 46793 + }, + { + "epoch": 0.4081038181786468, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 46794 + }, + { + "epoch": 0.40811253946381537, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 46795 + }, + { + "epoch": 0.40812126074898397, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 46796 + }, + { + "epoch": 0.40812998203415257, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 46797 + }, + { + "epoch": 0.4081387033193211, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 46798 + }, + { + "epoch": 0.4081474246044897, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 46799 + }, + { + "epoch": 0.4081561458896583, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 46800 + }, + { + "epoch": 0.40816486717482686, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 46801 + }, + { + "epoch": 0.40817358845999546, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 46802 + }, + { + "epoch": 0.40818230974516406, + "grad_norm": 0.2265625, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 46803 + }, + { + "epoch": 0.4081910310303326, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 46804 + }, + { + "epoch": 0.4081997523155012, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 46805 + }, + { + "epoch": 0.4082084736006698, + "grad_norm": 0.318359375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 46806 + }, + { + "epoch": 0.40821719488583835, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 46807 + }, + { + "epoch": 0.40822591617100695, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 46808 + }, + { + "epoch": 0.40823463745617555, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 46809 + }, + { + "epoch": 0.4082433587413441, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.98, + "step": 46810 + }, + { + "epoch": 0.4082520800265127, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 46811 + }, + { + "epoch": 0.4082608013116813, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 46812 + }, + { + "epoch": 0.4082695225968499, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 46813 + }, + { + "epoch": 0.40827824388201844, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 46814 + }, + { + "epoch": 0.40828696516718704, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 46815 + }, + { + "epoch": 0.40829568645235564, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 46816 + }, + { + "epoch": 0.4083044077375242, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 46817 + }, + { + "epoch": 0.4083131290226928, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 46818 + }, + { + "epoch": 0.4083218503078614, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 46819 + }, + { + "epoch": 0.40833057159302993, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 46820 + }, + { + "epoch": 0.40833929287819853, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 46821 + }, + { + "epoch": 0.40834801416336713, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 46822 + }, + { + "epoch": 0.4083567354485357, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 46823 + }, + { + "epoch": 0.4083654567337043, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 46824 + }, + { + "epoch": 0.4083741780188729, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 46825 + }, + { + "epoch": 0.4083828993040414, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 46826 + }, + { + "epoch": 0.40839162058921, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 46827 + }, + { + "epoch": 0.4084003418743786, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 46828 + }, + { + "epoch": 0.40840906315954717, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 46829 + }, + { + "epoch": 0.40841778444471577, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 46830 + }, + { + "epoch": 0.40842650572988437, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 46831 + }, + { + "epoch": 0.4084352270150529, + "grad_norm": 0.35546875, + "learning_rate": 0.0005, + "loss": 0.9843, + "step": 46832 + }, + { + "epoch": 0.4084439483002215, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 46833 + }, + { + "epoch": 0.4084526695853901, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 46834 + }, + { + "epoch": 0.40846139087055866, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 46835 + }, + { + "epoch": 0.40847011215572726, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 46836 + }, + { + "epoch": 0.40847883344089586, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 46837 + }, + { + "epoch": 0.4084875547260644, + "grad_norm": 0.2431640625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 46838 + }, + { + "epoch": 0.408496276011233, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 46839 + }, + { + "epoch": 0.4085049972964016, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 46840 + }, + { + "epoch": 0.4085137185815702, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 46841 + }, + { + "epoch": 0.40852243986673875, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 46842 + }, + { + "epoch": 0.40853116115190735, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 46843 + }, + { + "epoch": 0.40853988243707595, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 46844 + }, + { + "epoch": 0.4085486037222445, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 46845 + }, + { + "epoch": 0.4085573250074131, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 46846 + }, + { + "epoch": 0.4085660462925817, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 46847 + }, + { + "epoch": 0.40857476757775024, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 46848 + }, + { + "epoch": 0.40858348886291884, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 46849 + }, + { + "epoch": 0.40859221014808744, + "grad_norm": 0.2158203125, + "learning_rate": 0.0005, + "loss": 0.9845, + "step": 46850 + }, + { + "epoch": 0.408600931433256, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 46851 + }, + { + "epoch": 0.4086096527184246, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 46852 + }, + { + "epoch": 0.4086183740035932, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 46853 + }, + { + "epoch": 0.40862709528876173, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 46854 + }, + { + "epoch": 0.40863581657393033, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 46855 + }, + { + "epoch": 0.40864453785909893, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 46856 + }, + { + "epoch": 0.4086532591442675, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 46857 + }, + { + "epoch": 0.4086619804294361, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 46858 + }, + { + "epoch": 0.4086707017146047, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 46859 + }, + { + "epoch": 0.4086794229997732, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 46860 + }, + { + "epoch": 0.4086881442849418, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 46861 + }, + { + "epoch": 0.4086968655701104, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 46862 + }, + { + "epoch": 0.40870558685527897, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 46863 + }, + { + "epoch": 0.40871430814044757, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 46864 + }, + { + "epoch": 0.40872302942561617, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 46865 + }, + { + "epoch": 0.4087317507107847, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 46866 + }, + { + "epoch": 0.4087404719959533, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 46867 + }, + { + "epoch": 0.4087491932811219, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 46868 + }, + { + "epoch": 0.4087579145662905, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 46869 + }, + { + "epoch": 0.40876663585145906, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 46870 + }, + { + "epoch": 0.40877535713662766, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0281, + "step": 46871 + }, + { + "epoch": 0.40878407842179626, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 46872 + }, + { + "epoch": 0.4087927997069648, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 46873 + }, + { + "epoch": 0.4088015209921334, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 46874 + }, + { + "epoch": 0.408810242277302, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 46875 + }, + { + "epoch": 0.40881896356247055, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 46876 + }, + { + "epoch": 0.40882768484763915, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 46877 + }, + { + "epoch": 0.40883640613280775, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 46878 + }, + { + "epoch": 0.4088451274179763, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 46879 + }, + { + "epoch": 0.4088538487031449, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 46880 + }, + { + "epoch": 0.4088625699883135, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 46881 + }, + { + "epoch": 0.40887129127348204, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 46882 + }, + { + "epoch": 0.40888001255865064, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 46883 + }, + { + "epoch": 0.40888873384381924, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 46884 + }, + { + "epoch": 0.4088974551289878, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 46885 + }, + { + "epoch": 0.4089061764141564, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 46886 + }, + { + "epoch": 0.408914897699325, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 46887 + }, + { + "epoch": 0.40892361898449353, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 46888 + }, + { + "epoch": 0.40893234026966213, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 46889 + }, + { + "epoch": 0.40894106155483073, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 46890 + }, + { + "epoch": 0.4089497828399993, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 46891 + }, + { + "epoch": 0.4089585041251679, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 46892 + }, + { + "epoch": 0.4089672254103365, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 46893 + }, + { + "epoch": 0.4089759466955051, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 46894 + }, + { + "epoch": 0.4089846679806736, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 46895 + }, + { + "epoch": 0.4089933892658422, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 46896 + }, + { + "epoch": 0.4090021105510108, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 46897 + }, + { + "epoch": 0.40901083183617937, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 46898 + }, + { + "epoch": 0.40901955312134797, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9737, + "step": 46899 + }, + { + "epoch": 0.40902827440651657, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 46900 + }, + { + "epoch": 0.4090369956916851, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 46901 + }, + { + "epoch": 0.4090457169768537, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 46902 + }, + { + "epoch": 0.4090544382620223, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 46903 + }, + { + "epoch": 0.40906315954719086, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 46904 + }, + { + "epoch": 0.40907188083235946, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 46905 + }, + { + "epoch": 0.40908060211752806, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 46906 + }, + { + "epoch": 0.4090893234026966, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 46907 + }, + { + "epoch": 0.4090980446878652, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 46908 + }, + { + "epoch": 0.4091067659730338, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 46909 + }, + { + "epoch": 0.40911548725820235, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 46910 + }, + { + "epoch": 0.40912420854337095, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 46911 + }, + { + "epoch": 0.40913292982853955, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 46912 + }, + { + "epoch": 0.4091416511137081, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 46913 + }, + { + "epoch": 0.4091503723988767, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 46914 + }, + { + "epoch": 0.4091590936840453, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 46915 + }, + { + "epoch": 0.40916781496921384, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 46916 + }, + { + "epoch": 0.40917653625438244, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 46917 + }, + { + "epoch": 0.40918525753955104, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 46918 + }, + { + "epoch": 0.4091939788247196, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 46919 + }, + { + "epoch": 0.4092027001098882, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 46920 + }, + { + "epoch": 0.4092114213950568, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 46921 + }, + { + "epoch": 0.4092201426802254, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 46922 + }, + { + "epoch": 0.40922886396539393, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 46923 + }, + { + "epoch": 0.40923758525056253, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 46924 + }, + { + "epoch": 0.40924630653573113, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 46925 + }, + { + "epoch": 0.4092550278208997, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 46926 + }, + { + "epoch": 0.4092637491060683, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 46927 + }, + { + "epoch": 0.4092724703912369, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 46928 + }, + { + "epoch": 0.4092811916764054, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 46929 + }, + { + "epoch": 0.409289912961574, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 46930 + }, + { + "epoch": 0.4092986342467426, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 46931 + }, + { + "epoch": 0.40930735553191117, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 46932 + }, + { + "epoch": 0.40931607681707977, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 46933 + }, + { + "epoch": 0.40932479810224837, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 46934 + }, + { + "epoch": 0.4093335193874169, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 46935 + }, + { + "epoch": 0.4093422406725855, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 46936 + }, + { + "epoch": 0.4093509619577541, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 46937 + }, + { + "epoch": 0.40935968324292266, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 46938 + }, + { + "epoch": 0.40936840452809126, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 46939 + }, + { + "epoch": 0.40937712581325986, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 46940 + }, + { + "epoch": 0.4093858470984284, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 46941 + }, + { + "epoch": 0.409394568383597, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 46942 + }, + { + "epoch": 0.4094032896687656, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 46943 + }, + { + "epoch": 0.40941201095393415, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 46944 + }, + { + "epoch": 0.40942073223910275, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 46945 + }, + { + "epoch": 0.40942945352427135, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 46946 + }, + { + "epoch": 0.4094381748094399, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 46947 + }, + { + "epoch": 0.4094468960946085, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 46948 + }, + { + "epoch": 0.4094556173797771, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 46949 + }, + { + "epoch": 0.4094643386649457, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 46950 + }, + { + "epoch": 0.40947305995011424, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 46951 + }, + { + "epoch": 0.40948178123528284, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 46952 + }, + { + "epoch": 0.40949050252045144, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 46953 + }, + { + "epoch": 0.40949922380562, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 46954 + }, + { + "epoch": 0.4095079450907886, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 46955 + }, + { + "epoch": 0.4095166663759572, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 46956 + }, + { + "epoch": 0.40952538766112573, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 46957 + }, + { + "epoch": 0.40953410894629433, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 46958 + }, + { + "epoch": 0.40954283023146293, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 46959 + }, + { + "epoch": 0.4095515515166315, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 46960 + }, + { + "epoch": 0.4095602728018001, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 46961 + }, + { + "epoch": 0.4095689940869687, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 46962 + }, + { + "epoch": 0.4095777153721372, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 46963 + }, + { + "epoch": 0.4095864366573058, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 46964 + }, + { + "epoch": 0.4095951579424744, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 46965 + }, + { + "epoch": 0.40960387922764296, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 46966 + }, + { + "epoch": 0.40961260051281156, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 46967 + }, + { + "epoch": 0.40962132179798016, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 46968 + }, + { + "epoch": 0.4096300430831487, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 46969 + }, + { + "epoch": 0.4096387643683173, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 46970 + }, + { + "epoch": 0.4096474856534859, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 46971 + }, + { + "epoch": 0.40965620693865445, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 46972 + }, + { + "epoch": 0.40966492822382305, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 46973 + }, + { + "epoch": 0.40967364950899166, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 46974 + }, + { + "epoch": 0.4096823707941602, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 46975 + }, + { + "epoch": 0.4096910920793288, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 46976 + }, + { + "epoch": 0.4096998133644974, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 46977 + }, + { + "epoch": 0.409708534649666, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 46978 + }, + { + "epoch": 0.40971725593483455, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 46979 + }, + { + "epoch": 0.40972597722000315, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 46980 + }, + { + "epoch": 0.40973469850517175, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 46981 + }, + { + "epoch": 0.4097434197903403, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 46982 + }, + { + "epoch": 0.4097521410755089, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 46983 + }, + { + "epoch": 0.4097608623606775, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 46984 + }, + { + "epoch": 0.40976958364584604, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 46985 + }, + { + "epoch": 0.40977830493101464, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 46986 + }, + { + "epoch": 0.40978702621618324, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9801, + "step": 46987 + }, + { + "epoch": 0.4097957475013518, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 46988 + }, + { + "epoch": 0.4098044687865204, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 46989 + }, + { + "epoch": 0.409813190071689, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 46990 + }, + { + "epoch": 0.4098219113568575, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 46991 + }, + { + "epoch": 0.4098306326420261, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 46992 + }, + { + "epoch": 0.4098393539271947, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9864, + "step": 46993 + }, + { + "epoch": 0.40984807521236327, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 46994 + }, + { + "epoch": 0.4098567964975319, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 46995 + }, + { + "epoch": 0.4098655177827005, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 46996 + }, + { + "epoch": 0.409874239067869, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 46997 + }, + { + "epoch": 0.4098829603530376, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 46998 + }, + { + "epoch": 0.4098916816382062, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 46999 + }, + { + "epoch": 0.40990040292337476, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 47000 + }, + { + "epoch": 0.40990912420854336, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 47001 + }, + { + "epoch": 0.40991784549371196, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 47002 + }, + { + "epoch": 0.40992656677888056, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 47003 + }, + { + "epoch": 0.4099352880640491, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 47004 + }, + { + "epoch": 0.4099440093492177, + "grad_norm": 0.279296875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 47005 + }, + { + "epoch": 0.4099527306343863, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 47006 + }, + { + "epoch": 0.40996145191955485, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 47007 + }, + { + "epoch": 0.40997017320472345, + "grad_norm": 0.2294921875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 47008 + }, + { + "epoch": 0.40997889448989205, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.977, + "step": 47009 + }, + { + "epoch": 0.4099876157750606, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 47010 + }, + { + "epoch": 0.4099963370602292, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 47011 + }, + { + "epoch": 0.4100050583453978, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 47012 + }, + { + "epoch": 0.41001377963056634, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 47013 + }, + { + "epoch": 0.41002250091573494, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 47014 + }, + { + "epoch": 0.41003122220090354, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 47015 + }, + { + "epoch": 0.4100399434860721, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 47016 + }, + { + "epoch": 0.4100486647712407, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 47017 + }, + { + "epoch": 0.4100573860564093, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 47018 + }, + { + "epoch": 0.41006610734157783, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 47019 + }, + { + "epoch": 0.41007482862674643, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 47020 + }, + { + "epoch": 0.41008354991191504, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 47021 + }, + { + "epoch": 0.4100922711970836, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 47022 + }, + { + "epoch": 0.4101009924822522, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 47023 + }, + { + "epoch": 0.4101097137674208, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0299, + "step": 47024 + }, + { + "epoch": 0.4101184350525893, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 47025 + }, + { + "epoch": 0.4101271563377579, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 47026 + }, + { + "epoch": 0.4101358776229265, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9806, + "step": 47027 + }, + { + "epoch": 0.41014459890809507, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 47028 + }, + { + "epoch": 0.41015332019326367, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 47029 + }, + { + "epoch": 0.41016204147843227, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 47030 + }, + { + "epoch": 0.41017076276360087, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 47031 + }, + { + "epoch": 0.4101794840487694, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 47032 + }, + { + "epoch": 0.410188205333938, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 47033 + }, + { + "epoch": 0.4101969266191066, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 47034 + }, + { + "epoch": 0.41020564790427516, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 47035 + }, + { + "epoch": 0.41021436918944376, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 47036 + }, + { + "epoch": 0.41022309047461236, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 47037 + }, + { + "epoch": 0.4102318117597809, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 47038 + }, + { + "epoch": 0.4102405330449495, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 47039 + }, + { + "epoch": 0.4102492543301181, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 47040 + }, + { + "epoch": 0.41025797561528665, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 47041 + }, + { + "epoch": 0.41026669690045525, + "grad_norm": 0.228515625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 47042 + }, + { + "epoch": 0.41027541818562385, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 47043 + }, + { + "epoch": 0.4102841394707924, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 47044 + }, + { + "epoch": 0.410292860755961, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 47045 + }, + { + "epoch": 0.4103015820411296, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 47046 + }, + { + "epoch": 0.41031030332629814, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 47047 + }, + { + "epoch": 0.41031902461146674, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 47048 + }, + { + "epoch": 0.41032774589663534, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 47049 + }, + { + "epoch": 0.4103364671818039, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 47050 + }, + { + "epoch": 0.4103451884669725, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 47051 + }, + { + "epoch": 0.4103539097521411, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 47052 + }, + { + "epoch": 0.41036263103730963, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 47053 + }, + { + "epoch": 0.41037135232247823, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 47054 + }, + { + "epoch": 0.41038007360764683, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 47055 + }, + { + "epoch": 0.4103887948928154, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 47056 + }, + { + "epoch": 0.410397516177984, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 47057 + }, + { + "epoch": 0.4104062374631526, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 47058 + }, + { + "epoch": 0.4104149587483212, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 47059 + }, + { + "epoch": 0.4104236800334897, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 47060 + }, + { + "epoch": 0.4104324013186583, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 47061 + }, + { + "epoch": 0.4104411226038269, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 47062 + }, + { + "epoch": 0.41044984388899547, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 47063 + }, + { + "epoch": 0.41045856517416407, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 47064 + }, + { + "epoch": 0.41046728645933267, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 47065 + }, + { + "epoch": 0.4104760077445012, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 47066 + }, + { + "epoch": 0.4104847290296698, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 47067 + }, + { + "epoch": 0.4104934503148384, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9702, + "step": 47068 + }, + { + "epoch": 0.41050217160000696, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 47069 + }, + { + "epoch": 0.41051089288517556, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 47070 + }, + { + "epoch": 0.41051961417034416, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 47071 + }, + { + "epoch": 0.4105283354555127, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 47072 + }, + { + "epoch": 0.4105370567406813, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9796, + "step": 47073 + }, + { + "epoch": 0.4105457780258499, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 47074 + }, + { + "epoch": 0.41055449931101845, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 47075 + }, + { + "epoch": 0.41056322059618705, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.035, + "step": 47076 + }, + { + "epoch": 0.41057194188135565, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 47077 + }, + { + "epoch": 0.4105806631665242, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 47078 + }, + { + "epoch": 0.4105893844516928, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 47079 + }, + { + "epoch": 0.4105981057368614, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 47080 + }, + { + "epoch": 0.41060682702202994, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 47081 + }, + { + "epoch": 0.41061554830719854, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 47082 + }, + { + "epoch": 0.41062426959236714, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 47083 + }, + { + "epoch": 0.4106329908775357, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 47084 + }, + { + "epoch": 0.4106417121627043, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 47085 + }, + { + "epoch": 0.4106504334478729, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 47086 + }, + { + "epoch": 0.4106591547330415, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 47087 + }, + { + "epoch": 0.41066787601821003, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 47088 + }, + { + "epoch": 0.41067659730337863, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 47089 + }, + { + "epoch": 0.41068531858854723, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 47090 + }, + { + "epoch": 0.4106940398737158, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 47091 + }, + { + "epoch": 0.4107027611588844, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 47092 + }, + { + "epoch": 0.410711482444053, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 47093 + }, + { + "epoch": 0.4107202037292215, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 47094 + }, + { + "epoch": 0.4107289250143901, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 47095 + }, + { + "epoch": 0.4107376462995587, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 47096 + }, + { + "epoch": 0.41074636758472727, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 47097 + }, + { + "epoch": 0.41075508886989587, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 47098 + }, + { + "epoch": 0.41076381015506447, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 47099 + }, + { + "epoch": 0.410772531440233, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 47100 + }, + { + "epoch": 0.4107812527254016, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 47101 + }, + { + "epoch": 0.4107899740105702, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 47102 + }, + { + "epoch": 0.41079869529573876, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 47103 + }, + { + "epoch": 0.41080741658090736, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 47104 + }, + { + "epoch": 0.41081613786607596, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 47105 + }, + { + "epoch": 0.4108248591512445, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 47106 + }, + { + "epoch": 0.4108335804364131, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 47107 + }, + { + "epoch": 0.4108423017215817, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 47108 + }, + { + "epoch": 0.41085102300675025, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 47109 + }, + { + "epoch": 0.41085974429191885, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 47110 + }, + { + "epoch": 0.41086846557708745, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 47111 + }, + { + "epoch": 0.41087718686225605, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 47112 + }, + { + "epoch": 0.4108859081474246, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 47113 + }, + { + "epoch": 0.4108946294325932, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 47114 + }, + { + "epoch": 0.4109033507177618, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 47115 + }, + { + "epoch": 0.41091207200293034, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 47116 + }, + { + "epoch": 0.41092079328809894, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 47117 + }, + { + "epoch": 0.41092951457326754, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 47118 + }, + { + "epoch": 0.4109382358584361, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 47119 + }, + { + "epoch": 0.4109469571436047, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 47120 + }, + { + "epoch": 0.4109556784287733, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 47121 + }, + { + "epoch": 0.41096439971394183, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 47122 + }, + { + "epoch": 0.41097312099911043, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 47123 + }, + { + "epoch": 0.41098184228427903, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 47124 + }, + { + "epoch": 0.4109905635694476, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 47125 + }, + { + "epoch": 0.4109992848546162, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 47126 + }, + { + "epoch": 0.4110080061397848, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 47127 + }, + { + "epoch": 0.4110167274249533, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 47128 + }, + { + "epoch": 0.4110254487101219, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 47129 + }, + { + "epoch": 0.4110341699952905, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 47130 + }, + { + "epoch": 0.41104289128045907, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 47131 + }, + { + "epoch": 0.41105161256562767, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 47132 + }, + { + "epoch": 0.41106033385079627, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 47133 + }, + { + "epoch": 0.4110690551359648, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 47134 + }, + { + "epoch": 0.4110777764211334, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 47135 + }, + { + "epoch": 0.411086497706302, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 47136 + }, + { + "epoch": 0.41109521899147056, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 47137 + }, + { + "epoch": 0.41110394027663916, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 47138 + }, + { + "epoch": 0.41111266156180776, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 47139 + }, + { + "epoch": 0.41112138284697636, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 47140 + }, + { + "epoch": 0.4111301041321449, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 47141 + }, + { + "epoch": 0.4111388254173135, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 47142 + }, + { + "epoch": 0.4111475467024821, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 47143 + }, + { + "epoch": 0.41115626798765065, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 47144 + }, + { + "epoch": 0.41116498927281925, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 47145 + }, + { + "epoch": 0.41117371055798785, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 47146 + }, + { + "epoch": 0.4111824318431564, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 47147 + }, + { + "epoch": 0.411191153128325, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 47148 + }, + { + "epoch": 0.4111998744134936, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 47149 + }, + { + "epoch": 0.41120859569866214, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 47150 + }, + { + "epoch": 0.41121731698383074, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 47151 + }, + { + "epoch": 0.41122603826899934, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 47152 + }, + { + "epoch": 0.4112347595541679, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 47153 + }, + { + "epoch": 0.4112434808393365, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 47154 + }, + { + "epoch": 0.4112522021245051, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 47155 + }, + { + "epoch": 0.41126092340967363, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 47156 + }, + { + "epoch": 0.41126964469484223, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 47157 + }, + { + "epoch": 0.41127836598001083, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 47158 + }, + { + "epoch": 0.4112870872651794, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 47159 + }, + { + "epoch": 0.411295808550348, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 47160 + }, + { + "epoch": 0.4113045298355166, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 47161 + }, + { + "epoch": 0.4113132511206851, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 47162 + }, + { + "epoch": 0.4113219724058537, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 47163 + }, + { + "epoch": 0.4113306936910223, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 47164 + }, + { + "epoch": 0.41133941497619086, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 47165 + }, + { + "epoch": 0.41134813626135946, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 47166 + }, + { + "epoch": 0.41135685754652807, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 47167 + }, + { + "epoch": 0.41136557883169667, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 47168 + }, + { + "epoch": 0.4113743001168652, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 47169 + }, + { + "epoch": 0.4113830214020338, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 47170 + }, + { + "epoch": 0.4113917426872024, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0295, + "step": 47171 + }, + { + "epoch": 0.41140046397237096, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 47172 + }, + { + "epoch": 0.41140918525753956, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 47173 + }, + { + "epoch": 0.41141790654270816, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 47174 + }, + { + "epoch": 0.4114266278278767, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 47175 + }, + { + "epoch": 0.4114353491130453, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 47176 + }, + { + "epoch": 0.4114440703982139, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 47177 + }, + { + "epoch": 0.41145279168338245, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 47178 + }, + { + "epoch": 0.41146151296855105, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 47179 + }, + { + "epoch": 0.41147023425371965, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 47180 + }, + { + "epoch": 0.4114789555388882, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 47181 + }, + { + "epoch": 0.4114876768240568, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 47182 + }, + { + "epoch": 0.4114963981092254, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 47183 + }, + { + "epoch": 0.41150511939439394, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 47184 + }, + { + "epoch": 0.41151384067956254, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.983, + "step": 47185 + }, + { + "epoch": 0.41152256196473114, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 47186 + }, + { + "epoch": 0.4115312832498997, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9845, + "step": 47187 + }, + { + "epoch": 0.4115400045350683, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 47188 + }, + { + "epoch": 0.4115487258202369, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 47189 + }, + { + "epoch": 0.4115574471054054, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 47190 + }, + { + "epoch": 0.411566168390574, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 47191 + }, + { + "epoch": 0.41157488967574263, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 47192 + }, + { + "epoch": 0.4115836109609112, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 47193 + }, + { + "epoch": 0.4115923322460798, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 47194 + }, + { + "epoch": 0.4116010535312484, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 47195 + }, + { + "epoch": 0.411609774816417, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 47196 + }, + { + "epoch": 0.4116184961015855, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 47197 + }, + { + "epoch": 0.4116272173867541, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 47198 + }, + { + "epoch": 0.4116359386719227, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 47199 + }, + { + "epoch": 0.41164465995709126, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 47200 + }, + { + "epoch": 0.41165338124225986, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 47201 + }, + { + "epoch": 0.41166210252742846, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 47202 + }, + { + "epoch": 0.411670823812597, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 47203 + }, + { + "epoch": 0.4116795450977656, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 47204 + }, + { + "epoch": 0.4116882663829342, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 47205 + }, + { + "epoch": 0.41169698766810275, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 47206 + }, + { + "epoch": 0.41170570895327135, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 47207 + }, + { + "epoch": 0.41171443023843995, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 47208 + }, + { + "epoch": 0.4117231515236085, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 47209 + }, + { + "epoch": 0.4117318728087771, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 47210 + }, + { + "epoch": 0.4117405940939457, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 47211 + }, + { + "epoch": 0.41174931537911424, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 47212 + }, + { + "epoch": 0.41175803666428284, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 47213 + }, + { + "epoch": 0.41176675794945145, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 47214 + }, + { + "epoch": 0.41177547923462, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 47215 + }, + { + "epoch": 0.4117842005197886, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 47216 + }, + { + "epoch": 0.4117929218049572, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 47217 + }, + { + "epoch": 0.41180164309012574, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 47218 + }, + { + "epoch": 0.41181036437529434, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 47219 + }, + { + "epoch": 0.41181908566046294, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 47220 + }, + { + "epoch": 0.41182780694563154, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 47221 + }, + { + "epoch": 0.4118365282308001, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 47222 + }, + { + "epoch": 0.4118452495159687, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 47223 + }, + { + "epoch": 0.4118539708011373, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 47224 + }, + { + "epoch": 0.4118626920863058, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 47225 + }, + { + "epoch": 0.4118714133714744, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 47226 + }, + { + "epoch": 0.411880134656643, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 47227 + }, + { + "epoch": 0.41188885594181157, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 47228 + }, + { + "epoch": 0.41189757722698017, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 47229 + }, + { + "epoch": 0.41190629851214877, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 47230 + }, + { + "epoch": 0.4119150197973173, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 47231 + }, + { + "epoch": 0.4119237410824859, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 47232 + }, + { + "epoch": 0.4119324623676545, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 47233 + }, + { + "epoch": 0.41194118365282306, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 47234 + }, + { + "epoch": 0.41194990493799166, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 47235 + }, + { + "epoch": 0.41195862622316026, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 47236 + }, + { + "epoch": 0.4119673475083288, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 47237 + }, + { + "epoch": 0.4119760687934974, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 47238 + }, + { + "epoch": 0.411984790078666, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 47239 + }, + { + "epoch": 0.41199351136383455, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 47240 + }, + { + "epoch": 0.41200223264900315, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 47241 + }, + { + "epoch": 0.41201095393417175, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 47242 + }, + { + "epoch": 0.4120196752193403, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9839, + "step": 47243 + }, + { + "epoch": 0.4120283965045089, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 47244 + }, + { + "epoch": 0.4120371177896775, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 47245 + }, + { + "epoch": 0.41204583907484604, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 47246 + }, + { + "epoch": 0.41205456036001464, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 47247 + }, + { + "epoch": 0.41206328164518324, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 47248 + }, + { + "epoch": 0.41207200293035184, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 47249 + }, + { + "epoch": 0.4120807242155204, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 47250 + }, + { + "epoch": 0.412089445500689, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 47251 + }, + { + "epoch": 0.4120981667858576, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 47252 + }, + { + "epoch": 0.41210688807102613, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 47253 + }, + { + "epoch": 0.41211560935619473, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 47254 + }, + { + "epoch": 0.41212433064136333, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 47255 + }, + { + "epoch": 0.4121330519265319, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 47256 + }, + { + "epoch": 0.4121417732117005, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 47257 + }, + { + "epoch": 0.4121504944968691, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 47258 + }, + { + "epoch": 0.4121592157820376, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 47259 + }, + { + "epoch": 0.4121679370672062, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 47260 + }, + { + "epoch": 0.4121766583523748, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0274, + "step": 47261 + }, + { + "epoch": 0.41218537963754337, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 47262 + }, + { + "epoch": 0.41219410092271197, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 47263 + }, + { + "epoch": 0.41220282220788057, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 47264 + }, + { + "epoch": 0.4122115434930491, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 47265 + }, + { + "epoch": 0.4122202647782177, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 47266 + }, + { + "epoch": 0.4122289860633863, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 47267 + }, + { + "epoch": 0.41223770734855486, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 47268 + }, + { + "epoch": 0.41224642863372346, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 47269 + }, + { + "epoch": 0.41225514991889206, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 47270 + }, + { + "epoch": 0.4122638712040606, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 47271 + }, + { + "epoch": 0.4122725924892292, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 47272 + }, + { + "epoch": 0.4122813137743978, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 47273 + }, + { + "epoch": 0.41229003505956635, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 47274 + }, + { + "epoch": 0.41229875634473495, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 47275 + }, + { + "epoch": 0.41230747762990355, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 47276 + }, + { + "epoch": 0.41231619891507215, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 47277 + }, + { + "epoch": 0.4123249202002407, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 47278 + }, + { + "epoch": 0.4123336414854093, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.982, + "step": 47279 + }, + { + "epoch": 0.4123423627705779, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 47280 + }, + { + "epoch": 0.41235108405574644, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 47281 + }, + { + "epoch": 0.41235980534091504, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 47282 + }, + { + "epoch": 0.41236852662608364, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 47283 + }, + { + "epoch": 0.4123772479112522, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 47284 + }, + { + "epoch": 0.4123859691964208, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 47285 + }, + { + "epoch": 0.4123946904815894, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 47286 + }, + { + "epoch": 0.41240341176675793, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 47287 + }, + { + "epoch": 0.41241213305192653, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 47288 + }, + { + "epoch": 0.41242085433709513, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 47289 + }, + { + "epoch": 0.4124295756222637, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 47290 + }, + { + "epoch": 0.4124382969074323, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 47291 + }, + { + "epoch": 0.4124470181926009, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 47292 + }, + { + "epoch": 0.4124557394777694, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 47293 + }, + { + "epoch": 0.412464460762938, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 47294 + }, + { + "epoch": 0.4124731820481066, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 47295 + }, + { + "epoch": 0.41248190333327517, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 47296 + }, + { + "epoch": 0.41249062461844377, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.028, + "step": 47297 + }, + { + "epoch": 0.41249934590361237, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 47298 + }, + { + "epoch": 0.4125080671887809, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 47299 + }, + { + "epoch": 0.4125167884739495, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 47300 + }, + { + "epoch": 0.4125255097591181, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 47301 + }, + { + "epoch": 0.41253423104428666, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 47302 + }, + { + "epoch": 0.41254295232945526, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 47303 + }, + { + "epoch": 0.41255167361462386, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 47304 + }, + { + "epoch": 0.41256039489979246, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 47305 + }, + { + "epoch": 0.412569116184961, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 47306 + }, + { + "epoch": 0.4125778374701296, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 47307 + }, + { + "epoch": 0.4125865587552982, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 47308 + }, + { + "epoch": 0.41259528004046675, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 47309 + }, + { + "epoch": 0.41260400132563535, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 47310 + }, + { + "epoch": 0.41261272261080395, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 47311 + }, + { + "epoch": 0.4126214438959725, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 47312 + }, + { + "epoch": 0.4126301651811411, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9863, + "step": 47313 + }, + { + "epoch": 0.4126388864663097, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 47314 + }, + { + "epoch": 0.41264760775147824, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 47315 + }, + { + "epoch": 0.41265632903664684, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 47316 + }, + { + "epoch": 0.41266505032181544, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 47317 + }, + { + "epoch": 0.412673771606984, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 47318 + }, + { + "epoch": 0.4126824928921526, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 47319 + }, + { + "epoch": 0.4126912141773212, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 47320 + }, + { + "epoch": 0.41269993546248973, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 47321 + }, + { + "epoch": 0.41270865674765833, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 47322 + }, + { + "epoch": 0.41271737803282693, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 47323 + }, + { + "epoch": 0.4127260993179955, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 47324 + }, + { + "epoch": 0.4127348206031641, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 47325 + }, + { + "epoch": 0.4127435418883327, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 47326 + }, + { + "epoch": 0.4127522631735012, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 47327 + }, + { + "epoch": 0.4127609844586698, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 47328 + }, + { + "epoch": 0.4127697057438384, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 47329 + }, + { + "epoch": 0.41277842702900697, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 47330 + }, + { + "epoch": 0.41278714831417557, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 47331 + }, + { + "epoch": 0.41279586959934417, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 47332 + }, + { + "epoch": 0.41280459088451277, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 47333 + }, + { + "epoch": 0.4128133121696813, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 47334 + }, + { + "epoch": 0.4128220334548499, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 47335 + }, + { + "epoch": 0.4128307547400185, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 47336 + }, + { + "epoch": 0.41283947602518706, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 47337 + }, + { + "epoch": 0.41284819731035566, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 47338 + }, + { + "epoch": 0.41285691859552426, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 47339 + }, + { + "epoch": 0.4128656398806928, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 47340 + }, + { + "epoch": 0.4128743611658614, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 47341 + }, + { + "epoch": 0.41288308245103, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.98, + "step": 47342 + }, + { + "epoch": 0.41289180373619855, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 47343 + }, + { + "epoch": 0.41290052502136715, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 47344 + }, + { + "epoch": 0.41290924630653575, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 47345 + }, + { + "epoch": 0.4129179675917043, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 47346 + }, + { + "epoch": 0.4129266888768729, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 47347 + }, + { + "epoch": 0.4129354101620415, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 47348 + }, + { + "epoch": 0.41294413144721004, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 47349 + }, + { + "epoch": 0.41295285273237864, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 47350 + }, + { + "epoch": 0.41296157401754724, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 47351 + }, + { + "epoch": 0.4129702953027158, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 47352 + }, + { + "epoch": 0.4129790165878844, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 47353 + }, + { + "epoch": 0.412987737873053, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 47354 + }, + { + "epoch": 0.41299645915822153, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 47355 + }, + { + "epoch": 0.41300518044339013, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 47356 + }, + { + "epoch": 0.41301390172855873, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 47357 + }, + { + "epoch": 0.41302262301372733, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 47358 + }, + { + "epoch": 0.4130313442988959, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 47359 + }, + { + "epoch": 0.4130400655840645, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 47360 + }, + { + "epoch": 0.4130487868692331, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 47361 + }, + { + "epoch": 0.4130575081544016, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 47362 + }, + { + "epoch": 0.4130662294395702, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 47363 + }, + { + "epoch": 0.4130749507247388, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 47364 + }, + { + "epoch": 0.41308367200990737, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9774, + "step": 47365 + }, + { + "epoch": 0.41309239329507597, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 47366 + }, + { + "epoch": 0.41310111458024457, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 47367 + }, + { + "epoch": 0.4131098358654131, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 47368 + }, + { + "epoch": 0.4131185571505817, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 47369 + }, + { + "epoch": 0.4131272784357503, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 47370 + }, + { + "epoch": 0.41313599972091886, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 47371 + }, + { + "epoch": 0.41314472100608746, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 47372 + }, + { + "epoch": 0.41315344229125606, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 47373 + }, + { + "epoch": 0.4131621635764246, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 47374 + }, + { + "epoch": 0.4131708848615932, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 47375 + }, + { + "epoch": 0.4131796061467618, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 47376 + }, + { + "epoch": 0.41318832743193035, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 47377 + }, + { + "epoch": 0.41319704871709895, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 47378 + }, + { + "epoch": 0.41320577000226755, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 47379 + }, + { + "epoch": 0.4132144912874361, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 47380 + }, + { + "epoch": 0.4132232125726047, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 47381 + }, + { + "epoch": 0.4132319338577733, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 47382 + }, + { + "epoch": 0.41324065514294184, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 47383 + }, + { + "epoch": 0.41324937642811044, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 47384 + }, + { + "epoch": 0.41325809771327904, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 47385 + }, + { + "epoch": 0.41326681899844764, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 47386 + }, + { + "epoch": 0.4132755402836162, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 47387 + }, + { + "epoch": 0.4132842615687848, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 47388 + }, + { + "epoch": 0.4132929828539534, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 47389 + }, + { + "epoch": 0.41330170413912193, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 47390 + }, + { + "epoch": 0.41331042542429053, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9784, + "step": 47391 + }, + { + "epoch": 0.41331914670945913, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 47392 + }, + { + "epoch": 0.4133278679946277, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 47393 + }, + { + "epoch": 0.4133365892797963, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 47394 + }, + { + "epoch": 0.4133453105649649, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 47395 + }, + { + "epoch": 0.4133540318501334, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 47396 + }, + { + "epoch": 0.413362753135302, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 47397 + }, + { + "epoch": 0.4133714744204706, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0304, + "step": 47398 + }, + { + "epoch": 0.41338019570563916, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 47399 + }, + { + "epoch": 0.41338891699080776, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9739, + "step": 47400 + }, + { + "epoch": 0.41339763827597636, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 47401 + }, + { + "epoch": 0.4134063595611449, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 47402 + }, + { + "epoch": 0.4134150808463135, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 47403 + }, + { + "epoch": 0.4134238021314821, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 47404 + }, + { + "epoch": 0.41343252341665065, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 47405 + }, + { + "epoch": 0.41344124470181925, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 47406 + }, + { + "epoch": 0.41344996598698786, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9748, + "step": 47407 + }, + { + "epoch": 0.4134586872721564, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 47408 + }, + { + "epoch": 0.413467408557325, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 47409 + }, + { + "epoch": 0.4134761298424936, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 47410 + }, + { + "epoch": 0.41348485112766215, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 47411 + }, + { + "epoch": 0.41349357241283075, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 47412 + }, + { + "epoch": 0.41350229369799935, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 47413 + }, + { + "epoch": 0.41351101498316795, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 47414 + }, + { + "epoch": 0.4135197362683365, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 47415 + }, + { + "epoch": 0.4135284575535051, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 47416 + }, + { + "epoch": 0.4135371788386737, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 47417 + }, + { + "epoch": 0.41354590012384224, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 47418 + }, + { + "epoch": 0.41355462140901084, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 47419 + }, + { + "epoch": 0.41356334269417944, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 47420 + }, + { + "epoch": 0.413572063979348, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 47421 + }, + { + "epoch": 0.4135807852645166, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 47422 + }, + { + "epoch": 0.4135895065496852, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 47423 + }, + { + "epoch": 0.4135982278348537, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 47424 + }, + { + "epoch": 0.4136069491200223, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 47425 + }, + { + "epoch": 0.4136156704051909, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 47426 + }, + { + "epoch": 0.41362439169035947, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 47427 + }, + { + "epoch": 0.41363311297552807, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 47428 + }, + { + "epoch": 0.4136418342606967, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 47429 + }, + { + "epoch": 0.4136505555458652, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 47430 + }, + { + "epoch": 0.4136592768310338, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 47431 + }, + { + "epoch": 0.4136679981162024, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 47432 + }, + { + "epoch": 0.41367671940137096, + "grad_norm": 0.326171875, + "learning_rate": 0.0005, + "loss": 0.9797, + "step": 47433 + }, + { + "epoch": 0.41368544068653956, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 47434 + }, + { + "epoch": 0.41369416197170816, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 47435 + }, + { + "epoch": 0.4137028832568767, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 47436 + }, + { + "epoch": 0.4137116045420453, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 47437 + }, + { + "epoch": 0.4137203258272139, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 47438 + }, + { + "epoch": 0.41372904711238245, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 47439 + }, + { + "epoch": 0.41373776839755105, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 47440 + }, + { + "epoch": 0.41374648968271965, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 47441 + }, + { + "epoch": 0.41375521096788825, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 47442 + }, + { + "epoch": 0.4137639322530568, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 47443 + }, + { + "epoch": 0.4137726535382254, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 47444 + }, + { + "epoch": 0.413781374823394, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 47445 + }, + { + "epoch": 0.41379009610856254, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 47446 + }, + { + "epoch": 0.41379881739373114, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 47447 + }, + { + "epoch": 0.41380753867889974, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 47448 + }, + { + "epoch": 0.4138162599640683, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 47449 + }, + { + "epoch": 0.4138249812492369, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 47450 + }, + { + "epoch": 0.4138337025344055, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 47451 + }, + { + "epoch": 0.41384242381957403, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 47452 + }, + { + "epoch": 0.41385114510474263, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 47453 + }, + { + "epoch": 0.41385986638991124, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 47454 + }, + { + "epoch": 0.4138685876750798, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 47455 + }, + { + "epoch": 0.4138773089602484, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 47456 + }, + { + "epoch": 0.413886030245417, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 47457 + }, + { + "epoch": 0.4138947515305855, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 47458 + }, + { + "epoch": 0.4139034728157541, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 47459 + }, + { + "epoch": 0.4139121941009227, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 47460 + }, + { + "epoch": 0.41392091538609127, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 47461 + }, + { + "epoch": 0.41392963667125987, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 47462 + }, + { + "epoch": 0.41393835795642847, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 47463 + }, + { + "epoch": 0.413947079241597, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 47464 + }, + { + "epoch": 0.4139558005267656, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 47465 + }, + { + "epoch": 0.4139645218119342, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 47466 + }, + { + "epoch": 0.4139732430971028, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 47467 + }, + { + "epoch": 0.41398196438227136, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0236, + "step": 47468 + }, + { + "epoch": 0.41399068566743996, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 47469 + }, + { + "epoch": 0.41399940695260856, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 47470 + }, + { + "epoch": 0.4140081282377771, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.985, + "step": 47471 + }, + { + "epoch": 0.4140168495229457, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 47472 + }, + { + "epoch": 0.4140255708081143, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 47473 + }, + { + "epoch": 0.41403429209328285, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 47474 + }, + { + "epoch": 0.41404301337845145, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 47475 + }, + { + "epoch": 0.41405173466362005, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 47476 + }, + { + "epoch": 0.4140604559487886, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 47477 + }, + { + "epoch": 0.4140691772339572, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 47478 + }, + { + "epoch": 0.4140778985191258, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 47479 + }, + { + "epoch": 0.41408661980429434, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 47480 + }, + { + "epoch": 0.41409534108946294, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 47481 + }, + { + "epoch": 0.41410406237463154, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 47482 + }, + { + "epoch": 0.4141127836598001, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 47483 + }, + { + "epoch": 0.4141215049449687, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 47484 + }, + { + "epoch": 0.4141302262301373, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 47485 + }, + { + "epoch": 0.41413894751530583, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 47486 + }, + { + "epoch": 0.41414766880047443, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 47487 + }, + { + "epoch": 0.41415639008564303, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 47488 + }, + { + "epoch": 0.4141651113708116, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9828, + "step": 47489 + }, + { + "epoch": 0.4141738326559802, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 47490 + }, + { + "epoch": 0.4141825539411488, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9767, + "step": 47491 + }, + { + "epoch": 0.4141912752263173, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 47492 + }, + { + "epoch": 0.4141999965114859, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 47493 + }, + { + "epoch": 0.4142087177966545, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 47494 + }, + { + "epoch": 0.4142174390818231, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 47495 + }, + { + "epoch": 0.41422616036699167, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 47496 + }, + { + "epoch": 0.41423488165216027, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 47497 + }, + { + "epoch": 0.41424360293732887, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 47498 + }, + { + "epoch": 0.4142523242224974, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 47499 + }, + { + "epoch": 0.414261045507666, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 47500 + }, + { + "epoch": 0.4142697667928346, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9824, + "step": 47501 + }, + { + "epoch": 0.41427848807800316, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 47502 + }, + { + "epoch": 0.41428720936317176, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 47503 + }, + { + "epoch": 0.41429593064834036, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 47504 + }, + { + "epoch": 0.4143046519335089, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 47505 + }, + { + "epoch": 0.4143133732186775, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 47506 + }, + { + "epoch": 0.4143220945038461, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 47507 + }, + { + "epoch": 0.41433081578901465, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 47508 + }, + { + "epoch": 0.41433953707418325, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 47509 + }, + { + "epoch": 0.41434825835935185, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 47510 + }, + { + "epoch": 0.4143569796445204, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 47511 + }, + { + "epoch": 0.414365700929689, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 47512 + }, + { + "epoch": 0.4143744222148576, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 47513 + }, + { + "epoch": 0.41438314350002614, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 47514 + }, + { + "epoch": 0.41439186478519474, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9716, + "step": 47515 + }, + { + "epoch": 0.41440058607036334, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 47516 + }, + { + "epoch": 0.4144093073555319, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 47517 + }, + { + "epoch": 0.4144180286407005, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 47518 + }, + { + "epoch": 0.4144267499258691, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 47519 + }, + { + "epoch": 0.41443547121103763, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 47520 + }, + { + "epoch": 0.41444419249620623, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 47521 + }, + { + "epoch": 0.41445291378137483, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 47522 + }, + { + "epoch": 0.41446163506654343, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 47523 + }, + { + "epoch": 0.414470356351712, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 47524 + }, + { + "epoch": 0.4144790776368806, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 47525 + }, + { + "epoch": 0.4144877989220492, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 47526 + }, + { + "epoch": 0.4144965202072177, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 47527 + }, + { + "epoch": 0.4145052414923863, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 47528 + }, + { + "epoch": 0.4145139627775549, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 47529 + }, + { + "epoch": 0.41452268406272347, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 47530 + }, + { + "epoch": 0.41453140534789207, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 47531 + }, + { + "epoch": 0.41454012663306067, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0309, + "step": 47532 + }, + { + "epoch": 0.4145488479182292, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 47533 + }, + { + "epoch": 0.4145575692033978, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 47534 + }, + { + "epoch": 0.4145662904885664, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 47535 + }, + { + "epoch": 0.41457501177373496, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 47536 + }, + { + "epoch": 0.41458373305890356, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 47537 + }, + { + "epoch": 0.41459245434407216, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 47538 + }, + { + "epoch": 0.4146011756292407, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 47539 + }, + { + "epoch": 0.4146098969144093, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 47540 + }, + { + "epoch": 0.4146186181995779, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 47541 + }, + { + "epoch": 0.41462733948474645, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 47542 + }, + { + "epoch": 0.41463606076991505, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 47543 + }, + { + "epoch": 0.41464478205508365, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 47544 + }, + { + "epoch": 0.4146535033402522, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9824, + "step": 47545 + }, + { + "epoch": 0.4146622246254208, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 47546 + }, + { + "epoch": 0.4146709459105894, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 47547 + }, + { + "epoch": 0.41467966719575794, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 47548 + }, + { + "epoch": 0.41468838848092654, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 47549 + }, + { + "epoch": 0.41469710976609514, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 47550 + }, + { + "epoch": 0.41470583105126374, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 47551 + }, + { + "epoch": 0.4147145523364323, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 47552 + }, + { + "epoch": 0.4147232736216009, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 47553 + }, + { + "epoch": 0.4147319949067695, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 47554 + }, + { + "epoch": 0.41474071619193803, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 47555 + }, + { + "epoch": 0.41474943747710663, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 47556 + }, + { + "epoch": 0.41475815876227523, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 47557 + }, + { + "epoch": 0.4147668800474438, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0339, + "step": 47558 + }, + { + "epoch": 0.4147756013326124, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 47559 + }, + { + "epoch": 0.414784322617781, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 47560 + }, + { + "epoch": 0.4147930439029495, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 47561 + }, + { + "epoch": 0.4148017651881181, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 47562 + }, + { + "epoch": 0.4148104864732867, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 47563 + }, + { + "epoch": 0.41481920775845527, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 47564 + }, + { + "epoch": 0.41482792904362387, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 47565 + }, + { + "epoch": 0.41483665032879247, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 47566 + }, + { + "epoch": 0.414845371613961, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 47567 + }, + { + "epoch": 0.4148540928991296, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 47568 + }, + { + "epoch": 0.4148628141842982, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 47569 + }, + { + "epoch": 0.41487153546946676, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 47570 + }, + { + "epoch": 0.41488025675463536, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 47571 + }, + { + "epoch": 0.41488897803980396, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 47572 + }, + { + "epoch": 0.4148976993249725, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 47573 + }, + { + "epoch": 0.4149064206101411, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 47574 + }, + { + "epoch": 0.4149151418953097, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 47575 + }, + { + "epoch": 0.4149238631804783, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9854, + "step": 47576 + }, + { + "epoch": 0.41493258446564685, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 47577 + }, + { + "epoch": 0.41494130575081545, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 47578 + }, + { + "epoch": 0.41495002703598405, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 47579 + }, + { + "epoch": 0.4149587483211526, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9795, + "step": 47580 + }, + { + "epoch": 0.4149674696063212, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 47581 + }, + { + "epoch": 0.4149761908914898, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 47582 + }, + { + "epoch": 0.41498491217665834, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 47583 + }, + { + "epoch": 0.41499363346182694, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 47584 + }, + { + "epoch": 0.41500235474699554, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 47585 + }, + { + "epoch": 0.4150110760321641, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 47586 + }, + { + "epoch": 0.4150197973173327, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 47587 + }, + { + "epoch": 0.4150285186025013, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 47588 + }, + { + "epoch": 0.41503723988766983, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 47589 + }, + { + "epoch": 0.41504596117283843, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 47590 + }, + { + "epoch": 0.41505468245800703, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 47591 + }, + { + "epoch": 0.4150634037431756, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 47592 + }, + { + "epoch": 0.4150721250283442, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 47593 + }, + { + "epoch": 0.4150808463135128, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 47594 + }, + { + "epoch": 0.4150895675986813, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9755, + "step": 47595 + }, + { + "epoch": 0.4150982888838499, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 47596 + }, + { + "epoch": 0.4151070101690185, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 47597 + }, + { + "epoch": 0.41511573145418706, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 47598 + }, + { + "epoch": 0.41512445273935566, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 47599 + }, + { + "epoch": 0.41513317402452427, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 47600 + }, + { + "epoch": 0.4151418953096928, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 47601 + }, + { + "epoch": 0.4151506165948614, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 47602 + }, + { + "epoch": 0.41515933788003, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 47603 + }, + { + "epoch": 0.4151680591651986, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 47604 + }, + { + "epoch": 0.41517678045036716, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 47605 + }, + { + "epoch": 0.41518550173553576, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 47606 + }, + { + "epoch": 0.41519422302070436, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 47607 + }, + { + "epoch": 0.4152029443058729, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 47608 + }, + { + "epoch": 0.4152116655910415, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 47609 + }, + { + "epoch": 0.4152203868762101, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 47610 + }, + { + "epoch": 0.41522910816137865, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 47611 + }, + { + "epoch": 0.41523782944654725, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 47612 + }, + { + "epoch": 0.41524655073171585, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 47613 + }, + { + "epoch": 0.4152552720168844, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 47614 + }, + { + "epoch": 0.415263993302053, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 47615 + }, + { + "epoch": 0.4152727145872216, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 47616 + }, + { + "epoch": 0.41528143587239014, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 47617 + }, + { + "epoch": 0.41529015715755874, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 47618 + }, + { + "epoch": 0.41529887844272734, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 47619 + }, + { + "epoch": 0.4153075997278959, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 47620 + }, + { + "epoch": 0.4153163210130645, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 47621 + }, + { + "epoch": 0.4153250422982331, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9803, + "step": 47622 + }, + { + "epoch": 0.4153337635834016, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 47623 + }, + { + "epoch": 0.4153424848685702, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 47624 + }, + { + "epoch": 0.41535120615373883, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 47625 + }, + { + "epoch": 0.4153599274389074, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 47626 + }, + { + "epoch": 0.415368648724076, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 47627 + }, + { + "epoch": 0.4153773700092446, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 47628 + }, + { + "epoch": 0.4153860912944131, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 47629 + }, + { + "epoch": 0.4153948125795817, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 47630 + }, + { + "epoch": 0.4154035338647503, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 47631 + }, + { + "epoch": 0.4154122551499189, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 47632 + }, + { + "epoch": 0.41542097643508746, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 47633 + }, + { + "epoch": 0.41542969772025606, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 47634 + }, + { + "epoch": 0.41543841900542466, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 47635 + }, + { + "epoch": 0.4154471402905932, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 47636 + }, + { + "epoch": 0.4154558615757618, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 47637 + }, + { + "epoch": 0.4154645828609304, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 47638 + }, + { + "epoch": 0.41547330414609895, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 47639 + }, + { + "epoch": 0.41548202543126755, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0268, + "step": 47640 + }, + { + "epoch": 0.41549074671643615, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 47641 + }, + { + "epoch": 0.4154994680016047, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 47642 + }, + { + "epoch": 0.4155081892867733, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 47643 + }, + { + "epoch": 0.4155169105719419, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 47644 + }, + { + "epoch": 0.41552563185711044, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 47645 + }, + { + "epoch": 0.41553435314227904, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 47646 + }, + { + "epoch": 0.41554307442744765, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 47647 + }, + { + "epoch": 0.4155517957126162, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9798, + "step": 47648 + }, + { + "epoch": 0.4155605169977848, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 47649 + }, + { + "epoch": 0.4155692382829534, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 47650 + }, + { + "epoch": 0.41557795956812194, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 47651 + }, + { + "epoch": 0.41558668085329054, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 47652 + }, + { + "epoch": 0.41559540213845914, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 47653 + }, + { + "epoch": 0.4156041234236277, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 47654 + }, + { + "epoch": 0.4156128447087963, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 47655 + }, + { + "epoch": 0.4156215659939649, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 47656 + }, + { + "epoch": 0.4156302872791334, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 47657 + }, + { + "epoch": 0.415639008564302, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 47658 + }, + { + "epoch": 0.4156477298494706, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 47659 + }, + { + "epoch": 0.4156564511346392, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 47660 + }, + { + "epoch": 0.41566517241980777, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 47661 + }, + { + "epoch": 0.41567389370497637, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 47662 + }, + { + "epoch": 0.41568261499014497, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9792, + "step": 47663 + }, + { + "epoch": 0.4156913362753135, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 47664 + }, + { + "epoch": 0.4157000575604821, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9844, + "step": 47665 + }, + { + "epoch": 0.4157087788456507, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 47666 + }, + { + "epoch": 0.41571750013081926, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 47667 + }, + { + "epoch": 0.41572622141598786, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 47668 + }, + { + "epoch": 0.41573494270115646, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 47669 + }, + { + "epoch": 0.415743663986325, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 47670 + }, + { + "epoch": 0.4157523852714936, + "grad_norm": 0.291015625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 47671 + }, + { + "epoch": 0.4157611065566622, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 47672 + }, + { + "epoch": 0.41576982784183075, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 0.9751, + "step": 47673 + }, + { + "epoch": 0.41577854912699935, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 47674 + }, + { + "epoch": 0.41578727041216795, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 47675 + }, + { + "epoch": 0.4157959916973365, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 47676 + }, + { + "epoch": 0.4158047129825051, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 47677 + }, + { + "epoch": 0.4158134342676737, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 47678 + }, + { + "epoch": 0.41582215555284224, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 47679 + }, + { + "epoch": 0.41583087683801084, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 47680 + }, + { + "epoch": 0.41583959812317944, + "grad_norm": 0.21484375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 47681 + }, + { + "epoch": 0.415848319408348, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 47682 + }, + { + "epoch": 0.4158570406935166, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9827, + "step": 47683 + }, + { + "epoch": 0.4158657619786852, + "grad_norm": 0.17578125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 47684 + }, + { + "epoch": 0.4158744832638538, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 47685 + }, + { + "epoch": 0.41588320454902233, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 47686 + }, + { + "epoch": 0.41589192583419093, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 47687 + }, + { + "epoch": 0.41590064711935953, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 47688 + }, + { + "epoch": 0.4159093684045281, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 47689 + }, + { + "epoch": 0.4159180896896967, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 47690 + }, + { + "epoch": 0.4159268109748653, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 47691 + }, + { + "epoch": 0.4159355322600338, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9802, + "step": 47692 + }, + { + "epoch": 0.4159442535452024, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 47693 + }, + { + "epoch": 0.415952974830371, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 47694 + }, + { + "epoch": 0.41596169611553957, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 47695 + }, + { + "epoch": 0.41597041740070817, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 47696 + }, + { + "epoch": 0.41597913868587677, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 47697 + }, + { + "epoch": 0.4159878599710453, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 47698 + }, + { + "epoch": 0.4159965812562139, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 47699 + }, + { + "epoch": 0.4160053025413825, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9864, + "step": 47700 + }, + { + "epoch": 0.41601402382655106, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 47701 + }, + { + "epoch": 0.41602274511171966, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 47702 + }, + { + "epoch": 0.41603146639688826, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 47703 + }, + { + "epoch": 0.4160401876820568, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 47704 + }, + { + "epoch": 0.4160489089672254, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9845, + "step": 47705 + }, + { + "epoch": 0.416057630252394, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 47706 + }, + { + "epoch": 0.41606635153756255, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 47707 + }, + { + "epoch": 0.41607507282273115, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 47708 + }, + { + "epoch": 0.41608379410789975, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 47709 + }, + { + "epoch": 0.4160925153930683, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 47710 + }, + { + "epoch": 0.4161012366782369, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 47711 + }, + { + "epoch": 0.4161099579634055, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 47712 + }, + { + "epoch": 0.4161186792485741, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 47713 + }, + { + "epoch": 0.41612740053374264, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 47714 + }, + { + "epoch": 0.41613612181891124, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 47715 + }, + { + "epoch": 0.41614484310407984, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 47716 + }, + { + "epoch": 0.4161535643892484, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 47717 + }, + { + "epoch": 0.416162285674417, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 47718 + }, + { + "epoch": 0.4161710069595856, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 47719 + }, + { + "epoch": 0.41617972824475413, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 47720 + }, + { + "epoch": 0.41618844952992273, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 47721 + }, + { + "epoch": 0.41619717081509133, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 47722 + }, + { + "epoch": 0.4162058921002599, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 47723 + }, + { + "epoch": 0.4162146133854285, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 47724 + }, + { + "epoch": 0.4162233346705971, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 47725 + }, + { + "epoch": 0.4162320559557656, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 47726 + }, + { + "epoch": 0.4162407772409342, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9815, + "step": 47727 + }, + { + "epoch": 0.4162494985261028, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 47728 + }, + { + "epoch": 0.41625821981127137, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 47729 + }, + { + "epoch": 0.41626694109643997, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 47730 + }, + { + "epoch": 0.41627566238160857, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 47731 + }, + { + "epoch": 0.4162843836667771, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 47732 + }, + { + "epoch": 0.4162931049519457, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 47733 + }, + { + "epoch": 0.4163018262371143, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 47734 + }, + { + "epoch": 0.41631054752228286, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 47735 + }, + { + "epoch": 0.41631926880745146, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 47736 + }, + { + "epoch": 0.41632799009262006, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 47737 + }, + { + "epoch": 0.4163367113777886, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 47738 + }, + { + "epoch": 0.4163454326629572, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9828, + "step": 47739 + }, + { + "epoch": 0.4163541539481258, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 47740 + }, + { + "epoch": 0.4163628752332944, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 47741 + }, + { + "epoch": 0.41637159651846295, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 47742 + }, + { + "epoch": 0.41638031780363155, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 47743 + }, + { + "epoch": 0.41638903908880015, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 47744 + }, + { + "epoch": 0.4163977603739687, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 47745 + }, + { + "epoch": 0.4164064816591373, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 47746 + }, + { + "epoch": 0.4164152029443059, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 47747 + }, + { + "epoch": 0.41642392422947444, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 47748 + }, + { + "epoch": 0.41643264551464304, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 47749 + }, + { + "epoch": 0.41644136679981164, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 47750 + }, + { + "epoch": 0.4164500880849802, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 47751 + }, + { + "epoch": 0.4164588093701488, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 47752 + }, + { + "epoch": 0.4164675306553174, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 47753 + }, + { + "epoch": 0.41647625194048593, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 47754 + }, + { + "epoch": 0.41648497322565453, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 47755 + }, + { + "epoch": 0.41649369451082313, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 47756 + }, + { + "epoch": 0.4165024157959917, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 47757 + }, + { + "epoch": 0.4165111370811603, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 47758 + }, + { + "epoch": 0.4165198583663289, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 47759 + }, + { + "epoch": 0.4165285796514974, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 47760 + }, + { + "epoch": 0.416537300936666, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 47761 + }, + { + "epoch": 0.4165460222218346, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 47762 + }, + { + "epoch": 0.41655474350700317, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 47763 + }, + { + "epoch": 0.41656346479217177, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 47764 + }, + { + "epoch": 0.41657218607734037, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 47765 + }, + { + "epoch": 0.4165809073625089, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 47766 + }, + { + "epoch": 0.4165896286476775, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 47767 + }, + { + "epoch": 0.4165983499328461, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 47768 + }, + { + "epoch": 0.4166070712180147, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 47769 + }, + { + "epoch": 0.41661579250318326, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 47770 + }, + { + "epoch": 0.41662451378835186, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 47771 + }, + { + "epoch": 0.41663323507352046, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 47772 + }, + { + "epoch": 0.416641956358689, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 47773 + }, + { + "epoch": 0.4166506776438576, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 47774 + }, + { + "epoch": 0.4166593989290262, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0265, + "step": 47775 + }, + { + "epoch": 0.41666812021419475, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 47776 + }, + { + "epoch": 0.41667684149936335, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 47777 + }, + { + "epoch": 0.41668556278453195, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 47778 + }, + { + "epoch": 0.4166942840697005, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 47779 + }, + { + "epoch": 0.4167030053548691, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 47780 + }, + { + "epoch": 0.4167117266400377, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 47781 + }, + { + "epoch": 0.41672044792520624, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 47782 + }, + { + "epoch": 0.41672916921037484, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 47783 + }, + { + "epoch": 0.41673789049554344, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 47784 + }, + { + "epoch": 0.416746611780712, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 47785 + }, + { + "epoch": 0.4167553330658806, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 47786 + }, + { + "epoch": 0.4167640543510492, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 47787 + }, + { + "epoch": 0.41677277563621773, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 47788 + }, + { + "epoch": 0.41678149692138633, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 47789 + }, + { + "epoch": 0.41679021820655493, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 47790 + }, + { + "epoch": 0.4167989394917235, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 47791 + }, + { + "epoch": 0.4168076607768921, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 47792 + }, + { + "epoch": 0.4168163820620607, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 47793 + }, + { + "epoch": 0.4168251033472292, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 47794 + }, + { + "epoch": 0.4168338246323978, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 47795 + }, + { + "epoch": 0.4168425459175664, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 47796 + }, + { + "epoch": 0.416851267202735, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 47797 + }, + { + "epoch": 0.41685998848790357, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 47798 + }, + { + "epoch": 0.41686870977307217, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 47799 + }, + { + "epoch": 0.41687743105824077, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 47800 + }, + { + "epoch": 0.4168861523434093, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 47801 + }, + { + "epoch": 0.4168948736285779, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 47802 + }, + { + "epoch": 0.4169035949137465, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 47803 + }, + { + "epoch": 0.41691231619891506, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 47804 + }, + { + "epoch": 0.41692103748408366, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 47805 + }, + { + "epoch": 0.41692975876925226, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 47806 + }, + { + "epoch": 0.4169384800544208, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 47807 + }, + { + "epoch": 0.4169472013395894, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 47808 + }, + { + "epoch": 0.416955922624758, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 47809 + }, + { + "epoch": 0.41696464390992655, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 47810 + }, + { + "epoch": 0.41697336519509515, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 47811 + }, + { + "epoch": 0.41698208648026375, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9842, + "step": 47812 + }, + { + "epoch": 0.4169908077654323, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 47813 + }, + { + "epoch": 0.4169995290506009, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 47814 + }, + { + "epoch": 0.4170082503357695, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 47815 + }, + { + "epoch": 0.41701697162093804, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 47816 + }, + { + "epoch": 0.41702569290610664, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 47817 + }, + { + "epoch": 0.41703441419127524, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 47818 + }, + { + "epoch": 0.4170431354764438, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 47819 + }, + { + "epoch": 0.4170518567616124, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 47820 + }, + { + "epoch": 0.417060578046781, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 47821 + }, + { + "epoch": 0.4170692993319496, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 47822 + }, + { + "epoch": 0.41707802061711813, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 47823 + }, + { + "epoch": 0.41708674190228673, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 47824 + }, + { + "epoch": 0.41709546318745533, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 47825 + }, + { + "epoch": 0.4171041844726239, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 47826 + }, + { + "epoch": 0.4171129057577925, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 47827 + }, + { + "epoch": 0.4171216270429611, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 47828 + }, + { + "epoch": 0.4171303483281296, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 47829 + }, + { + "epoch": 0.4171390696132982, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 47830 + }, + { + "epoch": 0.4171477908984668, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 47831 + }, + { + "epoch": 0.41715651218363536, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 47832 + }, + { + "epoch": 0.41716523346880396, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 47833 + }, + { + "epoch": 0.41717395475397256, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 47834 + }, + { + "epoch": 0.4171826760391411, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.023, + "step": 47835 + }, + { + "epoch": 0.4171913973243097, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 47836 + }, + { + "epoch": 0.4172001186094783, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 47837 + }, + { + "epoch": 0.41720883989464685, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 47838 + }, + { + "epoch": 0.41721756117981545, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 47839 + }, + { + "epoch": 0.41722628246498406, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0233, + "step": 47840 + }, + { + "epoch": 0.4172350037501526, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 47841 + }, + { + "epoch": 0.4172437250353212, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 47842 + }, + { + "epoch": 0.4172524463204898, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 47843 + }, + { + "epoch": 0.41726116760565835, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 47844 + }, + { + "epoch": 0.41726988889082695, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 47845 + }, + { + "epoch": 0.41727861017599555, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 47846 + }, + { + "epoch": 0.4172873314611641, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9816, + "step": 47847 + }, + { + "epoch": 0.4172960527463327, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 47848 + }, + { + "epoch": 0.4173047740315013, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 47849 + }, + { + "epoch": 0.4173134953166699, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 47850 + }, + { + "epoch": 0.41732221660183844, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 47851 + }, + { + "epoch": 0.41733093788700704, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 47852 + }, + { + "epoch": 0.41733965917217564, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 47853 + }, + { + "epoch": 0.4173483804573442, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 47854 + }, + { + "epoch": 0.4173571017425128, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 47855 + }, + { + "epoch": 0.4173658230276814, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 47856 + }, + { + "epoch": 0.4173745443128499, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 47857 + }, + { + "epoch": 0.4173832655980185, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 47858 + }, + { + "epoch": 0.4173919868831871, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 47859 + }, + { + "epoch": 0.41740070816835567, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 47860 + }, + { + "epoch": 0.41740942945352427, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9857, + "step": 47861 + }, + { + "epoch": 0.4174181507386929, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 47862 + }, + { + "epoch": 0.4174268720238614, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 47863 + }, + { + "epoch": 0.41743559330903, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 47864 + }, + { + "epoch": 0.4174443145941986, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 47865 + }, + { + "epoch": 0.41745303587936716, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 47866 + }, + { + "epoch": 0.41746175716453576, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 47867 + }, + { + "epoch": 0.41747047844970436, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 47868 + }, + { + "epoch": 0.4174791997348729, + "grad_norm": 0.2001953125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 47869 + }, + { + "epoch": 0.4174879210200415, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 47870 + }, + { + "epoch": 0.4174966423052101, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 47871 + }, + { + "epoch": 0.41750536359037865, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 47872 + }, + { + "epoch": 0.41751408487554725, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 47873 + }, + { + "epoch": 0.41752280616071585, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9873, + "step": 47874 + }, + { + "epoch": 0.4175315274458844, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 47875 + }, + { + "epoch": 0.417540248731053, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 47876 + }, + { + "epoch": 0.4175489700162216, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 47877 + }, + { + "epoch": 0.4175576913013902, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 47878 + }, + { + "epoch": 0.41756641258655874, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 47879 + }, + { + "epoch": 0.41757513387172734, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 47880 + }, + { + "epoch": 0.41758385515689594, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 47881 + }, + { + "epoch": 0.4175925764420645, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 47882 + }, + { + "epoch": 0.4176012977272331, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 47883 + }, + { + "epoch": 0.4176100190124017, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 47884 + }, + { + "epoch": 0.41761874029757023, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 47885 + }, + { + "epoch": 0.41762746158273883, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 47886 + }, + { + "epoch": 0.41763618286790744, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 47887 + }, + { + "epoch": 0.417644904153076, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 47888 + }, + { + "epoch": 0.4176536254382446, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 47889 + }, + { + "epoch": 0.4176623467234132, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 47890 + }, + { + "epoch": 0.4176710680085817, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 47891 + }, + { + "epoch": 0.4176797892937503, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 47892 + }, + { + "epoch": 0.4176885105789189, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 47893 + }, + { + "epoch": 0.41769723186408747, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 47894 + }, + { + "epoch": 0.41770595314925607, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 47895 + }, + { + "epoch": 0.41771467443442467, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9876, + "step": 47896 + }, + { + "epoch": 0.4177233957195932, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 47897 + }, + { + "epoch": 0.4177321170047618, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 47898 + }, + { + "epoch": 0.4177408382899304, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 47899 + }, + { + "epoch": 0.41774955957509896, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 47900 + }, + { + "epoch": 0.41775828086026756, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 47901 + }, + { + "epoch": 0.41776700214543616, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 47902 + }, + { + "epoch": 0.4177757234306047, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 47903 + }, + { + "epoch": 0.4177844447157733, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 47904 + }, + { + "epoch": 0.4177931660009419, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 47905 + }, + { + "epoch": 0.4178018872861105, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0287, + "step": 47906 + }, + { + "epoch": 0.41781060857127905, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 47907 + }, + { + "epoch": 0.41781932985644765, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 47908 + }, + { + "epoch": 0.41782805114161625, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9763, + "step": 47909 + }, + { + "epoch": 0.4178367724267848, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.017, + "step": 47910 + }, + { + "epoch": 0.4178454937119534, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 47911 + }, + { + "epoch": 0.417854214997122, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 47912 + }, + { + "epoch": 0.41786293628229054, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 47913 + }, + { + "epoch": 0.41787165756745914, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 47914 + }, + { + "epoch": 0.41788037885262774, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 47915 + }, + { + "epoch": 0.4178891001377963, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 47916 + }, + { + "epoch": 0.4178978214229649, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 47917 + }, + { + "epoch": 0.4179065427081335, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 47918 + }, + { + "epoch": 0.41791526399330203, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 47919 + }, + { + "epoch": 0.41792398527847063, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 47920 + }, + { + "epoch": 0.41793270656363923, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 47921 + }, + { + "epoch": 0.4179414278488078, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 47922 + }, + { + "epoch": 0.4179501491339764, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 47923 + }, + { + "epoch": 0.417958870419145, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 47924 + }, + { + "epoch": 0.4179675917043135, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 47925 + }, + { + "epoch": 0.4179763129894821, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 47926 + }, + { + "epoch": 0.4179850342746507, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 47927 + }, + { + "epoch": 0.41799375555981927, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0254, + "step": 47928 + }, + { + "epoch": 0.41800247684498787, + "grad_norm": 0.25, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 47929 + }, + { + "epoch": 0.41801119813015647, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 47930 + }, + { + "epoch": 0.41801991941532507, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 47931 + }, + { + "epoch": 0.4180286407004936, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 47932 + }, + { + "epoch": 0.4180373619856622, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9808, + "step": 47933 + }, + { + "epoch": 0.4180460832708308, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 47934 + }, + { + "epoch": 0.41805480455599936, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 47935 + }, + { + "epoch": 0.41806352584116796, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 0.9886, + "step": 47936 + }, + { + "epoch": 0.41807224712633656, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 47937 + }, + { + "epoch": 0.4180809684115051, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 47938 + }, + { + "epoch": 0.4180896896966737, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 47939 + }, + { + "epoch": 0.4180984109818423, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0323, + "step": 47940 + }, + { + "epoch": 0.41810713226701085, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 47941 + }, + { + "epoch": 0.41811585355217945, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 47942 + }, + { + "epoch": 0.41812457483734805, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 47943 + }, + { + "epoch": 0.4181332961225166, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 47944 + }, + { + "epoch": 0.4181420174076852, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 47945 + }, + { + "epoch": 0.4181507386928538, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 47946 + }, + { + "epoch": 0.41815945997802234, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 47947 + }, + { + "epoch": 0.41816818126319094, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 47948 + }, + { + "epoch": 0.41817690254835954, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 47949 + }, + { + "epoch": 0.4181856238335281, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 47950 + }, + { + "epoch": 0.4181943451186967, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 47951 + }, + { + "epoch": 0.4182030664038653, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 47952 + }, + { + "epoch": 0.41821178768903383, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 47953 + }, + { + "epoch": 0.41822050897420243, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 47954 + }, + { + "epoch": 0.41822923025937103, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 47955 + }, + { + "epoch": 0.4182379515445396, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 47956 + }, + { + "epoch": 0.4182466728297082, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 47957 + }, + { + "epoch": 0.4182553941148768, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 47958 + }, + { + "epoch": 0.4182641154000454, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9769, + "step": 47959 + }, + { + "epoch": 0.4182728366852139, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 47960 + }, + { + "epoch": 0.4182815579703825, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 47961 + }, + { + "epoch": 0.4182902792555511, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 47962 + }, + { + "epoch": 0.41829900054071967, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 47963 + }, + { + "epoch": 0.41830772182588827, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 47964 + }, + { + "epoch": 0.41831644311105687, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 47965 + }, + { + "epoch": 0.4183251643962254, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 47966 + }, + { + "epoch": 0.418333885681394, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 47967 + }, + { + "epoch": 0.4183426069665626, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 47968 + }, + { + "epoch": 0.41835132825173116, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 47969 + }, + { + "epoch": 0.41836004953689976, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 47970 + }, + { + "epoch": 0.41836877082206836, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 47971 + }, + { + "epoch": 0.4183774921072369, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 47972 + }, + { + "epoch": 0.4183862133924055, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 47973 + }, + { + "epoch": 0.4183949346775741, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 47974 + }, + { + "epoch": 0.41840365596274265, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 47975 + }, + { + "epoch": 0.41841237724791125, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 47976 + }, + { + "epoch": 0.41842109853307985, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 47977 + }, + { + "epoch": 0.4184298198182484, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 47978 + }, + { + "epoch": 0.418438541103417, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 47979 + }, + { + "epoch": 0.4184472623885856, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9904, + "step": 47980 + }, + { + "epoch": 0.41845598367375414, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 47981 + }, + { + "epoch": 0.41846470495892274, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 47982 + }, + { + "epoch": 0.41847342624409134, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 47983 + }, + { + "epoch": 0.4184821475292599, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 47984 + }, + { + "epoch": 0.4184908688144285, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 47985 + }, + { + "epoch": 0.4184995900995971, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 47986 + }, + { + "epoch": 0.4185083113847657, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 47987 + }, + { + "epoch": 0.41851703266993423, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 47988 + }, + { + "epoch": 0.41852575395510283, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 47989 + }, + { + "epoch": 0.41853447524027143, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 47990 + }, + { + "epoch": 0.41854319652544, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 47991 + }, + { + "epoch": 0.4185519178106086, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9868, + "step": 47992 + }, + { + "epoch": 0.4185606390957772, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 47993 + }, + { + "epoch": 0.4185693603809457, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 47994 + }, + { + "epoch": 0.4185780816661143, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 47995 + }, + { + "epoch": 0.4185868029512829, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 47996 + }, + { + "epoch": 0.41859552423645147, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 47997 + }, + { + "epoch": 0.41860424552162007, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 47998 + }, + { + "epoch": 0.41861296680678867, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 47999 + }, + { + "epoch": 0.4186216880919572, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 48000 + }, + { + "epoch": 0.4186304093771258, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 48001 + }, + { + "epoch": 0.4186391306622944, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 48002 + }, + { + "epoch": 0.41864785194746296, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 48003 + }, + { + "epoch": 0.41865657323263156, + "grad_norm": 0.2353515625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 48004 + }, + { + "epoch": 0.41866529451780016, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 48005 + }, + { + "epoch": 0.4186740158029687, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 48006 + }, + { + "epoch": 0.4186827370881373, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 48007 + }, + { + "epoch": 0.4186914583733059, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 48008 + }, + { + "epoch": 0.41870017965847445, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 48009 + }, + { + "epoch": 0.41870890094364305, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 48010 + }, + { + "epoch": 0.41871762222881165, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 48011 + }, + { + "epoch": 0.4187263435139802, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 48012 + }, + { + "epoch": 0.4187350647991488, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 48013 + }, + { + "epoch": 0.4187437860843174, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 48014 + }, + { + "epoch": 0.418752507369486, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 48015 + }, + { + "epoch": 0.41876122865465454, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 48016 + }, + { + "epoch": 0.41876994993982314, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 48017 + }, + { + "epoch": 0.41877867122499174, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 48018 + }, + { + "epoch": 0.4187873925101603, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 48019 + }, + { + "epoch": 0.4187961137953289, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 48020 + }, + { + "epoch": 0.4188048350804975, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 48021 + }, + { + "epoch": 0.41881355636566603, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 48022 + }, + { + "epoch": 0.41882227765083463, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 48023 + }, + { + "epoch": 0.41883099893600323, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 48024 + }, + { + "epoch": 0.4188397202211718, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 48025 + }, + { + "epoch": 0.4188484415063404, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 48026 + }, + { + "epoch": 0.418857162791509, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 48027 + }, + { + "epoch": 0.4188658840766775, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 48028 + }, + { + "epoch": 0.4188746053618461, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 48029 + }, + { + "epoch": 0.4188833266470147, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 48030 + }, + { + "epoch": 0.41889204793218326, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 48031 + }, + { + "epoch": 0.41890076921735186, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 48032 + }, + { + "epoch": 0.41890949050252047, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 48033 + }, + { + "epoch": 0.418918211787689, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 48034 + }, + { + "epoch": 0.4189269330728576, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 48035 + }, + { + "epoch": 0.4189356543580262, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 48036 + }, + { + "epoch": 0.41894437564319476, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 48037 + }, + { + "epoch": 0.41895309692836336, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 48038 + }, + { + "epoch": 0.41896181821353196, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0288, + "step": 48039 + }, + { + "epoch": 0.41897053949870056, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 48040 + }, + { + "epoch": 0.4189792607838691, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 48041 + }, + { + "epoch": 0.4189879820690377, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 48042 + }, + { + "epoch": 0.4189967033542063, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 48043 + }, + { + "epoch": 0.41900542463937485, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 48044 + }, + { + "epoch": 0.41901414592454345, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 48045 + }, + { + "epoch": 0.41902286720971205, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 48046 + }, + { + "epoch": 0.4190315884948806, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 48047 + }, + { + "epoch": 0.4190403097800492, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48048 + }, + { + "epoch": 0.4190490310652178, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 48049 + }, + { + "epoch": 0.41905775235038634, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 48050 + }, + { + "epoch": 0.41906647363555494, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 48051 + }, + { + "epoch": 0.41907519492072354, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 48052 + }, + { + "epoch": 0.4190839162058921, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 48053 + }, + { + "epoch": 0.4190926374910607, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 48054 + }, + { + "epoch": 0.4191013587762293, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 48055 + }, + { + "epoch": 0.4191100800613978, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 48056 + }, + { + "epoch": 0.4191188013465664, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 48057 + }, + { + "epoch": 0.41912752263173503, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 48058 + }, + { + "epoch": 0.4191362439169036, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 48059 + }, + { + "epoch": 0.4191449652020722, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 48060 + }, + { + "epoch": 0.4191536864872408, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0174, + "step": 48061 + }, + { + "epoch": 0.4191624077724093, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 48062 + }, + { + "epoch": 0.4191711290575779, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 48063 + }, + { + "epoch": 0.4191798503427465, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 48064 + }, + { + "epoch": 0.41918857162791506, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 48065 + }, + { + "epoch": 0.41919729291308366, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 48066 + }, + { + "epoch": 0.41920601419825226, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 48067 + }, + { + "epoch": 0.41921473548342086, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 48068 + }, + { + "epoch": 0.4192234567685894, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0242, + "step": 48069 + }, + { + "epoch": 0.419232178053758, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 48070 + }, + { + "epoch": 0.4192408993389266, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 48071 + }, + { + "epoch": 0.41924962062409515, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 48072 + }, + { + "epoch": 0.41925834190926375, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 48073 + }, + { + "epoch": 0.41926706319443235, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0336, + "step": 48074 + }, + { + "epoch": 0.4192757844796009, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 48075 + }, + { + "epoch": 0.4192845057647695, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 48076 + }, + { + "epoch": 0.4192932270499381, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 48077 + }, + { + "epoch": 0.41930194833510664, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 48078 + }, + { + "epoch": 0.41931066962027524, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0258, + "step": 48079 + }, + { + "epoch": 0.41931939090544385, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 48080 + }, + { + "epoch": 0.4193281121906124, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 48081 + }, + { + "epoch": 0.419336833475781, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 48082 + }, + { + "epoch": 0.4193455547609496, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 0.9823, + "step": 48083 + }, + { + "epoch": 0.41935427604611814, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9851, + "step": 48084 + }, + { + "epoch": 0.41936299733128674, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 48085 + }, + { + "epoch": 0.41937171861645534, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 48086 + }, + { + "epoch": 0.4193804399016239, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 48087 + }, + { + "epoch": 0.4193891611867925, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 48088 + }, + { + "epoch": 0.4193978824719611, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 48089 + }, + { + "epoch": 0.4194066037571296, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9961, + "step": 48090 + }, + { + "epoch": 0.4194153250422982, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 48091 + }, + { + "epoch": 0.4194240463274668, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 48092 + }, + { + "epoch": 0.41943276761263537, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 48093 + }, + { + "epoch": 0.41944148889780397, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 48094 + }, + { + "epoch": 0.41945021018297257, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 48095 + }, + { + "epoch": 0.41945893146814117, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 48096 + }, + { + "epoch": 0.4194676527533097, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 48097 + }, + { + "epoch": 0.4194763740384783, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 48098 + }, + { + "epoch": 0.4194850953236469, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48099 + }, + { + "epoch": 0.41949381660881546, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 48100 + }, + { + "epoch": 0.41950253789398406, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 48101 + }, + { + "epoch": 0.41951125917915266, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 48102 + }, + { + "epoch": 0.4195199804643212, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 48103 + }, + { + "epoch": 0.4195287017494898, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 48104 + }, + { + "epoch": 0.4195374230346584, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 48105 + }, + { + "epoch": 0.41954614431982695, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48106 + }, + { + "epoch": 0.41955486560499555, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 48107 + }, + { + "epoch": 0.41956358689016415, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 48108 + }, + { + "epoch": 0.4195723081753327, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 48109 + }, + { + "epoch": 0.4195810294605013, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 48110 + }, + { + "epoch": 0.4195897507456699, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 48111 + }, + { + "epoch": 0.41959847203083844, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 48112 + }, + { + "epoch": 0.41960719331600704, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0111, + "step": 48113 + }, + { + "epoch": 0.41961591460117564, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 48114 + }, + { + "epoch": 0.4196246358863442, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 0.9847, + "step": 48115 + }, + { + "epoch": 0.4196333571715128, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 48116 + }, + { + "epoch": 0.4196420784566814, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 48117 + }, + { + "epoch": 0.41965079974184993, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 48118 + }, + { + "epoch": 0.41965952102701853, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 48119 + }, + { + "epoch": 0.41966824231218713, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 48120 + }, + { + "epoch": 0.4196769635973557, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 48121 + }, + { + "epoch": 0.4196856848825243, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 48122 + }, + { + "epoch": 0.4196944061676929, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 48123 + }, + { + "epoch": 0.4197031274528615, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0257, + "step": 48124 + }, + { + "epoch": 0.41971184873803, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 48125 + }, + { + "epoch": 0.4197205700231986, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 48126 + }, + { + "epoch": 0.4197292913083672, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 48127 + }, + { + "epoch": 0.41973801259353577, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 48128 + }, + { + "epoch": 0.41974673387870437, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 48129 + }, + { + "epoch": 0.41975545516387297, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 48130 + }, + { + "epoch": 0.4197641764490415, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 48131 + }, + { + "epoch": 0.4197728977342101, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 48132 + }, + { + "epoch": 0.4197816190193787, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 48133 + }, + { + "epoch": 0.41979034030454726, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 48134 + }, + { + "epoch": 0.41979906158971586, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 48135 + }, + { + "epoch": 0.41980778287488446, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 48136 + }, + { + "epoch": 0.419816504160053, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 48137 + }, + { + "epoch": 0.4198252254452216, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 48138 + }, + { + "epoch": 0.4198339467303902, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 48139 + }, + { + "epoch": 0.41984266801555875, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 48140 + }, + { + "epoch": 0.41985138930072735, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 48141 + }, + { + "epoch": 0.41986011058589595, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 48142 + }, + { + "epoch": 0.4198688318710645, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 48143 + }, + { + "epoch": 0.4198775531562331, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 48144 + }, + { + "epoch": 0.4198862744414017, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 48145 + }, + { + "epoch": 0.41989499572657024, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 48146 + }, + { + "epoch": 0.41990371701173884, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0311, + "step": 48147 + }, + { + "epoch": 0.41991243829690744, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 48148 + }, + { + "epoch": 0.41992115958207604, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 48149 + }, + { + "epoch": 0.4199298808672446, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 48150 + }, + { + "epoch": 0.4199386021524132, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 48151 + }, + { + "epoch": 0.4199473234375818, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 48152 + }, + { + "epoch": 0.41995604472275033, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 48153 + }, + { + "epoch": 0.41996476600791893, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 48154 + }, + { + "epoch": 0.41997348729308753, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 48155 + }, + { + "epoch": 0.4199822085782561, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 48156 + }, + { + "epoch": 0.4199909298634247, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 48157 + }, + { + "epoch": 0.4199996511485933, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 48158 + }, + { + "epoch": 0.4200083724337618, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 48159 + }, + { + "epoch": 0.4200170937189304, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 48160 + }, + { + "epoch": 0.420025815004099, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9893, + "step": 48161 + }, + { + "epoch": 0.42003453628926757, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 48162 + }, + { + "epoch": 0.42004325757443617, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 48163 + }, + { + "epoch": 0.42005197885960477, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 48164 + }, + { + "epoch": 0.4200607001447733, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 48165 + }, + { + "epoch": 0.4200694214299419, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 48166 + }, + { + "epoch": 0.4200781427151105, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 48167 + }, + { + "epoch": 0.42008686400027906, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 48168 + }, + { + "epoch": 0.42009558528544766, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 48169 + }, + { + "epoch": 0.42010430657061626, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 48170 + }, + { + "epoch": 0.4201130278557848, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 48171 + }, + { + "epoch": 0.4201217491409534, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 48172 + }, + { + "epoch": 0.420130470426122, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9921, + "step": 48173 + }, + { + "epoch": 0.42013919171129055, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 48174 + }, + { + "epoch": 0.42014791299645915, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 48175 + }, + { + "epoch": 0.42015663428162775, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 48176 + }, + { + "epoch": 0.42016535556679635, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 48177 + }, + { + "epoch": 0.4201740768519649, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 48178 + }, + { + "epoch": 0.4201827981371335, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 48179 + }, + { + "epoch": 0.4201915194223021, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 48180 + }, + { + "epoch": 0.42020024070747064, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 48181 + }, + { + "epoch": 0.42020896199263924, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 48182 + }, + { + "epoch": 0.42021768327780784, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 48183 + }, + { + "epoch": 0.4202264045629764, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 48184 + }, + { + "epoch": 0.420235125848145, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 48185 + }, + { + "epoch": 0.4202438471333136, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 48186 + }, + { + "epoch": 0.42025256841848213, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 48187 + }, + { + "epoch": 0.42026128970365073, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 48188 + }, + { + "epoch": 0.42027001098881933, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 48189 + }, + { + "epoch": 0.4202787322739879, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0167, + "step": 48190 + }, + { + "epoch": 0.4202874535591565, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 48191 + }, + { + "epoch": 0.4202961748443251, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 48192 + }, + { + "epoch": 0.4203048961294936, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 48193 + }, + { + "epoch": 0.4203136174146622, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 48194 + }, + { + "epoch": 0.4203223386998308, + "grad_norm": 0.076171875, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 48195 + }, + { + "epoch": 0.42033105998499937, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9826, + "step": 48196 + }, + { + "epoch": 0.42033978127016797, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 48197 + }, + { + "epoch": 0.42034850255533657, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 48198 + }, + { + "epoch": 0.4203572238405051, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 48199 + }, + { + "epoch": 0.4203659451256737, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 48200 + }, + { + "epoch": 0.4203746664108423, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 48201 + }, + { + "epoch": 0.42038338769601086, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 48202 + }, + { + "epoch": 0.42039210898117946, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 48203 + }, + { + "epoch": 0.42040083026634806, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 48204 + }, + { + "epoch": 0.42040955155151666, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 48205 + }, + { + "epoch": 0.4204182728366852, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 48206 + }, + { + "epoch": 0.4204269941218538, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9857, + "step": 48207 + }, + { + "epoch": 0.4204357154070224, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 48208 + }, + { + "epoch": 0.42044443669219095, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 48209 + }, + { + "epoch": 0.42045315797735955, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 48210 + }, + { + "epoch": 0.42046187926252815, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 48211 + }, + { + "epoch": 0.4204706005476967, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 48212 + }, + { + "epoch": 0.4204793218328653, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 48213 + }, + { + "epoch": 0.4204880431180339, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 48214 + }, + { + "epoch": 0.42049676440320244, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 48215 + }, + { + "epoch": 0.42050548568837104, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 48216 + }, + { + "epoch": 0.42051420697353964, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 48217 + }, + { + "epoch": 0.4205229282587082, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 48218 + }, + { + "epoch": 0.4205316495438768, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 48219 + }, + { + "epoch": 0.4205403708290454, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 48220 + }, + { + "epoch": 0.42054909211421393, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 48221 + }, + { + "epoch": 0.42055781339938253, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 48222 + }, + { + "epoch": 0.42056653468455113, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 48223 + }, + { + "epoch": 0.4205752559697197, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 48224 + }, + { + "epoch": 0.4205839772548883, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 48225 + }, + { + "epoch": 0.4205926985400569, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 48226 + }, + { + "epoch": 0.4206014198252254, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 48227 + }, + { + "epoch": 0.420610141110394, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 48228 + }, + { + "epoch": 0.4206188623955626, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 48229 + }, + { + "epoch": 0.42062758368073117, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 48230 + }, + { + "epoch": 0.42063630496589977, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 48231 + }, + { + "epoch": 0.42064502625106837, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 48232 + }, + { + "epoch": 0.42065374753623697, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 48233 + }, + { + "epoch": 0.4206624688214055, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 48234 + }, + { + "epoch": 0.4206711901065741, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 48235 + }, + { + "epoch": 0.4206799113917427, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 48236 + }, + { + "epoch": 0.42068863267691126, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 48237 + }, + { + "epoch": 0.42069735396207986, + "grad_norm": 0.2041015625, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 48238 + }, + { + "epoch": 0.42070607524724846, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 48239 + }, + { + "epoch": 0.420714796532417, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 48240 + }, + { + "epoch": 0.4207235178175856, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 48241 + }, + { + "epoch": 0.4207322391027542, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 48242 + }, + { + "epoch": 0.42074096038792275, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 48243 + }, + { + "epoch": 0.42074968167309135, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 48244 + }, + { + "epoch": 0.42075840295825995, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 48245 + }, + { + "epoch": 0.4207671242434285, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 48246 + }, + { + "epoch": 0.4207758455285971, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 48247 + }, + { + "epoch": 0.4207845668137657, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 48248 + }, + { + "epoch": 0.42079328809893424, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 48249 + }, + { + "epoch": 0.42080200938410284, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 48250 + }, + { + "epoch": 0.42081073066927144, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 48251 + }, + { + "epoch": 0.42081945195444, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 48252 + }, + { + "epoch": 0.4208281732396086, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 48253 + }, + { + "epoch": 0.4208368945247772, + "grad_norm": 0.208984375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 48254 + }, + { + "epoch": 0.42084561580994573, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 48255 + }, + { + "epoch": 0.42085433709511433, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 48256 + }, + { + "epoch": 0.42086305838028293, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 48257 + }, + { + "epoch": 0.42087177966545153, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 48258 + }, + { + "epoch": 0.4208805009506201, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 48259 + }, + { + "epoch": 0.4208892222357887, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 48260 + }, + { + "epoch": 0.4208979435209573, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 48261 + }, + { + "epoch": 0.4209066648061258, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 48262 + }, + { + "epoch": 0.4209153860912944, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 48263 + }, + { + "epoch": 0.420924107376463, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 48264 + }, + { + "epoch": 0.42093282866163156, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 48265 + }, + { + "epoch": 0.42094154994680016, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 48266 + }, + { + "epoch": 0.42095027123196876, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 48267 + }, + { + "epoch": 0.4209589925171373, + "grad_norm": 0.212890625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 48268 + }, + { + "epoch": 0.4209677138023059, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 48269 + }, + { + "epoch": 0.4209764350874745, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 48270 + }, + { + "epoch": 0.42098515637264305, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 48271 + }, + { + "epoch": 0.42099387765781165, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 48272 + }, + { + "epoch": 0.42100259894298026, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 48273 + }, + { + "epoch": 0.4210113202281488, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 48274 + }, + { + "epoch": 0.4210200415133174, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 48275 + }, + { + "epoch": 0.421028762798486, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 48276 + }, + { + "epoch": 0.42103748408365455, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 48277 + }, + { + "epoch": 0.42104620536882315, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 48278 + }, + { + "epoch": 0.42105492665399175, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 48279 + }, + { + "epoch": 0.4210636479391603, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 48280 + }, + { + "epoch": 0.4210723692243289, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 48281 + }, + { + "epoch": 0.4210810905094975, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 48282 + }, + { + "epoch": 0.42108981179466604, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 48283 + }, + { + "epoch": 0.42109853307983464, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 48284 + }, + { + "epoch": 0.42110725436500324, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0285, + "step": 48285 + }, + { + "epoch": 0.42111597565017184, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 48286 + }, + { + "epoch": 0.4211246969353404, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 48287 + }, + { + "epoch": 0.421133418220509, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 48288 + }, + { + "epoch": 0.4211421395056776, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 48289 + }, + { + "epoch": 0.4211508607908461, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9852, + "step": 48290 + }, + { + "epoch": 0.4211595820760147, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 48291 + }, + { + "epoch": 0.4211683033611833, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 48292 + }, + { + "epoch": 0.42117702464635187, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 48293 + }, + { + "epoch": 0.42118574593152047, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 48294 + }, + { + "epoch": 0.4211944672166891, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 48295 + }, + { + "epoch": 0.4212031885018576, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 48296 + }, + { + "epoch": 0.4212119097870262, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 48297 + }, + { + "epoch": 0.4212206310721948, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 48298 + }, + { + "epoch": 0.42122935235736336, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 48299 + }, + { + "epoch": 0.42123807364253196, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 48300 + }, + { + "epoch": 0.42124679492770056, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 48301 + }, + { + "epoch": 0.4212555162128691, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 48302 + }, + { + "epoch": 0.4212642374980377, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 48303 + }, + { + "epoch": 0.4212729587832063, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 48304 + }, + { + "epoch": 0.42128168006837485, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 48305 + }, + { + "epoch": 0.42129040135354345, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 48306 + }, + { + "epoch": 0.42129912263871205, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 48307 + }, + { + "epoch": 0.4213078439238806, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9716, + "step": 48308 + }, + { + "epoch": 0.4213165652090492, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 48309 + }, + { + "epoch": 0.4213252864942178, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 48310 + }, + { + "epoch": 0.42133400777938634, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 48311 + }, + { + "epoch": 0.42134272906455494, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.987, + "step": 48312 + }, + { + "epoch": 0.42135145034972354, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 48313 + }, + { + "epoch": 0.42136017163489214, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9946, + "step": 48314 + }, + { + "epoch": 0.4213688929200607, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 48315 + }, + { + "epoch": 0.4213776142052293, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 48316 + }, + { + "epoch": 0.4213863354903979, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 48317 + }, + { + "epoch": 0.42139505677556643, + "grad_norm": 0.1748046875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 48318 + }, + { + "epoch": 0.42140377806073503, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 48319 + }, + { + "epoch": 0.42141249934590364, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 48320 + }, + { + "epoch": 0.4214212206310722, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 48321 + }, + { + "epoch": 0.4214299419162408, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 48322 + }, + { + "epoch": 0.4214386632014094, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 48323 + }, + { + "epoch": 0.4214473844865779, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 48324 + }, + { + "epoch": 0.4214561057717465, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 48325 + }, + { + "epoch": 0.4214648270569151, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 48326 + }, + { + "epoch": 0.42147354834208367, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 48327 + }, + { + "epoch": 0.42148226962725227, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 48328 + }, + { + "epoch": 0.42149099091242087, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 48329 + }, + { + "epoch": 0.4214997121975894, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 48330 + }, + { + "epoch": 0.421508433482758, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 48331 + }, + { + "epoch": 0.4215171547679266, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 48332 + }, + { + "epoch": 0.42152587605309516, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 48333 + }, + { + "epoch": 0.42153459733826376, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 48334 + }, + { + "epoch": 0.42154331862343236, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 48335 + }, + { + "epoch": 0.4215520399086009, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 48336 + }, + { + "epoch": 0.4215607611937695, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 48337 + }, + { + "epoch": 0.4215694824789381, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 48338 + }, + { + "epoch": 0.42157820376410665, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 48339 + }, + { + "epoch": 0.42158692504927525, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 48340 + }, + { + "epoch": 0.42159564633444385, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 48341 + }, + { + "epoch": 0.42160436761961245, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 48342 + }, + { + "epoch": 0.421613088904781, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 48343 + }, + { + "epoch": 0.4216218101899496, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 48344 + }, + { + "epoch": 0.4216305314751182, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 48345 + }, + { + "epoch": 0.42163925276028674, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 48346 + }, + { + "epoch": 0.42164797404545534, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 48347 + }, + { + "epoch": 0.42165669533062394, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 48348 + }, + { + "epoch": 0.4216654166157925, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 48349 + }, + { + "epoch": 0.4216741379009611, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 48350 + }, + { + "epoch": 0.4216828591861297, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48351 + }, + { + "epoch": 0.42169158047129823, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0248, + "step": 48352 + }, + { + "epoch": 0.42170030175646683, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 48353 + }, + { + "epoch": 0.42170902304163543, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 0.9797, + "step": 48354 + }, + { + "epoch": 0.421717744326804, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0252, + "step": 48355 + }, + { + "epoch": 0.4217264656119726, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 48356 + }, + { + "epoch": 0.4217351868971412, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 48357 + }, + { + "epoch": 0.4217439081823097, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 48358 + }, + { + "epoch": 0.4217526294674783, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 48359 + }, + { + "epoch": 0.4217613507526469, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 48360 + }, + { + "epoch": 0.42177007203781547, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 48361 + }, + { + "epoch": 0.42177879332298407, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 48362 + }, + { + "epoch": 0.42178751460815267, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 48363 + }, + { + "epoch": 0.4217962358933212, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 48364 + }, + { + "epoch": 0.4218049571784898, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 48365 + }, + { + "epoch": 0.4218136784636584, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 48366 + }, + { + "epoch": 0.42182239974882696, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 48367 + }, + { + "epoch": 0.42183112103399556, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 48368 + }, + { + "epoch": 0.42183984231916416, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0267, + "step": 48369 + }, + { + "epoch": 0.42184856360433276, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48370 + }, + { + "epoch": 0.4218572848895013, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 48371 + }, + { + "epoch": 0.4218660061746699, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 48372 + }, + { + "epoch": 0.4218747274598385, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 48373 + }, + { + "epoch": 0.42188344874500705, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 48374 + }, + { + "epoch": 0.42189217003017565, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 48375 + }, + { + "epoch": 0.42190089131534425, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 48376 + }, + { + "epoch": 0.4219096126005128, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 48377 + }, + { + "epoch": 0.4219183338856814, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 48378 + }, + { + "epoch": 0.42192705517085, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 48379 + }, + { + "epoch": 0.42193577645601854, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 48380 + }, + { + "epoch": 0.42194449774118714, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.992, + "step": 48381 + }, + { + "epoch": 0.42195321902635574, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 48382 + }, + { + "epoch": 0.4219619403115243, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 48383 + }, + { + "epoch": 0.4219706615966929, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 48384 + }, + { + "epoch": 0.4219793828818615, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 48385 + }, + { + "epoch": 0.42198810416703003, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 48386 + }, + { + "epoch": 0.42199682545219863, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 48387 + }, + { + "epoch": 0.42200554673736723, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 48388 + }, + { + "epoch": 0.4220142680225358, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 48389 + }, + { + "epoch": 0.4220229893077044, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 48390 + }, + { + "epoch": 0.422031710592873, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 48391 + }, + { + "epoch": 0.4220404318780415, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 48392 + }, + { + "epoch": 0.4220491531632101, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 48393 + }, + { + "epoch": 0.4220578744483787, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 48394 + }, + { + "epoch": 0.4220665957335473, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 48395 + }, + { + "epoch": 0.42207531701871587, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 48396 + }, + { + "epoch": 0.42208403830388447, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 48397 + }, + { + "epoch": 0.42209275958905307, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 48398 + }, + { + "epoch": 0.4221014808742216, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 48399 + }, + { + "epoch": 0.4221102021593902, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 48400 + }, + { + "epoch": 0.4221189234445588, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 48401 + }, + { + "epoch": 0.42212764472972736, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 48402 + }, + { + "epoch": 0.42213636601489596, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9942, + "step": 48403 + }, + { + "epoch": 0.42214508730006456, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 48404 + }, + { + "epoch": 0.4221538085852331, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 48405 + }, + { + "epoch": 0.4221625298704017, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 48406 + }, + { + "epoch": 0.4221712511555703, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 48407 + }, + { + "epoch": 0.42217997244073885, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 48408 + }, + { + "epoch": 0.42218869372590745, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 48409 + }, + { + "epoch": 0.42219741501107605, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 48410 + }, + { + "epoch": 0.4222061362962446, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 48411 + }, + { + "epoch": 0.4222148575814132, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 48412 + }, + { + "epoch": 0.4222235788665818, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9976, + "step": 48413 + }, + { + "epoch": 0.42223230015175034, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 48414 + }, + { + "epoch": 0.42224102143691894, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 48415 + }, + { + "epoch": 0.42224974272208754, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 48416 + }, + { + "epoch": 0.4222584640072561, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 48417 + }, + { + "epoch": 0.4222671852924247, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 48418 + }, + { + "epoch": 0.4222759065775933, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 48419 + }, + { + "epoch": 0.42228462786276183, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 48420 + }, + { + "epoch": 0.42229334914793043, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 48421 + }, + { + "epoch": 0.42230207043309903, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 48422 + }, + { + "epoch": 0.42231079171826763, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 48423 + }, + { + "epoch": 0.4223195130034362, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0325, + "step": 48424 + }, + { + "epoch": 0.4223282342886048, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 48425 + }, + { + "epoch": 0.4223369555737734, + "grad_norm": 0.2275390625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 48426 + }, + { + "epoch": 0.4223456768589419, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 48427 + }, + { + "epoch": 0.4223543981441105, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 48428 + }, + { + "epoch": 0.4223631194292791, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 48429 + }, + { + "epoch": 0.42237184071444767, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9848, + "step": 48430 + }, + { + "epoch": 0.42238056199961627, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 48431 + }, + { + "epoch": 0.42238928328478487, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 0.9892, + "step": 48432 + }, + { + "epoch": 0.4223980045699534, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 48433 + }, + { + "epoch": 0.422406725855122, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 0.9853, + "step": 48434 + }, + { + "epoch": 0.4224154471402906, + "grad_norm": 0.248046875, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 48435 + }, + { + "epoch": 0.42242416842545916, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 48436 + }, + { + "epoch": 0.42243288971062776, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 48437 + }, + { + "epoch": 0.42244161099579636, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 48438 + }, + { + "epoch": 0.4224503322809649, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 48439 + }, + { + "epoch": 0.4224590535661335, + "grad_norm": 0.232421875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 48440 + }, + { + "epoch": 0.4224677748513021, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 48441 + }, + { + "epoch": 0.42247649613647065, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 48442 + }, + { + "epoch": 0.42248521742163925, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 48443 + }, + { + "epoch": 0.42249393870680785, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 48444 + }, + { + "epoch": 0.4225026599919764, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 48445 + }, + { + "epoch": 0.422511381277145, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 48446 + }, + { + "epoch": 0.4225201025623136, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 48447 + }, + { + "epoch": 0.42252882384748214, + "grad_norm": 0.26953125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 48448 + }, + { + "epoch": 0.42253754513265074, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 48449 + }, + { + "epoch": 0.42254626641781934, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 48450 + }, + { + "epoch": 0.42255498770298794, + "grad_norm": 0.216796875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 48451 + }, + { + "epoch": 0.4225637089881565, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 48452 + }, + { + "epoch": 0.4225724302733251, + "grad_norm": 0.19921875, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 48453 + }, + { + "epoch": 0.4225811515584937, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 48454 + }, + { + "epoch": 0.42258987284366223, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 48455 + }, + { + "epoch": 0.42259859412883083, + "grad_norm": 0.203125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 48456 + }, + { + "epoch": 0.42260731541399943, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 48457 + }, + { + "epoch": 0.422616036699168, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48458 + }, + { + "epoch": 0.4226247579843366, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 48459 + }, + { + "epoch": 0.4226334792695052, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 48460 + }, + { + "epoch": 0.4226422005546737, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 48461 + }, + { + "epoch": 0.4226509218398423, + "grad_norm": 0.240234375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 48462 + }, + { + "epoch": 0.4226596431250109, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 48463 + }, + { + "epoch": 0.42266836441017946, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 48464 + }, + { + "epoch": 0.42267708569534806, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 48465 + }, + { + "epoch": 0.42268580698051667, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.983, + "step": 48466 + }, + { + "epoch": 0.4226945282656852, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 48467 + }, + { + "epoch": 0.4227032495508538, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 48468 + }, + { + "epoch": 0.4227119708360224, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 48469 + }, + { + "epoch": 0.42272069212119096, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 48470 + }, + { + "epoch": 0.42272941340635956, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 48471 + }, + { + "epoch": 0.42273813469152816, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 48472 + }, + { + "epoch": 0.4227468559766967, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 48473 + }, + { + "epoch": 0.4227555772618653, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 48474 + }, + { + "epoch": 0.4227642985470339, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 48475 + }, + { + "epoch": 0.42277301983220245, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 48476 + }, + { + "epoch": 0.42278174111737105, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 48477 + }, + { + "epoch": 0.42279046240253965, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 48478 + }, + { + "epoch": 0.42279918368770825, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 48479 + }, + { + "epoch": 0.4228079049728768, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 48480 + }, + { + "epoch": 0.4228166262580454, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 48481 + }, + { + "epoch": 0.422825347543214, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 48482 + }, + { + "epoch": 0.42283406882838254, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 48483 + }, + { + "epoch": 0.42284279011355114, + "grad_norm": 0.171875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 48484 + }, + { + "epoch": 0.42285151139871974, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 48485 + }, + { + "epoch": 0.4228602326838883, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 48486 + }, + { + "epoch": 0.4228689539690569, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 48487 + }, + { + "epoch": 0.4228776752542255, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 48488 + }, + { + "epoch": 0.422886396539394, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 48489 + }, + { + "epoch": 0.4228951178245626, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 48490 + }, + { + "epoch": 0.4229038391097312, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 48491 + }, + { + "epoch": 0.4229125603948998, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 48492 + }, + { + "epoch": 0.4229212816800684, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 48493 + }, + { + "epoch": 0.422930002965237, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9919, + "step": 48494 + }, + { + "epoch": 0.4229387242504055, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 48495 + }, + { + "epoch": 0.4229474455355741, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 48496 + }, + { + "epoch": 0.4229561668207427, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 48497 + }, + { + "epoch": 0.42296488810591126, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 48498 + }, + { + "epoch": 0.42297360939107986, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 48499 + }, + { + "epoch": 0.42298233067624846, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.9859, + "step": 48500 + }, + { + "epoch": 0.422991051961417, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 48501 + }, + { + "epoch": 0.4229997732465856, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 48502 + }, + { + "epoch": 0.4230084945317542, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9804, + "step": 48503 + }, + { + "epoch": 0.4230172158169228, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 48504 + }, + { + "epoch": 0.42302593710209135, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 48505 + }, + { + "epoch": 0.42303465838725995, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 48506 + }, + { + "epoch": 0.42304337967242855, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 48507 + }, + { + "epoch": 0.4230521009575971, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 48508 + }, + { + "epoch": 0.4230608222427657, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 48509 + }, + { + "epoch": 0.4230695435279343, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 48510 + }, + { + "epoch": 0.42307826481310284, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 48511 + }, + { + "epoch": 0.42308698609827144, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 48512 + }, + { + "epoch": 0.42309570738344005, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 48513 + }, + { + "epoch": 0.4231044286686086, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 48514 + }, + { + "epoch": 0.4231131499537772, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 48515 + }, + { + "epoch": 0.4231218712389458, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 48516 + }, + { + "epoch": 0.42313059252411434, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 48517 + }, + { + "epoch": 0.42313931380928294, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 48518 + }, + { + "epoch": 0.42314803509445154, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 48519 + }, + { + "epoch": 0.4231567563796201, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 48520 + }, + { + "epoch": 0.4231654776647887, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 48521 + }, + { + "epoch": 0.4231741989499573, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 48522 + }, + { + "epoch": 0.4231829202351258, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0307, + "step": 48523 + }, + { + "epoch": 0.4231916415202944, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 48524 + }, + { + "epoch": 0.423200362805463, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 48525 + }, + { + "epoch": 0.42320908409063157, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 48526 + }, + { + "epoch": 0.42321780537580017, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9782, + "step": 48527 + }, + { + "epoch": 0.42322652666096877, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 48528 + }, + { + "epoch": 0.4232352479461373, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 48529 + }, + { + "epoch": 0.4232439692313059, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 48530 + }, + { + "epoch": 0.4232526905164745, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48531 + }, + { + "epoch": 0.4232614118016431, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 48532 + }, + { + "epoch": 0.42327013308681166, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 48533 + }, + { + "epoch": 0.42327885437198026, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 48534 + }, + { + "epoch": 0.42328757565714886, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 48535 + }, + { + "epoch": 0.4232962969423174, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 48536 + }, + { + "epoch": 0.423305018227486, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 48537 + }, + { + "epoch": 0.4233137395126546, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 48538 + }, + { + "epoch": 0.42332246079782315, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 48539 + }, + { + "epoch": 0.42333118208299175, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 48540 + }, + { + "epoch": 0.42333990336816035, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0239, + "step": 48541 + }, + { + "epoch": 0.4233486246533289, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 48542 + }, + { + "epoch": 0.4233573459384975, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 48543 + }, + { + "epoch": 0.4233660672236661, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 48544 + }, + { + "epoch": 0.42337478850883464, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 48545 + }, + { + "epoch": 0.42338350979400324, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 48546 + }, + { + "epoch": 0.42339223107917184, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 48547 + }, + { + "epoch": 0.4234009523643404, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 48548 + }, + { + "epoch": 0.423409673649509, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 48549 + }, + { + "epoch": 0.4234183949346776, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9809, + "step": 48550 + }, + { + "epoch": 0.42342711621984613, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 48551 + }, + { + "epoch": 0.42343583750501473, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 48552 + }, + { + "epoch": 0.42344455879018333, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 48553 + }, + { + "epoch": 0.4234532800753519, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 48554 + }, + { + "epoch": 0.4234620013605205, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 48555 + }, + { + "epoch": 0.4234707226456891, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 48556 + }, + { + "epoch": 0.4234794439308576, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 48557 + }, + { + "epoch": 0.4234881652160262, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 48558 + }, + { + "epoch": 0.4234968865011948, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 48559 + }, + { + "epoch": 0.4235056077863634, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 48560 + }, + { + "epoch": 0.42351432907153197, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 48561 + }, + { + "epoch": 0.42352305035670057, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 48562 + }, + { + "epoch": 0.42353177164186917, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9848, + "step": 48563 + }, + { + "epoch": 0.4235404929270377, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0321, + "step": 48564 + }, + { + "epoch": 0.4235492142122063, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 48565 + }, + { + "epoch": 0.4235579354973749, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 48566 + }, + { + "epoch": 0.42356665678254346, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48567 + }, + { + "epoch": 0.42357537806771206, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 48568 + }, + { + "epoch": 0.42358409935288066, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 48569 + }, + { + "epoch": 0.4235928206380492, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 48570 + }, + { + "epoch": 0.4236015419232178, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 48571 + }, + { + "epoch": 0.4236102632083864, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 48572 + }, + { + "epoch": 0.42361898449355495, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 48573 + }, + { + "epoch": 0.42362770577872355, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 48574 + }, + { + "epoch": 0.42363642706389215, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 48575 + }, + { + "epoch": 0.4236451483490607, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 48576 + }, + { + "epoch": 0.4236538696342293, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9866, + "step": 48577 + }, + { + "epoch": 0.4236625909193979, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 48578 + }, + { + "epoch": 0.42367131220456644, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 48579 + }, + { + "epoch": 0.42368003348973504, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 48580 + }, + { + "epoch": 0.42368875477490364, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 48581 + }, + { + "epoch": 0.4236974760600722, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 48582 + }, + { + "epoch": 0.4237061973452408, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 48583 + }, + { + "epoch": 0.4237149186304094, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 48584 + }, + { + "epoch": 0.42372363991557793, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 48585 + }, + { + "epoch": 0.42373236120074653, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 48586 + }, + { + "epoch": 0.42374108248591513, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 48587 + }, + { + "epoch": 0.42374980377108373, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 48588 + }, + { + "epoch": 0.4237585250562523, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 48589 + }, + { + "epoch": 0.4237672463414209, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 48590 + }, + { + "epoch": 0.4237759676265895, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9875, + "step": 48591 + }, + { + "epoch": 0.423784688911758, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 48592 + }, + { + "epoch": 0.4237934101969266, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 48593 + }, + { + "epoch": 0.4238021314820952, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 48594 + }, + { + "epoch": 0.42381085276726377, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 48595 + }, + { + "epoch": 0.42381957405243237, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 48596 + }, + { + "epoch": 0.42382829533760097, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 48597 + }, + { + "epoch": 0.4238370166227695, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 48598 + }, + { + "epoch": 0.4238457379079381, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 48599 + }, + { + "epoch": 0.4238544591931067, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 48600 + }, + { + "epoch": 0.42386318047827526, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 48601 + }, + { + "epoch": 0.42387190176344386, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 48602 + }, + { + "epoch": 0.42388062304861246, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 48603 + }, + { + "epoch": 0.423889344333781, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 48604 + }, + { + "epoch": 0.4238980656189496, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 48605 + }, + { + "epoch": 0.4239067869041182, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 48606 + }, + { + "epoch": 0.42391550818928675, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0296, + "step": 48607 + }, + { + "epoch": 0.42392422947445535, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 48608 + }, + { + "epoch": 0.42393295075962395, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 48609 + }, + { + "epoch": 0.4239416720447925, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 48610 + }, + { + "epoch": 0.4239503933299611, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 48611 + }, + { + "epoch": 0.4239591146151297, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 48612 + }, + { + "epoch": 0.4239678359002983, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 48613 + }, + { + "epoch": 0.42397655718546684, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 48614 + }, + { + "epoch": 0.42398527847063544, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 48615 + }, + { + "epoch": 0.42399399975580404, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 48616 + }, + { + "epoch": 0.4240027210409726, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 48617 + }, + { + "epoch": 0.4240114423261412, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 48618 + }, + { + "epoch": 0.4240201636113098, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 48619 + }, + { + "epoch": 0.42402888489647833, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 48620 + }, + { + "epoch": 0.42403760618164693, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9776, + "step": 48621 + }, + { + "epoch": 0.42404632746681553, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 48622 + }, + { + "epoch": 0.4240550487519841, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 48623 + }, + { + "epoch": 0.4240637700371527, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 48624 + }, + { + "epoch": 0.4240724913223213, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 48625 + }, + { + "epoch": 0.4240812126074898, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 48626 + }, + { + "epoch": 0.4240899338926584, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 48627 + }, + { + "epoch": 0.424098655177827, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 0.9782, + "step": 48628 + }, + { + "epoch": 0.42410737646299557, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 48629 + }, + { + "epoch": 0.42411609774816417, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 48630 + }, + { + "epoch": 0.42412481903333277, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 48631 + }, + { + "epoch": 0.4241335403185013, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 48632 + }, + { + "epoch": 0.4241422616036699, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 48633 + }, + { + "epoch": 0.4241509828888385, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 48634 + }, + { + "epoch": 0.42415970417400706, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 48635 + }, + { + "epoch": 0.42416842545917566, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 48636 + }, + { + "epoch": 0.42417714674434426, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 48637 + }, + { + "epoch": 0.4241858680295128, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 48638 + }, + { + "epoch": 0.4241945893146814, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0237, + "step": 48639 + }, + { + "epoch": 0.42420331059985, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.986, + "step": 48640 + }, + { + "epoch": 0.4242120318850186, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 48641 + }, + { + "epoch": 0.42422075317018715, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48642 + }, + { + "epoch": 0.42422947445535575, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 48643 + }, + { + "epoch": 0.42423819574052435, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 48644 + }, + { + "epoch": 0.4242469170256929, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 48645 + }, + { + "epoch": 0.4242556383108615, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 48646 + }, + { + "epoch": 0.4242643595960301, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 48647 + }, + { + "epoch": 0.42427308088119864, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 48648 + }, + { + "epoch": 0.42428180216636724, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 48649 + }, + { + "epoch": 0.42429052345153584, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 48650 + }, + { + "epoch": 0.4242992447367044, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 48651 + }, + { + "epoch": 0.424307966021873, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 48652 + }, + { + "epoch": 0.4243166873070416, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 48653 + }, + { + "epoch": 0.42432540859221013, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 48654 + }, + { + "epoch": 0.42433412987737873, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 48655 + }, + { + "epoch": 0.42434285116254733, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 48656 + }, + { + "epoch": 0.4243515724477159, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 48657 + }, + { + "epoch": 0.4243602937328845, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 48658 + }, + { + "epoch": 0.4243690150180531, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 48659 + }, + { + "epoch": 0.4243777363032216, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 48660 + }, + { + "epoch": 0.4243864575883902, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 48661 + }, + { + "epoch": 0.4243951788735588, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 48662 + }, + { + "epoch": 0.42440390015872737, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 48663 + }, + { + "epoch": 0.42441262144389597, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 48664 + }, + { + "epoch": 0.42442134272906457, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 48665 + }, + { + "epoch": 0.4244300640142331, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 48666 + }, + { + "epoch": 0.4244387852994017, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 48667 + }, + { + "epoch": 0.4244475065845703, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 48668 + }, + { + "epoch": 0.4244562278697389, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 48669 + }, + { + "epoch": 0.42446494915490746, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 48670 + }, + { + "epoch": 0.42447367044007606, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 48671 + }, + { + "epoch": 0.42448239172524466, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 48672 + }, + { + "epoch": 0.4244911130104132, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 48673 + }, + { + "epoch": 0.4244998342955818, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 48674 + }, + { + "epoch": 0.4245085555807504, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 48675 + }, + { + "epoch": 0.42451727686591895, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 48676 + }, + { + "epoch": 0.42452599815108755, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 48677 + }, + { + "epoch": 0.42453471943625615, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 48678 + }, + { + "epoch": 0.4245434407214247, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 48679 + }, + { + "epoch": 0.4245521620065933, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 48680 + }, + { + "epoch": 0.4245608832917619, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 48681 + }, + { + "epoch": 0.42456960457693044, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 48682 + }, + { + "epoch": 0.42457832586209904, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 48683 + }, + { + "epoch": 0.42458704714726764, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 48684 + }, + { + "epoch": 0.4245957684324362, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 48685 + }, + { + "epoch": 0.4246044897176048, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 48686 + }, + { + "epoch": 0.4246132110027734, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 48687 + }, + { + "epoch": 0.42462193228794193, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 48688 + }, + { + "epoch": 0.42463065357311053, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 48689 + }, + { + "epoch": 0.42463937485827913, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9821, + "step": 48690 + }, + { + "epoch": 0.4246480961434477, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 48691 + }, + { + "epoch": 0.4246568174286163, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 48692 + }, + { + "epoch": 0.4246655387137849, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 48693 + }, + { + "epoch": 0.4246742599989534, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 48694 + }, + { + "epoch": 0.424682981284122, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 48695 + }, + { + "epoch": 0.4246917025692906, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 48696 + }, + { + "epoch": 0.4247004238544592, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 48697 + }, + { + "epoch": 0.42470914513962776, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 48698 + }, + { + "epoch": 0.42471786642479636, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 48699 + }, + { + "epoch": 0.42472658770996496, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 48700 + }, + { + "epoch": 0.4247353089951335, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0269, + "step": 48701 + }, + { + "epoch": 0.4247440302803021, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 48702 + }, + { + "epoch": 0.4247527515654707, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 48703 + }, + { + "epoch": 0.42476147285063925, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 48704 + }, + { + "epoch": 0.42477019413580785, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0124, + "step": 48705 + }, + { + "epoch": 0.42477891542097646, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 48706 + }, + { + "epoch": 0.424787636706145, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0245, + "step": 48707 + }, + { + "epoch": 0.4247963579913136, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 48708 + }, + { + "epoch": 0.4248050792764822, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0284, + "step": 48709 + }, + { + "epoch": 0.42481380056165075, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 48710 + }, + { + "epoch": 0.42482252184681935, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 48711 + }, + { + "epoch": 0.42483124313198795, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 48712 + }, + { + "epoch": 0.4248399644171565, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 48713 + }, + { + "epoch": 0.4248486857023251, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 48714 + }, + { + "epoch": 0.4248574069874937, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 48715 + }, + { + "epoch": 0.42486612827266224, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 48716 + }, + { + "epoch": 0.42487484955783084, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 48717 + }, + { + "epoch": 0.42488357084299944, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 48718 + }, + { + "epoch": 0.424892292128168, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 48719 + }, + { + "epoch": 0.4249010134133366, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 48720 + }, + { + "epoch": 0.4249097346985052, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 48721 + }, + { + "epoch": 0.4249184559836738, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 48722 + }, + { + "epoch": 0.4249271772688423, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 48723 + }, + { + "epoch": 0.4249358985540109, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 48724 + }, + { + "epoch": 0.4249446198391795, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 48725 + }, + { + "epoch": 0.42495334112434807, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 48726 + }, + { + "epoch": 0.42496206240951667, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 48727 + }, + { + "epoch": 0.4249707836946853, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 48728 + }, + { + "epoch": 0.4249795049798538, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9974, + "step": 48729 + }, + { + "epoch": 0.4249882262650224, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 48730 + }, + { + "epoch": 0.424996947550191, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 48731 + }, + { + "epoch": 0.42500566883535956, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9927, + "step": 48732 + }, + { + "epoch": 0.42501439012052816, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 48733 + }, + { + "epoch": 0.42502311140569676, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 48734 + }, + { + "epoch": 0.4250318326908653, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 48735 + }, + { + "epoch": 0.4250405539760339, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.015, + "step": 48736 + }, + { + "epoch": 0.4250492752612025, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 48737 + }, + { + "epoch": 0.42505799654637105, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 48738 + }, + { + "epoch": 0.42506671783153965, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 48739 + }, + { + "epoch": 0.42507543911670825, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 48740 + }, + { + "epoch": 0.4250841604018768, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0282, + "step": 48741 + }, + { + "epoch": 0.4250928816870454, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 48742 + }, + { + "epoch": 0.425101602972214, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0273, + "step": 48743 + }, + { + "epoch": 0.42511032425738254, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 48744 + }, + { + "epoch": 0.42511904554255114, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 48745 + }, + { + "epoch": 0.42512776682771974, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 48746 + }, + { + "epoch": 0.4251364881128883, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 48747 + }, + { + "epoch": 0.4251452093980569, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 48748 + }, + { + "epoch": 0.4251539306832255, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9912, + "step": 48749 + }, + { + "epoch": 0.4251626519683941, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 48750 + }, + { + "epoch": 0.42517137325356263, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 48751 + }, + { + "epoch": 0.42518009453873123, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 48752 + }, + { + "epoch": 0.42518881582389983, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 48753 + }, + { + "epoch": 0.4251975371090684, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 48754 + }, + { + "epoch": 0.425206258394237, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 48755 + }, + { + "epoch": 0.4252149796794056, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 48756 + }, + { + "epoch": 0.4252237009645741, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 48757 + }, + { + "epoch": 0.4252324222497427, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 48758 + }, + { + "epoch": 0.4252411435349113, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 48759 + }, + { + "epoch": 0.42524986482007987, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 48760 + }, + { + "epoch": 0.42525858610524847, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 48761 + }, + { + "epoch": 0.42526730739041707, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 48762 + }, + { + "epoch": 0.4252760286755856, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 48763 + }, + { + "epoch": 0.4252847499607542, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 48764 + }, + { + "epoch": 0.4252934712459228, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 48765 + }, + { + "epoch": 0.42530219253109136, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 48766 + }, + { + "epoch": 0.42531091381625996, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0208, + "step": 48767 + }, + { + "epoch": 0.42531963510142856, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 48768 + }, + { + "epoch": 0.4253283563865971, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 48769 + }, + { + "epoch": 0.4253370776717657, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 48770 + }, + { + "epoch": 0.4253457989569343, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 48771 + }, + { + "epoch": 0.42535452024210285, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 48772 + }, + { + "epoch": 0.42536324152727145, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 48773 + }, + { + "epoch": 0.42537196281244005, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 48774 + }, + { + "epoch": 0.4253806840976086, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9898, + "step": 48775 + }, + { + "epoch": 0.4253894053827772, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 48776 + }, + { + "epoch": 0.4253981266679458, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 48777 + }, + { + "epoch": 0.4254068479531144, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 48778 + }, + { + "epoch": 0.42541556923828294, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0227, + "step": 48779 + }, + { + "epoch": 0.42542429052345154, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 48780 + }, + { + "epoch": 0.42543301180862014, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 48781 + }, + { + "epoch": 0.4254417330937887, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 48782 + }, + { + "epoch": 0.4254504543789573, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 48783 + }, + { + "epoch": 0.4254591756641259, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9922, + "step": 48784 + }, + { + "epoch": 0.42546789694929443, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 48785 + }, + { + "epoch": 0.42547661823446303, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 48786 + }, + { + "epoch": 0.42548533951963163, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 48787 + }, + { + "epoch": 0.4254940608048002, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 48788 + }, + { + "epoch": 0.4255027820899688, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 48789 + }, + { + "epoch": 0.4255115033751374, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 48790 + }, + { + "epoch": 0.4255202246603059, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 48791 + }, + { + "epoch": 0.4255289459454745, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 48792 + }, + { + "epoch": 0.4255376672306431, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 48793 + }, + { + "epoch": 0.42554638851581167, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 48794 + }, + { + "epoch": 0.42555510980098027, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 48795 + }, + { + "epoch": 0.42556383108614887, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 48796 + }, + { + "epoch": 0.4255725523713174, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 48797 + }, + { + "epoch": 0.425581273656486, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 48798 + }, + { + "epoch": 0.4255899949416546, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 48799 + }, + { + "epoch": 0.42559871622682316, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 48800 + }, + { + "epoch": 0.42560743751199176, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 48801 + }, + { + "epoch": 0.42561615879716036, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 48802 + }, + { + "epoch": 0.4256248800823289, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 48803 + }, + { + "epoch": 0.4256336013674975, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 48804 + }, + { + "epoch": 0.4256423226526661, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 48805 + }, + { + "epoch": 0.4256510439378347, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 48806 + }, + { + "epoch": 0.42565976522300325, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 48807 + }, + { + "epoch": 0.42566848650817185, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 48808 + }, + { + "epoch": 0.42567720779334045, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 48809 + }, + { + "epoch": 0.425685929078509, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 48810 + }, + { + "epoch": 0.4256946503636776, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 48811 + }, + { + "epoch": 0.4257033716488462, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 48812 + }, + { + "epoch": 0.42571209293401474, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9865, + "step": 48813 + }, + { + "epoch": 0.42572081421918334, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0266, + "step": 48814 + }, + { + "epoch": 0.42572953550435194, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 48815 + }, + { + "epoch": 0.4257382567895205, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48816 + }, + { + "epoch": 0.4257469780746891, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 48817 + }, + { + "epoch": 0.4257556993598577, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 48818 + }, + { + "epoch": 0.42576442064502623, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 48819 + }, + { + "epoch": 0.42577314193019483, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 48820 + }, + { + "epoch": 0.42578186321536343, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9959, + "step": 48821 + }, + { + "epoch": 0.425790584500532, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 48822 + }, + { + "epoch": 0.4257993057857006, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 48823 + }, + { + "epoch": 0.4258080270708692, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 48824 + }, + { + "epoch": 0.4258167483560377, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 48825 + }, + { + "epoch": 0.4258254696412063, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 48826 + }, + { + "epoch": 0.4258341909263749, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 48827 + }, + { + "epoch": 0.42584291221154347, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 48828 + }, + { + "epoch": 0.42585163349671207, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 48829 + }, + { + "epoch": 0.42586035478188067, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0019, + "step": 48830 + }, + { + "epoch": 0.42586907606704927, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 48831 + }, + { + "epoch": 0.4258777973522178, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 48832 + }, + { + "epoch": 0.4258865186373864, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 48833 + }, + { + "epoch": 0.425895239922555, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 48834 + }, + { + "epoch": 0.42590396120772356, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 48835 + }, + { + "epoch": 0.42591268249289216, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 48836 + }, + { + "epoch": 0.42592140377806076, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 48837 + }, + { + "epoch": 0.4259301250632293, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 48838 + }, + { + "epoch": 0.4259388463483979, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 48839 + }, + { + "epoch": 0.4259475676335665, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 48840 + }, + { + "epoch": 0.42595628891873505, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 48841 + }, + { + "epoch": 0.42596501020390365, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 48842 + }, + { + "epoch": 0.42597373148907225, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 48843 + }, + { + "epoch": 0.4259824527742408, + "grad_norm": 0.302734375, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 48844 + }, + { + "epoch": 0.4259911740594094, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 48845 + }, + { + "epoch": 0.425999895344578, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 48846 + }, + { + "epoch": 0.42600861662974654, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 48847 + }, + { + "epoch": 0.42601733791491514, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 48848 + }, + { + "epoch": 0.42602605920008374, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 48849 + }, + { + "epoch": 0.4260347804852523, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 48850 + }, + { + "epoch": 0.4260435017704209, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 48851 + }, + { + "epoch": 0.4260522230555895, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 48852 + }, + { + "epoch": 0.42606094434075803, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 48853 + }, + { + "epoch": 0.42606966562592663, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 48854 + }, + { + "epoch": 0.42607838691109523, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 48855 + }, + { + "epoch": 0.4260871081962638, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 48856 + }, + { + "epoch": 0.4260958294814324, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 48857 + }, + { + "epoch": 0.426104550766601, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 48858 + }, + { + "epoch": 0.4261132720517696, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 48859 + }, + { + "epoch": 0.4261219933369381, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 48860 + }, + { + "epoch": 0.4261307146221067, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 48861 + }, + { + "epoch": 0.4261394359072753, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 48862 + }, + { + "epoch": 0.42614815719244387, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 48863 + }, + { + "epoch": 0.42615687847761247, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 48864 + }, + { + "epoch": 0.42616559976278107, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 48865 + }, + { + "epoch": 0.4261743210479496, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 48866 + }, + { + "epoch": 0.4261830423331182, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 48867 + }, + { + "epoch": 0.4261917636182868, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 48868 + }, + { + "epoch": 0.42620048490345536, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 48869 + }, + { + "epoch": 0.42620920618862396, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.984, + "step": 48870 + }, + { + "epoch": 0.42621792747379256, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0276, + "step": 48871 + }, + { + "epoch": 0.4262266487589611, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 48872 + }, + { + "epoch": 0.4262353700441297, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 48873 + }, + { + "epoch": 0.4262440913292983, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 48874 + }, + { + "epoch": 0.42625281261446685, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 48875 + }, + { + "epoch": 0.42626153389963545, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 48876 + }, + { + "epoch": 0.42627025518480405, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 48877 + }, + { + "epoch": 0.4262789764699726, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 48878 + }, + { + "epoch": 0.4262876977551412, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9873, + "step": 48879 + }, + { + "epoch": 0.4262964190403098, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9882, + "step": 48880 + }, + { + "epoch": 0.42630514032547834, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 48881 + }, + { + "epoch": 0.42631386161064694, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 48882 + }, + { + "epoch": 0.42632258289581554, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 48883 + }, + { + "epoch": 0.4263313041809841, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 48884 + }, + { + "epoch": 0.4263400254661527, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 48885 + }, + { + "epoch": 0.4263487467513213, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 48886 + }, + { + "epoch": 0.4263574680364899, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 48887 + }, + { + "epoch": 0.42636618932165843, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 48888 + }, + { + "epoch": 0.42637491060682703, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 48889 + }, + { + "epoch": 0.42638363189199563, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 48890 + }, + { + "epoch": 0.4263923531771642, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 48891 + }, + { + "epoch": 0.4264010744623328, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 48892 + }, + { + "epoch": 0.4264097957475014, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9798, + "step": 48893 + }, + { + "epoch": 0.4264185170326699, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9896, + "step": 48894 + }, + { + "epoch": 0.4264272383178385, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 48895 + }, + { + "epoch": 0.4264359596030071, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 48896 + }, + { + "epoch": 0.42644468088817566, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 48897 + }, + { + "epoch": 0.42645340217334426, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 48898 + }, + { + "epoch": 0.42646212345851287, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 48899 + }, + { + "epoch": 0.4264708447436814, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 48900 + }, + { + "epoch": 0.42647956602885, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 48901 + }, + { + "epoch": 0.4264882873140186, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 48902 + }, + { + "epoch": 0.42649700859918716, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 48903 + }, + { + "epoch": 0.42650572988435576, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 48904 + }, + { + "epoch": 0.42651445116952436, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 48905 + }, + { + "epoch": 0.4265231724546929, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 48906 + }, + { + "epoch": 0.4265318937398615, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 48907 + }, + { + "epoch": 0.4265406150250301, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 48908 + }, + { + "epoch": 0.42654933631019865, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 48909 + }, + { + "epoch": 0.42655805759536725, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 48910 + }, + { + "epoch": 0.42656677888053585, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 48911 + }, + { + "epoch": 0.4265755001657044, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9987, + "step": 48912 + }, + { + "epoch": 0.426584221450873, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 48913 + }, + { + "epoch": 0.4265929427360416, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 48914 + }, + { + "epoch": 0.4266016640212102, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 48915 + }, + { + "epoch": 0.42661038530637874, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 48916 + }, + { + "epoch": 0.42661910659154734, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9958, + "step": 48917 + }, + { + "epoch": 0.42662782787671594, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 48918 + }, + { + "epoch": 0.4266365491618845, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 48919 + }, + { + "epoch": 0.4266452704470531, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 0.9907, + "step": 48920 + }, + { + "epoch": 0.4266539917322217, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 48921 + }, + { + "epoch": 0.4266627130173902, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 48922 + }, + { + "epoch": 0.4266714343025588, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0344, + "step": 48923 + }, + { + "epoch": 0.4266801555877274, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 48924 + }, + { + "epoch": 0.426688876872896, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 48925 + }, + { + "epoch": 0.4266975981580646, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 48926 + }, + { + "epoch": 0.4267063194432332, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.02, + "step": 48927 + }, + { + "epoch": 0.4267150407284017, + "grad_norm": 0.30078125, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 48928 + }, + { + "epoch": 0.4267237620135703, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 48929 + }, + { + "epoch": 0.4267324832987389, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 48930 + }, + { + "epoch": 0.42674120458390746, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9811, + "step": 48931 + }, + { + "epoch": 0.42674992586907606, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0205, + "step": 48932 + }, + { + "epoch": 0.42675864715424466, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.986, + "step": 48933 + }, + { + "epoch": 0.4267673684394132, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 48934 + }, + { + "epoch": 0.4267760897245818, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 48935 + }, + { + "epoch": 0.4267848110097504, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9776, + "step": 48936 + }, + { + "epoch": 0.42679353229491895, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 48937 + }, + { + "epoch": 0.42680225358008755, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 48938 + }, + { + "epoch": 0.42681097486525615, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 48939 + }, + { + "epoch": 0.4268196961504247, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 48940 + }, + { + "epoch": 0.4268284174355933, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9856, + "step": 48941 + }, + { + "epoch": 0.4268371387207619, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9754, + "step": 48942 + }, + { + "epoch": 0.4268458600059305, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 48943 + }, + { + "epoch": 0.42685458129109904, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 48944 + }, + { + "epoch": 0.42686330257626764, + "grad_norm": 0.25390625, + "learning_rate": 0.0005, + "loss": 1.0062, + "step": 48945 + }, + { + "epoch": 0.42687202386143624, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0271, + "step": 48946 + }, + { + "epoch": 0.4268807451466048, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 48947 + }, + { + "epoch": 0.4268894664317734, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 48948 + }, + { + "epoch": 0.426898187716942, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 48949 + }, + { + "epoch": 0.42690690900211054, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 48950 + }, + { + "epoch": 0.42691563028727914, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0219, + "step": 48951 + }, + { + "epoch": 0.42692435157244774, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0272, + "step": 48952 + }, + { + "epoch": 0.4269330728576163, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0263, + "step": 48953 + }, + { + "epoch": 0.4269417941427849, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 48954 + }, + { + "epoch": 0.4269505154279535, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 48955 + }, + { + "epoch": 0.426959236713122, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 48956 + }, + { + "epoch": 0.4269679579982906, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9759, + "step": 48957 + }, + { + "epoch": 0.4269766792834592, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 48958 + }, + { + "epoch": 0.42698540056862777, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 48959 + }, + { + "epoch": 0.42699412185379637, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 48960 + }, + { + "epoch": 0.42700284313896497, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 48961 + }, + { + "epoch": 0.4270115644241335, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 48962 + }, + { + "epoch": 0.4270202857093021, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 48963 + }, + { + "epoch": 0.4270290069944707, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 48964 + }, + { + "epoch": 0.42703772827963926, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9846, + "step": 48965 + }, + { + "epoch": 0.42704644956480786, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 48966 + }, + { + "epoch": 0.42705517084997646, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 48967 + }, + { + "epoch": 0.42706389213514506, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9763, + "step": 48968 + }, + { + "epoch": 0.4270726134203136, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 48969 + }, + { + "epoch": 0.4270813347054822, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 48970 + }, + { + "epoch": 0.4270900559906508, + "grad_norm": 0.1884765625, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 48971 + }, + { + "epoch": 0.42709877727581935, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 48972 + }, + { + "epoch": 0.42710749856098795, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 48973 + }, + { + "epoch": 0.42711621984615655, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0375, + "step": 48974 + }, + { + "epoch": 0.4271249411313251, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 48975 + }, + { + "epoch": 0.4271336624164937, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9821, + "step": 48976 + }, + { + "epoch": 0.4271423837016623, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0211, + "step": 48977 + }, + { + "epoch": 0.42715110498683084, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9853, + "step": 48978 + }, + { + "epoch": 0.42715982627199944, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 48979 + }, + { + "epoch": 0.42716854755716804, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 48980 + }, + { + "epoch": 0.4271772688423366, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9836, + "step": 48981 + }, + { + "epoch": 0.4271859901275052, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 48982 + }, + { + "epoch": 0.4271947114126738, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9901, + "step": 48983 + }, + { + "epoch": 0.42720343269784233, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 48984 + }, + { + "epoch": 0.42721215398301093, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 48985 + }, + { + "epoch": 0.42722087526817953, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 48986 + }, + { + "epoch": 0.4272295965533481, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 48987 + }, + { + "epoch": 0.4272383178385167, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 48988 + }, + { + "epoch": 0.4272470391236853, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 48989 + }, + { + "epoch": 0.4272557604088538, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 48990 + }, + { + "epoch": 0.4272644816940224, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9884, + "step": 48991 + }, + { + "epoch": 0.427273202979191, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 48992 + }, + { + "epoch": 0.42728192426435957, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 48993 + }, + { + "epoch": 0.42729064554952817, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 48994 + }, + { + "epoch": 0.42729936683469677, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 48995 + }, + { + "epoch": 0.42730808811986537, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 48996 + }, + { + "epoch": 0.4273168094050339, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 48997 + }, + { + "epoch": 0.4273255306902025, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 48998 + }, + { + "epoch": 0.4273342519753711, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 48999 + }, + { + "epoch": 0.42734297326053966, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.027, + "step": 49000 + }, + { + "epoch": 0.42735169454570826, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 49001 + }, + { + "epoch": 0.42736041583087686, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 49002 + }, + { + "epoch": 0.4273691371160454, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 49003 + }, + { + "epoch": 0.427377858401214, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 49004 + }, + { + "epoch": 0.4273865796863826, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9863, + "step": 49005 + }, + { + "epoch": 0.42739530097155115, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 49006 + }, + { + "epoch": 0.42740402225671975, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 49007 + }, + { + "epoch": 0.42741274354188835, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 49008 + }, + { + "epoch": 0.4274214648270569, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 49009 + }, + { + "epoch": 0.4274301861122255, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0121, + "step": 49010 + }, + { + "epoch": 0.4274389073973941, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 49011 + }, + { + "epoch": 0.42744762868256264, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 49012 + }, + { + "epoch": 0.42745634996773124, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 49013 + }, + { + "epoch": 0.42746507125289984, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 49014 + }, + { + "epoch": 0.4274737925380684, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 49015 + }, + { + "epoch": 0.427482513823237, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9669, + "step": 49016 + }, + { + "epoch": 0.4274912351084056, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 49017 + }, + { + "epoch": 0.42749995639357413, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 49018 + }, + { + "epoch": 0.42750867767874273, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0259, + "step": 49019 + }, + { + "epoch": 0.42751739896391133, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 49020 + }, + { + "epoch": 0.4275261202490799, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 49021 + }, + { + "epoch": 0.4275348415342485, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 49022 + }, + { + "epoch": 0.4275435628194171, + "grad_norm": 0.1875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 49023 + }, + { + "epoch": 0.4275522841045857, + "grad_norm": 0.1826171875, + "learning_rate": 0.0005, + "loss": 0.9877, + "step": 49024 + }, + { + "epoch": 0.4275610053897542, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 49025 + }, + { + "epoch": 0.4275697266749228, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 49026 + }, + { + "epoch": 0.4275784479600914, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 49027 + }, + { + "epoch": 0.42758716924525997, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 49028 + }, + { + "epoch": 0.42759589053042857, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 49029 + }, + { + "epoch": 0.42760461181559717, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 49030 + }, + { + "epoch": 0.4276133331007657, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 49031 + }, + { + "epoch": 0.4276220543859343, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 49032 + }, + { + "epoch": 0.4276307756711029, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 49033 + }, + { + "epoch": 0.42763949695627146, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.988, + "step": 49034 + }, + { + "epoch": 0.42764821824144006, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9705, + "step": 49035 + }, + { + "epoch": 0.42765693952660866, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 49036 + }, + { + "epoch": 0.4276656608117772, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 49037 + }, + { + "epoch": 0.4276743820969458, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 49038 + }, + { + "epoch": 0.4276831033821144, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 49039 + }, + { + "epoch": 0.42769182466728295, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 49040 + }, + { + "epoch": 0.42770054595245155, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 49041 + }, + { + "epoch": 0.42770926723762015, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 49042 + }, + { + "epoch": 0.4277179885227887, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 49043 + }, + { + "epoch": 0.4277267098079573, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 49044 + }, + { + "epoch": 0.4277354310931259, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 49045 + }, + { + "epoch": 0.42774415237829444, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 49046 + }, + { + "epoch": 0.42775287366346304, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 49047 + }, + { + "epoch": 0.42776159494863164, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 49048 + }, + { + "epoch": 0.4277703162338002, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 49049 + }, + { + "epoch": 0.4277790375189688, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 49050 + }, + { + "epoch": 0.4277877588041374, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 49051 + }, + { + "epoch": 0.427796480089306, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 49052 + }, + { + "epoch": 0.42780520137447453, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 49053 + }, + { + "epoch": 0.42781392265964313, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9771, + "step": 49054 + }, + { + "epoch": 0.42782264394481173, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0223, + "step": 49055 + }, + { + "epoch": 0.4278313652299803, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 0.9928, + "step": 49056 + }, + { + "epoch": 0.4278400865151489, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9716, + "step": 49057 + }, + { + "epoch": 0.4278488078003175, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 49058 + }, + { + "epoch": 0.427857529085486, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 49059 + }, + { + "epoch": 0.4278662503706546, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 49060 + }, + { + "epoch": 0.4278749716558232, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 49061 + }, + { + "epoch": 0.42788369294099177, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.019, + "step": 49062 + }, + { + "epoch": 0.42789241422616037, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 49063 + }, + { + "epoch": 0.42790113551132897, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.999, + "step": 49064 + }, + { + "epoch": 0.4279098567964975, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 49065 + }, + { + "epoch": 0.4279185780816661, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 49066 + }, + { + "epoch": 0.4279272993668347, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 49067 + }, + { + "epoch": 0.42793602065200326, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 49068 + }, + { + "epoch": 0.42794474193717186, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 49069 + }, + { + "epoch": 0.42795346322234046, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 49070 + }, + { + "epoch": 0.427962184507509, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0104, + "step": 49071 + }, + { + "epoch": 0.4279709057926776, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 49072 + }, + { + "epoch": 0.4279796270778462, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 49073 + }, + { + "epoch": 0.42798834836301475, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 49074 + }, + { + "epoch": 0.42799706964818335, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0345, + "step": 49075 + }, + { + "epoch": 0.42800579093335195, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 49076 + }, + { + "epoch": 0.42801451221852055, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 49077 + }, + { + "epoch": 0.4280232335036891, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 49078 + }, + { + "epoch": 0.4280319547888577, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 49079 + }, + { + "epoch": 0.4280406760740263, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0356, + "step": 49080 + }, + { + "epoch": 0.42804939735919484, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 49081 + }, + { + "epoch": 0.42805811864436344, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 49082 + }, + { + "epoch": 0.42806683992953204, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 49083 + }, + { + "epoch": 0.4280755612147006, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0071, + "step": 49084 + }, + { + "epoch": 0.4280842824998692, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 49085 + }, + { + "epoch": 0.4280930037850378, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 49086 + }, + { + "epoch": 0.42810172507020633, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 49087 + }, + { + "epoch": 0.42811044635537493, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 49088 + }, + { + "epoch": 0.42811916764054353, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 49089 + }, + { + "epoch": 0.4281278889257121, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 49090 + }, + { + "epoch": 0.4281366102108807, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 49091 + }, + { + "epoch": 0.4281453314960493, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 49092 + }, + { + "epoch": 0.4281540527812178, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 49093 + }, + { + "epoch": 0.4281627740663864, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 49094 + }, + { + "epoch": 0.428171495351555, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 49095 + }, + { + "epoch": 0.42818021663672357, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9731, + "step": 49096 + }, + { + "epoch": 0.42818893792189217, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 49097 + }, + { + "epoch": 0.42819765920706077, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0001, + "step": 49098 + }, + { + "epoch": 0.4282063804922293, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 49099 + }, + { + "epoch": 0.4282151017773979, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 49100 + }, + { + "epoch": 0.4282238230625665, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0129, + "step": 49101 + }, + { + "epoch": 0.42823254434773506, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 49102 + }, + { + "epoch": 0.42824126563290366, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 49103 + }, + { + "epoch": 0.42824998691807226, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 49104 + }, + { + "epoch": 0.42825870820324086, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 49105 + }, + { + "epoch": 0.4282674294884094, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9825, + "step": 49106 + }, + { + "epoch": 0.428276150773578, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 49107 + }, + { + "epoch": 0.4282848720587466, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 49108 + }, + { + "epoch": 0.42829359334391515, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 49109 + }, + { + "epoch": 0.42830231462908375, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 49110 + }, + { + "epoch": 0.42831103591425235, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 49111 + }, + { + "epoch": 0.4283197571994209, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 49112 + }, + { + "epoch": 0.4283284784845895, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 49113 + }, + { + "epoch": 0.4283371997697581, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9791, + "step": 49114 + }, + { + "epoch": 0.42834592105492664, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9965, + "step": 49115 + }, + { + "epoch": 0.42835464234009524, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 49116 + }, + { + "epoch": 0.42836336362526384, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 49117 + }, + { + "epoch": 0.4283720849104324, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0346, + "step": 49118 + }, + { + "epoch": 0.428380806195601, + "grad_norm": 0.1904296875, + "learning_rate": 0.0005, + "loss": 1.0297, + "step": 49119 + }, + { + "epoch": 0.4283895274807696, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0058, + "step": 49120 + }, + { + "epoch": 0.4283982487659381, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 49121 + }, + { + "epoch": 0.42840697005110673, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9852, + "step": 49122 + }, + { + "epoch": 0.42841569133627533, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 49123 + }, + { + "epoch": 0.4284244126214439, + "grad_norm": 0.259765625, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 49124 + }, + { + "epoch": 0.4284331339066125, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 49125 + }, + { + "epoch": 0.4284418551917811, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 49126 + }, + { + "epoch": 0.4284505764769496, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 49127 + }, + { + "epoch": 0.4284592977621182, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 49128 + }, + { + "epoch": 0.4284680190472868, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 49129 + }, + { + "epoch": 0.42847674033245536, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 49130 + }, + { + "epoch": 0.42848546161762396, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 49131 + }, + { + "epoch": 0.42849418290279256, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9888, + "step": 49132 + }, + { + "epoch": 0.42850290418796116, + "grad_norm": 0.12353515625, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 49133 + }, + { + "epoch": 0.4285116254731297, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 49134 + }, + { + "epoch": 0.4285203467582983, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 49135 + }, + { + "epoch": 0.4285290680434669, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.9816, + "step": 49136 + }, + { + "epoch": 0.42853778932863545, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.029, + "step": 49137 + }, + { + "epoch": 0.42854651061380405, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 49138 + }, + { + "epoch": 0.42855523189897266, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 49139 + }, + { + "epoch": 0.4285639531841412, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 49140 + }, + { + "epoch": 0.4285726744693098, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 49141 + }, + { + "epoch": 0.4285813957544784, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 49142 + }, + { + "epoch": 0.42859011703964695, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 49143 + }, + { + "epoch": 0.42859883832481555, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 49144 + }, + { + "epoch": 0.42860755960998415, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0146, + "step": 49145 + }, + { + "epoch": 0.4286162808951527, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0192, + "step": 49146 + }, + { + "epoch": 0.4286250021803213, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 49147 + }, + { + "epoch": 0.4286337234654899, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 49148 + }, + { + "epoch": 0.42864244475065844, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 49149 + }, + { + "epoch": 0.42865116603582704, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 49150 + }, + { + "epoch": 0.42865988732099564, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 49151 + }, + { + "epoch": 0.4286686086061642, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 49152 + }, + { + "epoch": 0.4286773298913328, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.026, + "step": 49153 + }, + { + "epoch": 0.4286860511765014, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0079, + "step": 49154 + }, + { + "epoch": 0.4286947724616699, + "grad_norm": 0.16015625, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 49155 + }, + { + "epoch": 0.4287034937468385, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 49156 + }, + { + "epoch": 0.4287122150320071, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 49157 + }, + { + "epoch": 0.42872093631717567, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 49158 + }, + { + "epoch": 0.42872965760234427, + "grad_norm": 0.2236328125, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 49159 + }, + { + "epoch": 0.42873837888751287, + "grad_norm": 0.306640625, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 49160 + }, + { + "epoch": 0.4287471001726815, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 49161 + }, + { + "epoch": 0.42875582145785, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 49162 + }, + { + "epoch": 0.4287645427430186, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 49163 + }, + { + "epoch": 0.4287732640281872, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 49164 + }, + { + "epoch": 0.42878198531335576, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0291, + "step": 49165 + }, + { + "epoch": 0.42879070659852436, + "grad_norm": 0.1982421875, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 49166 + }, + { + "epoch": 0.42879942788369296, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 49167 + }, + { + "epoch": 0.4288081491688615, + "grad_norm": 0.1806640625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 49168 + }, + { + "epoch": 0.4288168704540301, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9936, + "step": 49169 + }, + { + "epoch": 0.4288255917391987, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 49170 + }, + { + "epoch": 0.42883431302436725, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 49171 + }, + { + "epoch": 0.42884303430953585, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 49172 + }, + { + "epoch": 0.42885175559470445, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 49173 + }, + { + "epoch": 0.428860476879873, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9786, + "step": 49174 + }, + { + "epoch": 0.4288691981650416, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 49175 + }, + { + "epoch": 0.4288779194502102, + "grad_norm": 0.07275390625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 49176 + }, + { + "epoch": 0.42888664073537874, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 49177 + }, + { + "epoch": 0.42889536202054734, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 49178 + }, + { + "epoch": 0.42890408330571594, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.991, + "step": 49179 + }, + { + "epoch": 0.4289128045908845, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 49180 + }, + { + "epoch": 0.4289215258760531, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0156, + "step": 49181 + }, + { + "epoch": 0.4289302471612217, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 49182 + }, + { + "epoch": 0.42893896844639023, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 49183 + }, + { + "epoch": 0.42894768973155883, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 49184 + }, + { + "epoch": 0.42895641101672743, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 49185 + }, + { + "epoch": 0.42896513230189603, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 49186 + }, + { + "epoch": 0.4289738535870646, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 49187 + }, + { + "epoch": 0.4289825748722332, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 49188 + }, + { + "epoch": 0.4289912961574018, + "grad_norm": 0.181640625, + "learning_rate": 0.0005, + "loss": 1.0134, + "step": 49189 + }, + { + "epoch": 0.4290000174425703, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 49190 + }, + { + "epoch": 0.4290087387277389, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 49191 + }, + { + "epoch": 0.4290174600129075, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 49192 + }, + { + "epoch": 0.42902618129807607, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 49193 + }, + { + "epoch": 0.42903490258324467, + "grad_norm": 0.1796875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 49194 + }, + { + "epoch": 0.42904362386841327, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 49195 + }, + { + "epoch": 0.4290523451535818, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9981, + "step": 49196 + }, + { + "epoch": 0.4290610664387504, + "grad_norm": 0.166015625, + "learning_rate": 0.0005, + "loss": 0.9982, + "step": 49197 + }, + { + "epoch": 0.429069787723919, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 49198 + }, + { + "epoch": 0.42907850900908756, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 49199 + }, + { + "epoch": 0.42908723029425616, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0244, + "step": 49200 + }, + { + "epoch": 0.42909595157942476, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 49201 + }, + { + "epoch": 0.4291046728645933, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 49202 + }, + { + "epoch": 0.4291133941497619, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 0.9937, + "step": 49203 + }, + { + "epoch": 0.4291221154349305, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 49204 + }, + { + "epoch": 0.42913083672009905, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 49205 + }, + { + "epoch": 0.42913955800526765, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 49206 + }, + { + "epoch": 0.42914827929043625, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 49207 + }, + { + "epoch": 0.4291570005756048, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0029, + "step": 49208 + }, + { + "epoch": 0.4291657218607734, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 49209 + }, + { + "epoch": 0.429174443145942, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 49210 + }, + { + "epoch": 0.42918316443111054, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 49211 + }, + { + "epoch": 0.42919188571627914, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 49212 + }, + { + "epoch": 0.42920060700144774, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 49213 + }, + { + "epoch": 0.42920932828661634, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 49214 + }, + { + "epoch": 0.4292180495717849, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 49215 + }, + { + "epoch": 0.4292267708569535, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 49216 + }, + { + "epoch": 0.4292354921421221, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.9992, + "step": 49217 + }, + { + "epoch": 0.42924421342729063, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 49218 + }, + { + "epoch": 0.42925293471245923, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0165, + "step": 49219 + }, + { + "epoch": 0.42926165599762783, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 49220 + }, + { + "epoch": 0.4292703772827964, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9926, + "step": 49221 + }, + { + "epoch": 0.429279098567965, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 49222 + }, + { + "epoch": 0.4292878198531336, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 49223 + }, + { + "epoch": 0.4292965411383021, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9899, + "step": 49224 + }, + { + "epoch": 0.4293052624234707, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 49225 + }, + { + "epoch": 0.4293139837086393, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 49226 + }, + { + "epoch": 0.42932270499380787, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 49227 + }, + { + "epoch": 0.42933142627897647, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 0.9897, + "step": 49228 + }, + { + "epoch": 0.42934014756414507, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 49229 + }, + { + "epoch": 0.4293488688493136, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 49230 + }, + { + "epoch": 0.4293575901344822, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 49231 + }, + { + "epoch": 0.4293663114196508, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 49232 + }, + { + "epoch": 0.42937503270481936, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 49233 + }, + { + "epoch": 0.42938375398998796, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 49234 + }, + { + "epoch": 0.42939247527515656, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 49235 + }, + { + "epoch": 0.4294011965603251, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 49236 + }, + { + "epoch": 0.4294099178454937, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 49237 + }, + { + "epoch": 0.4294186391306623, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005, + "loss": 0.9966, + "step": 49238 + }, + { + "epoch": 0.42942736041583085, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.985, + "step": 49239 + }, + { + "epoch": 0.42943608170099945, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 49240 + }, + { + "epoch": 0.42944480298616805, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 49241 + }, + { + "epoch": 0.42945352427133665, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 49242 + }, + { + "epoch": 0.4294622455565052, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0179, + "step": 49243 + }, + { + "epoch": 0.4294709668416738, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 49244 + }, + { + "epoch": 0.4294796881268424, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 49245 + }, + { + "epoch": 0.42948840941201094, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 49246 + }, + { + "epoch": 0.42949713069717954, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 49247 + }, + { + "epoch": 0.42950585198234814, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 49248 + }, + { + "epoch": 0.4295145732675167, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.99, + "step": 49249 + }, + { + "epoch": 0.4295232945526853, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 0.9861, + "step": 49250 + }, + { + "epoch": 0.4295320158378539, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 49251 + }, + { + "epoch": 0.42954073712302243, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 49252 + }, + { + "epoch": 0.42954945840819103, + "grad_norm": 0.2470703125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 49253 + }, + { + "epoch": 0.42955817969335963, + "grad_norm": 0.1630859375, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 49254 + }, + { + "epoch": 0.4295669009785282, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 49255 + }, + { + "epoch": 0.4295756222636968, + "grad_norm": 0.287109375, + "learning_rate": 0.0005, + "loss": 1.0292, + "step": 49256 + }, + { + "epoch": 0.4295843435488654, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 49257 + }, + { + "epoch": 0.4295930648340339, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 49258 + }, + { + "epoch": 0.4296017861192025, + "grad_norm": 0.2314453125, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 49259 + }, + { + "epoch": 0.4296105074043711, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 49260 + }, + { + "epoch": 0.42961922868953967, + "grad_norm": 0.2490234375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 49261 + }, + { + "epoch": 0.42962794997470827, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 49262 + }, + { + "epoch": 0.42963667125987687, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 49263 + }, + { + "epoch": 0.4296453925450454, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 49264 + }, + { + "epoch": 0.429654113830214, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 49265 + }, + { + "epoch": 0.4296628351153826, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 49266 + }, + { + "epoch": 0.42967155640055116, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 49267 + }, + { + "epoch": 0.42968027768571976, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 49268 + }, + { + "epoch": 0.42968899897088836, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 49269 + }, + { + "epoch": 0.42969772025605696, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 49270 + }, + { + "epoch": 0.4297064415412255, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 49271 + }, + { + "epoch": 0.4297151628263941, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 49272 + }, + { + "epoch": 0.4297238841115627, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 49273 + }, + { + "epoch": 0.42973260539673125, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0073, + "step": 49274 + }, + { + "epoch": 0.42974132668189985, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 49275 + }, + { + "epoch": 0.42975004796706845, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 49276 + }, + { + "epoch": 0.429758769252237, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 49277 + }, + { + "epoch": 0.4297674905374056, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 49278 + }, + { + "epoch": 0.4297762118225742, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 49279 + }, + { + "epoch": 0.42978493310774274, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 49280 + }, + { + "epoch": 0.42979365439291134, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 49281 + }, + { + "epoch": 0.42980237567807994, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 49282 + }, + { + "epoch": 0.4298110969632485, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0164, + "step": 49283 + }, + { + "epoch": 0.4298198182484171, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 49284 + }, + { + "epoch": 0.4298285395335857, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 49285 + }, + { + "epoch": 0.42983726081875423, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0191, + "step": 49286 + }, + { + "epoch": 0.42984598210392283, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 49287 + }, + { + "epoch": 0.42985470338909143, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 49288 + }, + { + "epoch": 0.42986342467426, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 49289 + }, + { + "epoch": 0.4298721459594286, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 49290 + }, + { + "epoch": 0.4298808672445972, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 49291 + }, + { + "epoch": 0.4298895885297657, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0144, + "step": 49292 + }, + { + "epoch": 0.4298983098149343, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 49293 + }, + { + "epoch": 0.4299070311001029, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 49294 + }, + { + "epoch": 0.4299157523852715, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0168, + "step": 49295 + }, + { + "epoch": 0.42992447367044007, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 49296 + }, + { + "epoch": 0.42993319495560867, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 49297 + }, + { + "epoch": 0.42994191624077727, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0103, + "step": 49298 + }, + { + "epoch": 0.4299506375259458, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 49299 + }, + { + "epoch": 0.4299593588111144, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 49300 + }, + { + "epoch": 0.429968080096283, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 49301 + }, + { + "epoch": 0.42997680138145156, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0353, + "step": 49302 + }, + { + "epoch": 0.42998552266662016, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 49303 + }, + { + "epoch": 0.42999424395178876, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 49304 + }, + { + "epoch": 0.4300029652369573, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 49305 + }, + { + "epoch": 0.4300116865221259, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 49306 + }, + { + "epoch": 0.4300204078072945, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 49307 + }, + { + "epoch": 0.43002912909246305, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 49308 + }, + { + "epoch": 0.43003785037763165, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 49309 + }, + { + "epoch": 0.43004657166280025, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0246, + "step": 49310 + }, + { + "epoch": 0.4300552929479688, + "grad_norm": 0.07177734375, + "learning_rate": 0.0005, + "loss": 0.9806, + "step": 49311 + }, + { + "epoch": 0.4300640142331374, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 49312 + }, + { + "epoch": 0.430072735518306, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 49313 + }, + { + "epoch": 0.43008145680347454, + "grad_norm": 0.162109375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 49314 + }, + { + "epoch": 0.43009017808864314, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 49315 + }, + { + "epoch": 0.43009889937381174, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 49316 + }, + { + "epoch": 0.4301076206589803, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9939, + "step": 49317 + }, + { + "epoch": 0.4301163419441489, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 49318 + }, + { + "epoch": 0.4301250632293175, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 49319 + }, + { + "epoch": 0.43013378451448603, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 0.993, + "step": 49320 + }, + { + "epoch": 0.43014250579965463, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 49321 + }, + { + "epoch": 0.43015122708482323, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 49322 + }, + { + "epoch": 0.43015994836999183, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.013, + "step": 49323 + }, + { + "epoch": 0.4301686696551604, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 49324 + }, + { + "epoch": 0.430177390940329, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0125, + "step": 49325 + }, + { + "epoch": 0.4301861122254976, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 49326 + }, + { + "epoch": 0.4301948335106661, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 49327 + }, + { + "epoch": 0.4302035547958347, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 49328 + }, + { + "epoch": 0.4302122760810033, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 49329 + }, + { + "epoch": 0.43022099736617186, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0247, + "step": 49330 + }, + { + "epoch": 0.43022971865134046, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9978, + "step": 49331 + }, + { + "epoch": 0.43023843993650907, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 49332 + }, + { + "epoch": 0.4302471612216776, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 49333 + }, + { + "epoch": 0.4302558825068462, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 49334 + }, + { + "epoch": 0.4302646037920148, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 49335 + }, + { + "epoch": 0.43027332507718336, + "grad_norm": 0.3671875, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 49336 + }, + { + "epoch": 0.43028204636235196, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 49337 + }, + { + "epoch": 0.43029076764752056, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 49338 + }, + { + "epoch": 0.4302994889326891, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 49339 + }, + { + "epoch": 0.4303082102178577, + "grad_norm": 0.26171875, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 49340 + }, + { + "epoch": 0.4303169315030263, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 49341 + }, + { + "epoch": 0.43032565278819485, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 49342 + }, + { + "epoch": 0.43033437407336345, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 49343 + }, + { + "epoch": 0.43034309535853205, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 49344 + }, + { + "epoch": 0.4303518166437006, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 49345 + }, + { + "epoch": 0.4303605379288692, + "grad_norm": 0.22265625, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 49346 + }, + { + "epoch": 0.4303692592140378, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0055, + "step": 49347 + }, + { + "epoch": 0.43037798049920634, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 49348 + }, + { + "epoch": 0.43038670178437494, + "grad_norm": 0.234375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 49349 + }, + { + "epoch": 0.43039542306954354, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 49350 + }, + { + "epoch": 0.43040414435471214, + "grad_norm": 0.255859375, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 49351 + }, + { + "epoch": 0.4304128656398807, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0256, + "step": 49352 + }, + { + "epoch": 0.4304215869250493, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 49353 + }, + { + "epoch": 0.4304303082102179, + "grad_norm": 0.21875, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 49354 + }, + { + "epoch": 0.4304390294953864, + "grad_norm": 0.1953125, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 49355 + }, + { + "epoch": 0.430447750780555, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 49356 + }, + { + "epoch": 0.4304564720657236, + "grad_norm": 0.2421875, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 49357 + }, + { + "epoch": 0.4304651933508922, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 49358 + }, + { + "epoch": 0.4304739146360608, + "grad_norm": 0.19140625, + "learning_rate": 0.0005, + "loss": 1.012, + "step": 49359 + }, + { + "epoch": 0.4304826359212294, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 49360 + }, + { + "epoch": 0.4304913572063979, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 49361 + }, + { + "epoch": 0.4305000784915665, + "grad_norm": 0.23046875, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 49362 + }, + { + "epoch": 0.4305087997767351, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 49363 + }, + { + "epoch": 0.43051752106190366, + "grad_norm": 0.16796875, + "learning_rate": 0.0005, + "loss": 0.9858, + "step": 49364 + }, + { + "epoch": 0.43052624234707226, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 49365 + }, + { + "epoch": 0.43053496363224086, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0114, + "step": 49366 + }, + { + "epoch": 0.4305436849174094, + "grad_norm": 0.1962890625, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 49367 + }, + { + "epoch": 0.430552406202578, + "grad_norm": 0.251953125, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 49368 + }, + { + "epoch": 0.4305611274877466, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 49369 + }, + { + "epoch": 0.43056984877291515, + "grad_norm": 0.1923828125, + "learning_rate": 0.0005, + "loss": 1.0224, + "step": 49370 + }, + { + "epoch": 0.43057857005808375, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 49371 + }, + { + "epoch": 0.43058729134325235, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 49372 + }, + { + "epoch": 0.4305960126284209, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 49373 + }, + { + "epoch": 0.4306047339135895, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 49374 + }, + { + "epoch": 0.4306134551987581, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 49375 + }, + { + "epoch": 0.43062217648392664, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 49376 + }, + { + "epoch": 0.43063089776909524, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0089, + "step": 49377 + }, + { + "epoch": 0.43063961905426384, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 49378 + }, + { + "epoch": 0.43064834033943244, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 0.9781, + "step": 49379 + }, + { + "epoch": 0.430657061624601, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 0.9765, + "step": 49380 + }, + { + "epoch": 0.4306657829097696, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 49381 + }, + { + "epoch": 0.4306745041949382, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 49382 + }, + { + "epoch": 0.43068322548010674, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 49383 + }, + { + "epoch": 0.43069194676527534, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 49384 + }, + { + "epoch": 0.43070066805044394, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 49385 + }, + { + "epoch": 0.4307093893356125, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 49386 + }, + { + "epoch": 0.4307181106207811, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0048, + "step": 49387 + }, + { + "epoch": 0.4307268319059497, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 49388 + }, + { + "epoch": 0.4307355531911182, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 49389 + }, + { + "epoch": 0.4307442744762868, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 1.032, + "step": 49390 + }, + { + "epoch": 0.4307529957614554, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0294, + "step": 49391 + }, + { + "epoch": 0.43076171704662397, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0193, + "step": 49392 + }, + { + "epoch": 0.43077043833179257, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 49393 + }, + { + "epoch": 0.43077915961696117, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0092, + "step": 49394 + }, + { + "epoch": 0.4307878809021297, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 49395 + }, + { + "epoch": 0.4307966021872983, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 49396 + }, + { + "epoch": 0.4308053234724669, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 49397 + }, + { + "epoch": 0.43081404475763546, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 49398 + }, + { + "epoch": 0.43082276604280406, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0153, + "step": 49399 + }, + { + "epoch": 0.43083148732797266, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0238, + "step": 49400 + }, + { + "epoch": 0.4308402086131412, + "grad_norm": 0.07421875, + "learning_rate": 0.0005, + "loss": 1.0187, + "step": 49401 + }, + { + "epoch": 0.4308489298983098, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9881, + "step": 49402 + }, + { + "epoch": 0.4308576511834784, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 49403 + }, + { + "epoch": 0.430866372468647, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0061, + "step": 49404 + }, + { + "epoch": 0.43087509375381555, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 49405 + }, + { + "epoch": 0.43088381503898415, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9766, + "step": 49406 + }, + { + "epoch": 0.43089253632415275, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 49407 + }, + { + "epoch": 0.4309012576093213, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 0.9855, + "step": 49408 + }, + { + "epoch": 0.4309099788944899, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9955, + "step": 49409 + }, + { + "epoch": 0.4309187001796585, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 49410 + }, + { + "epoch": 0.43092742146482704, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 49411 + }, + { + "epoch": 0.43093614274999564, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0053, + "step": 49412 + }, + { + "epoch": 0.43094486403516424, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 49413 + }, + { + "epoch": 0.4309535853203328, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 49414 + }, + { + "epoch": 0.4309623066055014, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 49415 + }, + { + "epoch": 0.43097102789067, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 49416 + }, + { + "epoch": 0.43097974917583853, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 0.9945, + "step": 49417 + }, + { + "epoch": 0.43098847046100713, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 49418 + }, + { + "epoch": 0.43099719174617573, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 49419 + }, + { + "epoch": 0.4310059130313443, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 49420 + }, + { + "epoch": 0.4310146343165129, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 49421 + }, + { + "epoch": 0.4310233556016815, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 49422 + }, + { + "epoch": 0.43103207688685, + "grad_norm": 0.154296875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 49423 + }, + { + "epoch": 0.4310407981720186, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 0.9924, + "step": 49424 + }, + { + "epoch": 0.4310495194571872, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 49425 + }, + { + "epoch": 0.43105824074235577, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 49426 + }, + { + "epoch": 0.43106696202752437, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 49427 + }, + { + "epoch": 0.43107568331269297, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0277, + "step": 49428 + }, + { + "epoch": 0.4310844045978615, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 49429 + }, + { + "epoch": 0.4310931258830301, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 49430 + }, + { + "epoch": 0.4311018471681987, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 49431 + }, + { + "epoch": 0.4311105684533673, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 49432 + }, + { + "epoch": 0.43111928973853586, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 49433 + }, + { + "epoch": 0.43112801102370446, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0278, + "step": 49434 + }, + { + "epoch": 0.43113673230887306, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 49435 + }, + { + "epoch": 0.4311454535940416, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 49436 + }, + { + "epoch": 0.4311541748792102, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0195, + "step": 49437 + }, + { + "epoch": 0.4311628961643788, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 49438 + }, + { + "epoch": 0.43117161744954735, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9823, + "step": 49439 + }, + { + "epoch": 0.43118033873471595, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 49440 + }, + { + "epoch": 0.43118906001988455, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0034, + "step": 49441 + }, + { + "epoch": 0.4311977813050531, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 49442 + }, + { + "epoch": 0.4312065025902217, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 49443 + }, + { + "epoch": 0.4312152238753903, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 49444 + }, + { + "epoch": 0.43122394516055884, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 49445 + }, + { + "epoch": 0.43123266644572744, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0049, + "step": 49446 + }, + { + "epoch": 0.43124138773089604, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 49447 + }, + { + "epoch": 0.4312501090160646, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 49448 + }, + { + "epoch": 0.4312588303012332, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 49449 + }, + { + "epoch": 0.4312675515864018, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0148, + "step": 49450 + }, + { + "epoch": 0.43127627287157033, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 49451 + }, + { + "epoch": 0.43128499415673893, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.9748, + "step": 49452 + }, + { + "epoch": 0.43129371544190753, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 49453 + }, + { + "epoch": 0.4313024367270761, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 49454 + }, + { + "epoch": 0.4313111580122447, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 49455 + }, + { + "epoch": 0.4313198792974133, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 49456 + }, + { + "epoch": 0.4313286005825818, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 49457 + }, + { + "epoch": 0.4313373218677504, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9938, + "step": 49458 + }, + { + "epoch": 0.431346043152919, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9804, + "step": 49459 + }, + { + "epoch": 0.4313547644380876, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 49460 + }, + { + "epoch": 0.43136348572325617, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0202, + "step": 49461 + }, + { + "epoch": 0.43137220700842477, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0108, + "step": 49462 + }, + { + "epoch": 0.43138092829359337, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9852, + "step": 49463 + }, + { + "epoch": 0.4313896495787619, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 49464 + }, + { + "epoch": 0.4313983708639305, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 49465 + }, + { + "epoch": 0.4314070921490991, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 49466 + }, + { + "epoch": 0.43141581343426766, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 49467 + }, + { + "epoch": 0.43142453471943626, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 49468 + }, + { + "epoch": 0.43143325600460486, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 49469 + }, + { + "epoch": 0.4314419772897734, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 49470 + }, + { + "epoch": 0.431450698574942, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 49471 + }, + { + "epoch": 0.4314594198601106, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 49472 + }, + { + "epoch": 0.43146814114527915, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 49473 + }, + { + "epoch": 0.43147686243044775, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0204, + "step": 49474 + }, + { + "epoch": 0.43148558371561635, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 49475 + }, + { + "epoch": 0.4314943050007849, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 0.9894, + "step": 49476 + }, + { + "epoch": 0.4315030262859535, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 49477 + }, + { + "epoch": 0.4315117475711221, + "grad_norm": 0.2333984375, + "learning_rate": 0.0005, + "loss": 1.0128, + "step": 49478 + }, + { + "epoch": 0.43152046885629064, + "grad_norm": 0.2021484375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 49479 + }, + { + "epoch": 0.43152919014145924, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 49480 + }, + { + "epoch": 0.43153791142662784, + "grad_norm": 0.20703125, + "learning_rate": 0.0005, + "loss": 1.0158, + "step": 49481 + }, + { + "epoch": 0.4315466327117964, + "grad_norm": 0.1591796875, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 49482 + }, + { + "epoch": 0.431555353996965, + "grad_norm": 0.12451171875, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 49483 + }, + { + "epoch": 0.4315640752821336, + "grad_norm": 0.224609375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 49484 + }, + { + "epoch": 0.43157279656730213, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 1.0189, + "step": 49485 + }, + { + "epoch": 0.43158151785247073, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 49486 + }, + { + "epoch": 0.43159023913763933, + "grad_norm": 0.185546875, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 49487 + }, + { + "epoch": 0.43159896042280793, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 49488 + }, + { + "epoch": 0.4316076817079765, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 49489 + }, + { + "epoch": 0.4316164029931451, + "grad_norm": 0.201171875, + "learning_rate": 0.0005, + "loss": 1.0107, + "step": 49490 + }, + { + "epoch": 0.4316251242783137, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 49491 + }, + { + "epoch": 0.4316338455634822, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 0.9931, + "step": 49492 + }, + { + "epoch": 0.4316425668486508, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 49493 + }, + { + "epoch": 0.4316512881338194, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 49494 + }, + { + "epoch": 0.43166000941898797, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.018, + "step": 49495 + }, + { + "epoch": 0.43166873070415657, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 49496 + }, + { + "epoch": 0.43167745198932517, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.005, + "step": 49497 + }, + { + "epoch": 0.4316861732744937, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 49498 + }, + { + "epoch": 0.4316948945596623, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 49499 + }, + { + "epoch": 0.4317036158448309, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 1.0138, + "step": 49500 + }, + { + "epoch": 0.43171233712999946, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0018, + "step": 49501 + }, + { + "epoch": 0.43172105841516806, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0209, + "step": 49502 + }, + { + "epoch": 0.43172977970033666, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9916, + "step": 49503 + }, + { + "epoch": 0.4317385009855052, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 49504 + }, + { + "epoch": 0.4317472222706738, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 0.9913, + "step": 49505 + }, + { + "epoch": 0.4317559435558424, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 49506 + }, + { + "epoch": 0.43176466484101095, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 1.0109, + "step": 49507 + }, + { + "epoch": 0.43177338612617955, + "grad_norm": 0.10986328125, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 49508 + }, + { + "epoch": 0.43178210741134815, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 49509 + }, + { + "epoch": 0.4317908286965167, + "grad_norm": 0.11083984375, + "learning_rate": 0.0005, + "loss": 0.9781, + "step": 49510 + }, + { + "epoch": 0.4317995499816853, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 49511 + }, + { + "epoch": 0.4318082712668539, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 49512 + }, + { + "epoch": 0.43181699255202244, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0, + "step": 49513 + }, + { + "epoch": 0.43182571383719104, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0106, + "step": 49514 + }, + { + "epoch": 0.43183443512235964, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 49515 + }, + { + "epoch": 0.43184315640752824, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 49516 + }, + { + "epoch": 0.4318518776926968, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 49517 + }, + { + "epoch": 0.4318605989778654, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 49518 + }, + { + "epoch": 0.431869320263034, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9889, + "step": 49519 + }, + { + "epoch": 0.43187804154820253, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 49520 + }, + { + "epoch": 0.43188676283337113, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 49521 + }, + { + "epoch": 0.43189548411853973, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 49522 + }, + { + "epoch": 0.4319042054037083, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9956, + "step": 49523 + }, + { + "epoch": 0.4319129266888769, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0155, + "step": 49524 + }, + { + "epoch": 0.4319216479740455, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 49525 + }, + { + "epoch": 0.431930369259214, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0253, + "step": 49526 + }, + { + "epoch": 0.4319390905443826, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 49527 + }, + { + "epoch": 0.4319478118295512, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9733, + "step": 49528 + }, + { + "epoch": 0.43195653311471977, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9825, + "step": 49529 + }, + { + "epoch": 0.43196525439988837, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9988, + "step": 49530 + }, + { + "epoch": 0.43197397568505697, + "grad_norm": 0.080078125, + "learning_rate": 0.0005, + "loss": 1.0171, + "step": 49531 + }, + { + "epoch": 0.4319826969702255, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 49532 + }, + { + "epoch": 0.4319914182553941, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0123, + "step": 49533 + }, + { + "epoch": 0.4320001395405627, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 49534 + }, + { + "epoch": 0.43200886082573126, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 49535 + }, + { + "epoch": 0.43201758211089986, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.024, + "step": 49536 + }, + { + "epoch": 0.43202630339606846, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0002, + "step": 49537 + }, + { + "epoch": 0.432035024681237, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 49538 + }, + { + "epoch": 0.4320437459664056, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 49539 + }, + { + "epoch": 0.4320524672515742, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0017, + "step": 49540 + }, + { + "epoch": 0.4320611885367428, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 49541 + }, + { + "epoch": 0.43206990982191135, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 49542 + }, + { + "epoch": 0.43207863110707995, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 49543 + }, + { + "epoch": 0.43208735239224855, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 49544 + }, + { + "epoch": 0.4320960736774171, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 49545 + }, + { + "epoch": 0.4321047949625857, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0078, + "step": 49546 + }, + { + "epoch": 0.4321135162477543, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 49547 + }, + { + "epoch": 0.43212223753292284, + "grad_norm": 0.119140625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 49548 + }, + { + "epoch": 0.43213095881809144, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 49549 + }, + { + "epoch": 0.43213968010326004, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 1.0218, + "step": 49550 + }, + { + "epoch": 0.4321484013884286, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 49551 + }, + { + "epoch": 0.4321571226735972, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 49552 + }, + { + "epoch": 0.4321658439587658, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9838, + "step": 49553 + }, + { + "epoch": 0.4321745652439343, + "grad_norm": 0.11474609375, + "learning_rate": 0.0005, + "loss": 0.9872, + "step": 49554 + }, + { + "epoch": 0.43218328652910293, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 49555 + }, + { + "epoch": 0.43219200781427153, + "grad_norm": 0.1455078125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 49556 + }, + { + "epoch": 0.4322007290994401, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 49557 + }, + { + "epoch": 0.4322094503846087, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0169, + "step": 49558 + }, + { + "epoch": 0.4322181716697773, + "grad_norm": 0.1708984375, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 49559 + }, + { + "epoch": 0.4322268929549458, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0095, + "step": 49560 + }, + { + "epoch": 0.4322356142401144, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 49561 + }, + { + "epoch": 0.432244335525283, + "grad_norm": 0.2216796875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 49562 + }, + { + "epoch": 0.43225305681045156, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 49563 + }, + { + "epoch": 0.43226177809562016, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9996, + "step": 49564 + }, + { + "epoch": 0.43227049938078876, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9977, + "step": 49565 + }, + { + "epoch": 0.4322792206659573, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 49566 + }, + { + "epoch": 0.4322879419511259, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 0.9818, + "step": 49567 + }, + { + "epoch": 0.4322966632362945, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 49568 + }, + { + "epoch": 0.4323053845214631, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 49569 + }, + { + "epoch": 0.43231410580663165, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 49570 + }, + { + "epoch": 0.43232282709180025, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9909, + "step": 49571 + }, + { + "epoch": 0.43233154837696885, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 49572 + }, + { + "epoch": 0.4323402696621374, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 0.9972, + "step": 49573 + }, + { + "epoch": 0.432348990947306, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 49574 + }, + { + "epoch": 0.4323577122324746, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9918, + "step": 49575 + }, + { + "epoch": 0.43236643351764315, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 49576 + }, + { + "epoch": 0.43237515480281175, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9903, + "step": 49577 + }, + { + "epoch": 0.43238387608798035, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 49578 + }, + { + "epoch": 0.4323925973731489, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 49579 + }, + { + "epoch": 0.4324013186583175, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 49580 + }, + { + "epoch": 0.4324100399434861, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0184, + "step": 49581 + }, + { + "epoch": 0.43241876122865464, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 49582 + }, + { + "epoch": 0.43242748251382324, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 49583 + }, + { + "epoch": 0.43243620379899184, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 49584 + }, + { + "epoch": 0.4324449250841604, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.9947, + "step": 49585 + }, + { + "epoch": 0.432453646369329, + "grad_norm": 0.10400390625, + "learning_rate": 0.0005, + "loss": 1.0068, + "step": 49586 + }, + { + "epoch": 0.4324623676544976, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 49587 + }, + { + "epoch": 0.4324710889396661, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0251, + "step": 49588 + }, + { + "epoch": 0.4324798102248347, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 49589 + }, + { + "epoch": 0.4324885315100033, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9933, + "step": 49590 + }, + { + "epoch": 0.43249725279517187, + "grad_norm": 0.1396484375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 49591 + }, + { + "epoch": 0.43250597408034047, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 1.0293, + "step": 49592 + }, + { + "epoch": 0.43251469536550907, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0024, + "step": 49593 + }, + { + "epoch": 0.4325234166506776, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 49594 + }, + { + "epoch": 0.4325321379358462, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 0.9995, + "step": 49595 + }, + { + "epoch": 0.4325408592210148, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 49596 + }, + { + "epoch": 0.4325495805061834, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 0.989, + "step": 49597 + }, + { + "epoch": 0.43255830179135196, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0065, + "step": 49598 + }, + { + "epoch": 0.43256702307652056, + "grad_norm": 0.1865234375, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 49599 + }, + { + "epoch": 0.43257574436168916, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 49600 + }, + { + "epoch": 0.4325844656468577, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 49601 + }, + { + "epoch": 0.4325931869320263, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 49602 + }, + { + "epoch": 0.4326019082171949, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 0.9906, + "step": 49603 + }, + { + "epoch": 0.43261062950236345, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 49604 + }, + { + "epoch": 0.43261935078753205, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 49605 + }, + { + "epoch": 0.43262807207270065, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 49606 + }, + { + "epoch": 0.4326367933578692, + "grad_norm": 0.189453125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 49607 + }, + { + "epoch": 0.4326455146430378, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 49608 + }, + { + "epoch": 0.4326542359282064, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 49609 + }, + { + "epoch": 0.43266295721337494, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 49610 + }, + { + "epoch": 0.43267167849854354, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0232, + "step": 49611 + }, + { + "epoch": 0.43268039978371214, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 49612 + }, + { + "epoch": 0.4326891210688807, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 49613 + }, + { + "epoch": 0.4326978423540493, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 49614 + }, + { + "epoch": 0.4327065636392179, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 49615 + }, + { + "epoch": 0.43271528492438643, + "grad_norm": 0.193359375, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 49616 + }, + { + "epoch": 0.43272400620955503, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 1.0025, + "step": 49617 + }, + { + "epoch": 0.43273272749472363, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 49618 + }, + { + "epoch": 0.4327414487798922, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 49619 + }, + { + "epoch": 0.4327501700650608, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9997, + "step": 49620 + }, + { + "epoch": 0.4327588913502294, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 49621 + }, + { + "epoch": 0.4327676126353979, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.03, + "step": 49622 + }, + { + "epoch": 0.4327763339205665, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 49623 + }, + { + "epoch": 0.4327850552057351, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 0.9934, + "step": 49624 + }, + { + "epoch": 0.4327937764909037, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.001, + "step": 49625 + }, + { + "epoch": 0.43280249777607227, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 49626 + }, + { + "epoch": 0.43281121906124087, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 49627 + }, + { + "epoch": 0.43281994034640947, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 49628 + }, + { + "epoch": 0.432828661631578, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9957, + "step": 49629 + }, + { + "epoch": 0.4328373829167466, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 49630 + }, + { + "epoch": 0.4328461042019152, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 49631 + }, + { + "epoch": 0.43285482548708376, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0221, + "step": 49632 + }, + { + "epoch": 0.43286354677225236, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 49633 + }, + { + "epoch": 0.43287226805742096, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 49634 + }, + { + "epoch": 0.4328809893425895, + "grad_norm": 0.158203125, + "learning_rate": 0.0005, + "loss": 1.0201, + "step": 49635 + }, + { + "epoch": 0.4328897106277581, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 49636 + }, + { + "epoch": 0.4328984319129267, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0229, + "step": 49637 + }, + { + "epoch": 0.43290715319809525, + "grad_norm": 0.220703125, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 49638 + }, + { + "epoch": 0.43291587448326385, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.01, + "step": 49639 + }, + { + "epoch": 0.43292459576843245, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0181, + "step": 49640 + }, + { + "epoch": 0.432933317053601, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 49641 + }, + { + "epoch": 0.4329420383387696, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9736, + "step": 49642 + }, + { + "epoch": 0.4329507596239382, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 0.9854, + "step": 49643 + }, + { + "epoch": 0.43295948090910674, + "grad_norm": 0.1533203125, + "learning_rate": 0.0005, + "loss": 1.0032, + "step": 49644 + }, + { + "epoch": 0.43296820219427534, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 49645 + }, + { + "epoch": 0.43297692347944394, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0143, + "step": 49646 + }, + { + "epoch": 0.4329856447646125, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 49647 + }, + { + "epoch": 0.4329943660497811, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 49648 + }, + { + "epoch": 0.4330030873349497, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 49649 + }, + { + "epoch": 0.4330118086201183, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.021, + "step": 49650 + }, + { + "epoch": 0.43302052990528683, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 49651 + }, + { + "epoch": 0.43302925119045543, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0222, + "step": 49652 + }, + { + "epoch": 0.43303797247562403, + "grad_norm": 0.197265625, + "learning_rate": 0.0005, + "loss": 0.9908, + "step": 49653 + }, + { + "epoch": 0.4330466937607926, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 49654 + }, + { + "epoch": 0.4330554150459612, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 49655 + }, + { + "epoch": 0.4330641363311298, + "grad_norm": 0.11376953125, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 49656 + }, + { + "epoch": 0.4330728576162983, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 49657 + }, + { + "epoch": 0.4330815789014669, + "grad_norm": 0.13671875, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 49658 + }, + { + "epoch": 0.4330903001866355, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 49659 + }, + { + "epoch": 0.43309902147180407, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0161, + "step": 49660 + }, + { + "epoch": 0.43310774275697267, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0262, + "step": 49661 + }, + { + "epoch": 0.43311646404214127, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 49662 + }, + { + "epoch": 0.4331251853273098, + "grad_norm": 0.1728515625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 49663 + }, + { + "epoch": 0.4331339066124784, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0074, + "step": 49664 + }, + { + "epoch": 0.433142627897647, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 0.994, + "step": 49665 + }, + { + "epoch": 0.43315134918281556, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 49666 + }, + { + "epoch": 0.43316007046798416, + "grad_norm": 0.1650390625, + "learning_rate": 0.0005, + "loss": 1.0149, + "step": 49667 + }, + { + "epoch": 0.43316879175315276, + "grad_norm": 0.10205078125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 49668 + }, + { + "epoch": 0.4331775130383213, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0206, + "step": 49669 + }, + { + "epoch": 0.4331862343234899, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 49670 + }, + { + "epoch": 0.4331949556086585, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0057, + "step": 49671 + }, + { + "epoch": 0.43320367689382705, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 49672 + }, + { + "epoch": 0.43321239817899565, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 49673 + }, + { + "epoch": 0.43322111946416425, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 49674 + }, + { + "epoch": 0.4332298407493328, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0014, + "step": 49675 + }, + { + "epoch": 0.4332385620345014, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 49676 + }, + { + "epoch": 0.43324728331967, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0234, + "step": 49677 + }, + { + "epoch": 0.4332560046048386, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 49678 + }, + { + "epoch": 0.43326472589000714, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 49679 + }, + { + "epoch": 0.43327344717517574, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9989, + "step": 49680 + }, + { + "epoch": 0.43328216846034434, + "grad_norm": 0.109375, + "learning_rate": 0.0005, + "loss": 1.014, + "step": 49681 + }, + { + "epoch": 0.4332908897455129, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0133, + "step": 49682 + }, + { + "epoch": 0.4332996110306815, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 49683 + }, + { + "epoch": 0.4333083323158501, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 49684 + }, + { + "epoch": 0.43331705360101863, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 49685 + }, + { + "epoch": 0.43332577488618723, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 49686 + }, + { + "epoch": 0.43333449617135583, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9751, + "step": 49687 + }, + { + "epoch": 0.4333432174565244, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9852, + "step": 49688 + }, + { + "epoch": 0.433351938741693, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 49689 + }, + { + "epoch": 0.4333606600268616, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0099, + "step": 49690 + }, + { + "epoch": 0.4333693813120301, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005, + "loss": 1.0141, + "step": 49691 + }, + { + "epoch": 0.4333781025971987, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0243, + "step": 49692 + }, + { + "epoch": 0.4333868238823673, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 49693 + }, + { + "epoch": 0.43339554516753587, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 49694 + }, + { + "epoch": 0.43340426645270447, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 49695 + }, + { + "epoch": 0.43341298773787307, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 0.9873, + "step": 49696 + }, + { + "epoch": 0.4334217090230416, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 49697 + }, + { + "epoch": 0.4334304303082102, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 49698 + }, + { + "epoch": 0.4334391515933788, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.003, + "step": 49699 + }, + { + "epoch": 0.43344787287854736, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 49700 + }, + { + "epoch": 0.43345659416371596, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 49701 + }, + { + "epoch": 0.43346531544888456, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 49702 + }, + { + "epoch": 0.4334740367340531, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 49703 + }, + { + "epoch": 0.4334827580192217, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 49704 + }, + { + "epoch": 0.4334914793043903, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 49705 + }, + { + "epoch": 0.4335002005895589, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 0.9883, + "step": 49706 + }, + { + "epoch": 0.43350892187472745, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 49707 + }, + { + "epoch": 0.43351764315989605, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0063, + "step": 49708 + }, + { + "epoch": 0.43352636444506465, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0225, + "step": 49709 + }, + { + "epoch": 0.4335350857302332, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.011, + "step": 49710 + }, + { + "epoch": 0.4335438070154018, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0088, + "step": 49711 + }, + { + "epoch": 0.4335525283005704, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 49712 + }, + { + "epoch": 0.43356124958573894, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0176, + "step": 49713 + }, + { + "epoch": 0.43356997087090754, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 49714 + }, + { + "epoch": 0.43357869215607614, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0013, + "step": 49715 + }, + { + "epoch": 0.4335874134412447, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0047, + "step": 49716 + }, + { + "epoch": 0.4335961347264133, + "grad_norm": 0.11328125, + "learning_rate": 0.0005, + "loss": 0.9806, + "step": 49717 + }, + { + "epoch": 0.4336048560115819, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9985, + "step": 49718 + }, + { + "epoch": 0.43361357729675043, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 49719 + }, + { + "epoch": 0.43362229858191903, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0152, + "step": 49720 + }, + { + "epoch": 0.43363101986708763, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9949, + "step": 49721 + }, + { + "epoch": 0.4336397411522562, + "grad_norm": 0.130859375, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 49722 + }, + { + "epoch": 0.4336484624374248, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 49723 + }, + { + "epoch": 0.4336571837225934, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 0.9929, + "step": 49724 + }, + { + "epoch": 0.4336659050077619, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 49725 + }, + { + "epoch": 0.4336746262929305, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 0.9975, + "step": 49726 + }, + { + "epoch": 0.4336833475780991, + "grad_norm": 0.1201171875, + "learning_rate": 0.0005, + "loss": 0.9944, + "step": 49727 + }, + { + "epoch": 0.43369206886326767, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 49728 + }, + { + "epoch": 0.43370079014843627, + "grad_norm": 0.11279296875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 49729 + }, + { + "epoch": 0.43370951143360487, + "grad_norm": 0.3046875, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 49730 + }, + { + "epoch": 0.4337182327187734, + "grad_norm": 0.24609375, + "learning_rate": 0.0005, + "loss": 1.0044, + "step": 49731 + }, + { + "epoch": 0.433726954003942, + "grad_norm": 0.267578125, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 49732 + }, + { + "epoch": 0.4337356752891106, + "grad_norm": 0.263671875, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 49733 + }, + { + "epoch": 0.4337443965742792, + "grad_norm": 0.18359375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 49734 + }, + { + "epoch": 0.43375311785944776, + "grad_norm": 0.205078125, + "learning_rate": 0.0005, + "loss": 1.0126, + "step": 49735 + }, + { + "epoch": 0.43376183914461636, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 49736 + }, + { + "epoch": 0.43377056042978496, + "grad_norm": 0.236328125, + "learning_rate": 0.0005, + "loss": 0.995, + "step": 49737 + }, + { + "epoch": 0.4337792817149535, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 49738 + }, + { + "epoch": 0.4337880030001221, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 49739 + }, + { + "epoch": 0.4337967242852907, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005, + "loss": 1.0072, + "step": 49740 + }, + { + "epoch": 0.43380544557045925, + "grad_norm": 0.1171875, + "learning_rate": 0.0005, + "loss": 1.0342, + "step": 49741 + }, + { + "epoch": 0.43381416685562785, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 49742 + }, + { + "epoch": 0.43382288814079645, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 49743 + }, + { + "epoch": 0.433831609425965, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0091, + "step": 49744 + }, + { + "epoch": 0.4338403307111336, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 49745 + }, + { + "epoch": 0.4338490519963022, + "grad_norm": 0.1083984375, + "learning_rate": 0.0005, + "loss": 1.004, + "step": 49746 + }, + { + "epoch": 0.43385777328147074, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 49747 + }, + { + "epoch": 0.43386649456663934, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 49748 + }, + { + "epoch": 0.43387521585180794, + "grad_norm": 0.1240234375, + "learning_rate": 0.0005, + "loss": 1.0012, + "step": 49749 + }, + { + "epoch": 0.4338839371369765, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0005, + "step": 49750 + }, + { + "epoch": 0.4338926584221451, + "grad_norm": 0.1552734375, + "learning_rate": 0.0005, + "loss": 1.0212, + "step": 49751 + }, + { + "epoch": 0.4339013797073137, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 49752 + }, + { + "epoch": 0.43391010099248223, + "grad_norm": 0.09375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 49753 + }, + { + "epoch": 0.43391882227765083, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0302, + "step": 49754 + }, + { + "epoch": 0.43392754356281943, + "grad_norm": 0.1357421875, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 49755 + }, + { + "epoch": 0.433936264847988, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 49756 + }, + { + "epoch": 0.4339449861331566, + "grad_norm": 0.1298828125, + "learning_rate": 0.0005, + "loss": 1.0036, + "step": 49757 + }, + { + "epoch": 0.4339537074183252, + "grad_norm": 0.1669921875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 49758 + }, + { + "epoch": 0.4339624287034938, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 49759 + }, + { + "epoch": 0.4339711499886623, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 49760 + }, + { + "epoch": 0.4339798712738309, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 49761 + }, + { + "epoch": 0.4339885925589995, + "grad_norm": 0.08544921875, + "learning_rate": 0.0005, + "loss": 0.997, + "step": 49762 + }, + { + "epoch": 0.43399731384416806, + "grad_norm": 0.11767578125, + "learning_rate": 0.0005, + "loss": 0.9867, + "step": 49763 + }, + { + "epoch": 0.43400603512933666, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 49764 + }, + { + "epoch": 0.43401475641450526, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0289, + "step": 49765 + }, + { + "epoch": 0.4340234776996738, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 49766 + }, + { + "epoch": 0.4340321989848424, + "grad_norm": 0.1123046875, + "learning_rate": 0.0005, + "loss": 0.9954, + "step": 49767 + }, + { + "epoch": 0.434040920270011, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0056, + "step": 49768 + }, + { + "epoch": 0.43404964155517956, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0159, + "step": 49769 + }, + { + "epoch": 0.43405836284034816, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 49770 + }, + { + "epoch": 0.43406708412551676, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 49771 + }, + { + "epoch": 0.4340758054106853, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 0.9808, + "step": 49772 + }, + { + "epoch": 0.4340845266958539, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 49773 + }, + { + "epoch": 0.4340932479810225, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0135, + "step": 49774 + }, + { + "epoch": 0.43410196926619105, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 49775 + }, + { + "epoch": 0.43411069055135965, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 1.0115, + "step": 49776 + }, + { + "epoch": 0.43411941183652825, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 49777 + }, + { + "epoch": 0.4341281331216968, + "grad_norm": 0.125, + "learning_rate": 0.0005, + "loss": 1.0215, + "step": 49778 + }, + { + "epoch": 0.4341368544068654, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9915, + "step": 49779 + }, + { + "epoch": 0.434145575692034, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 49780 + }, + { + "epoch": 0.43415429697720254, + "grad_norm": 0.126953125, + "learning_rate": 0.0005, + "loss": 0.9873, + "step": 49781 + }, + { + "epoch": 0.43416301826237114, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 49782 + }, + { + "epoch": 0.43417173954753974, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0154, + "step": 49783 + }, + { + "epoch": 0.4341804608327083, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 49784 + }, + { + "epoch": 0.4341891821178769, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 49785 + }, + { + "epoch": 0.4341979034030455, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0216, + "step": 49786 + }, + { + "epoch": 0.4342066246882141, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9998, + "step": 49787 + }, + { + "epoch": 0.4342153459733826, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0084, + "step": 49788 + }, + { + "epoch": 0.4342240672585512, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0151, + "step": 49789 + }, + { + "epoch": 0.4342327885437198, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0249, + "step": 49790 + }, + { + "epoch": 0.4342415098288884, + "grad_norm": 0.14453125, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 49791 + }, + { + "epoch": 0.434250231114057, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0213, + "step": 49792 + }, + { + "epoch": 0.4342589523992256, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0011, + "step": 49793 + }, + { + "epoch": 0.4342676736843941, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 1.0003, + "step": 49794 + }, + { + "epoch": 0.4342763949695627, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9917, + "step": 49795 + }, + { + "epoch": 0.4342851162547313, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 49796 + }, + { + "epoch": 0.43429383753989986, + "grad_norm": 0.177734375, + "learning_rate": 0.0005, + "loss": 0.9834, + "step": 49797 + }, + { + "epoch": 0.43430255882506846, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0037, + "step": 49798 + }, + { + "epoch": 0.43431128011023706, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 49799 + }, + { + "epoch": 0.4343200013954056, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9951, + "step": 49800 + }, + { + "epoch": 0.4343287226805742, + "grad_norm": 0.1416015625, + "learning_rate": 0.0005, + "loss": 1.0075, + "step": 49801 + }, + { + "epoch": 0.4343374439657428, + "grad_norm": 0.138671875, + "learning_rate": 0.0005, + "loss": 1.0027, + "step": 49802 + }, + { + "epoch": 0.43434616525091135, + "grad_norm": 0.11962890625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 49803 + }, + { + "epoch": 0.43435488653607995, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.0197, + "step": 49804 + }, + { + "epoch": 0.43436360782124855, + "grad_norm": 0.107421875, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 49805 + }, + { + "epoch": 0.4343723291064171, + "grad_norm": 0.1787109375, + "learning_rate": 0.0005, + "loss": 0.9902, + "step": 49806 + }, + { + "epoch": 0.4343810503915857, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0033, + "step": 49807 + }, + { + "epoch": 0.4343897716767543, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 49808 + }, + { + "epoch": 0.43439849296192284, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 1.0188, + "step": 49809 + }, + { + "epoch": 0.43440721424709144, + "grad_norm": 0.1474609375, + "learning_rate": 0.0005, + "loss": 1.0051, + "step": 49810 + }, + { + "epoch": 0.43441593553226004, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 1.0131, + "step": 49811 + }, + { + "epoch": 0.4344246568174286, + "grad_norm": 0.23828125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 49812 + }, + { + "epoch": 0.4344333781025972, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 49813 + }, + { + "epoch": 0.4344420993877658, + "grad_norm": 0.1640625, + "learning_rate": 0.0005, + "loss": 0.9952, + "step": 49814 + }, + { + "epoch": 0.4344508206729344, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 49815 + }, + { + "epoch": 0.43445954195810293, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.996, + "step": 49816 + }, + { + "epoch": 0.43446826324327154, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0031, + "step": 49817 + }, + { + "epoch": 0.43447698452844014, + "grad_norm": 0.09716796875, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 49818 + }, + { + "epoch": 0.4344857058136087, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0016, + "step": 49819 + }, + { + "epoch": 0.4344944270987773, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.025, + "step": 49820 + }, + { + "epoch": 0.4345031483839459, + "grad_norm": 0.11572265625, + "learning_rate": 0.0005, + "loss": 0.9923, + "step": 49821 + }, + { + "epoch": 0.4345118696691144, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0117, + "step": 49822 + }, + { + "epoch": 0.434520590954283, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0116, + "step": 49823 + }, + { + "epoch": 0.4345293122394516, + "grad_norm": 0.1162109375, + "learning_rate": 0.0005, + "loss": 1.006, + "step": 49824 + }, + { + "epoch": 0.43453803352462017, + "grad_norm": 0.09912109375, + "learning_rate": 0.0005, + "loss": 0.9914, + "step": 49825 + }, + { + "epoch": 0.43454675480978877, + "grad_norm": 0.123046875, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 49826 + }, + { + "epoch": 0.43455547609495737, + "grad_norm": 0.10888671875, + "learning_rate": 0.0005, + "loss": 1.0127, + "step": 49827 + }, + { + "epoch": 0.4345641973801259, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 1.0312, + "step": 49828 + }, + { + "epoch": 0.4345729186652945, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 49829 + }, + { + "epoch": 0.4345816399504631, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 1.0228, + "step": 49830 + }, + { + "epoch": 0.43459036123563166, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9837, + "step": 49831 + }, + { + "epoch": 0.43459908252080026, + "grad_norm": 0.15625, + "learning_rate": 0.0005, + "loss": 1.0015, + "step": 49832 + }, + { + "epoch": 0.43460780380596886, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0203, + "step": 49833 + }, + { + "epoch": 0.4346165250911374, + "grad_norm": 0.150390625, + "learning_rate": 0.0005, + "loss": 1.0039, + "step": 49834 + }, + { + "epoch": 0.434625246376306, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 49835 + }, + { + "epoch": 0.4346339676614746, + "grad_norm": 0.1435546875, + "learning_rate": 0.0005, + "loss": 0.9991, + "step": 49836 + }, + { + "epoch": 0.43464268894664315, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0102, + "step": 49837 + }, + { + "epoch": 0.43465141023181175, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0041, + "step": 49838 + }, + { + "epoch": 0.43466013151698035, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0043, + "step": 49839 + }, + { + "epoch": 0.4346688528021489, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0022, + "step": 49840 + }, + { + "epoch": 0.4346775740873175, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 49841 + }, + { + "epoch": 0.4346862953724861, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0327, + "step": 49842 + }, + { + "epoch": 0.4346950166576547, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9984, + "step": 49843 + }, + { + "epoch": 0.43470373794282324, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 49844 + }, + { + "epoch": 0.43471245922799184, + "grad_norm": 0.09619140625, + "learning_rate": 0.0005, + "loss": 1.0186, + "step": 49845 + }, + { + "epoch": 0.43472118051316044, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 49846 + }, + { + "epoch": 0.434729901798329, + "grad_norm": 0.12158203125, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 49847 + }, + { + "epoch": 0.4347386230834976, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.0261, + "step": 49848 + }, + { + "epoch": 0.4347473443686662, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 0.9969, + "step": 49849 + }, + { + "epoch": 0.43475606565383473, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0046, + "step": 49850 + }, + { + "epoch": 0.43476478693900333, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0196, + "step": 49851 + }, + { + "epoch": 0.43477350822417193, + "grad_norm": 0.09521484375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 49852 + }, + { + "epoch": 0.4347822295093405, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 49853 + }, + { + "epoch": 0.4347909507945091, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0163, + "step": 49854 + }, + { + "epoch": 0.4347996720796777, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0207, + "step": 49855 + }, + { + "epoch": 0.4348083933648462, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.0077, + "step": 49856 + }, + { + "epoch": 0.4348171146500148, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005, + "loss": 0.9968, + "step": 49857 + }, + { + "epoch": 0.4348258359351834, + "grad_norm": 0.083984375, + "learning_rate": 0.0005, + "loss": 0.9804, + "step": 49858 + }, + { + "epoch": 0.43483455722035197, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 49859 + }, + { + "epoch": 0.43484327850552057, + "grad_norm": 0.10595703125, + "learning_rate": 0.0005, + "loss": 0.9831, + "step": 49860 + }, + { + "epoch": 0.43485199979068917, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 49861 + }, + { + "epoch": 0.4348607210758577, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 1.0098, + "step": 49862 + }, + { + "epoch": 0.4348694423610263, + "grad_norm": 0.10498046875, + "learning_rate": 0.0005, + "loss": 1.0183, + "step": 49863 + }, + { + "epoch": 0.4348781636461949, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005, + "loss": 1.0069, + "step": 49864 + }, + { + "epoch": 0.43488688493136346, + "grad_norm": 0.09814453125, + "learning_rate": 0.0005, + "loss": 1.0026, + "step": 49865 + }, + { + "epoch": 0.43489560621653206, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9979, + "step": 49866 + }, + { + "epoch": 0.43490432750170066, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0118, + "step": 49867 + }, + { + "epoch": 0.43491304878686926, + "grad_norm": 0.0888671875, + "learning_rate": 0.0005, + "loss": 1.0157, + "step": 49868 + }, + { + "epoch": 0.4349217700720378, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0119, + "step": 49869 + }, + { + "epoch": 0.4349304913572064, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 49870 + }, + { + "epoch": 0.434939212642375, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 49871 + }, + { + "epoch": 0.43494793392754355, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 49872 + }, + { + "epoch": 0.43495665521271215, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005, + "loss": 1.0101, + "step": 49873 + }, + { + "epoch": 0.43496537649788075, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 0.9811, + "step": 49874 + }, + { + "epoch": 0.4349740977830493, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0105, + "step": 49875 + }, + { + "epoch": 0.4349828190682179, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.022, + "step": 49876 + }, + { + "epoch": 0.4349915403533865, + "grad_norm": 0.0849609375, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 49877 + }, + { + "epoch": 0.43500026163855504, + "grad_norm": 0.1142578125, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 49878 + }, + { + "epoch": 0.43500898292372364, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 1.0275, + "step": 49879 + }, + { + "epoch": 0.43501770420889224, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9953, + "step": 49880 + }, + { + "epoch": 0.4350264254940608, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 0.9885, + "step": 49881 + }, + { + "epoch": 0.4350351467792294, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0067, + "step": 49882 + }, + { + "epoch": 0.435043868064398, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9962, + "step": 49883 + }, + { + "epoch": 0.43505258934956653, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 0.9869, + "step": 49884 + }, + { + "epoch": 0.43506131063473513, + "grad_norm": 0.0869140625, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 49885 + }, + { + "epoch": 0.43507003191990373, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0177, + "step": 49886 + }, + { + "epoch": 0.4350787532050723, + "grad_norm": 0.078125, + "learning_rate": 0.0005, + "loss": 1.0136, + "step": 49887 + }, + { + "epoch": 0.4350874744902409, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 0.9846, + "step": 49888 + }, + { + "epoch": 0.4350961957754095, + "grad_norm": 0.0927734375, + "learning_rate": 0.0005, + "loss": 1.002, + "step": 49889 + }, + { + "epoch": 0.435104917060578, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 0.9891, + "step": 49890 + }, + { + "epoch": 0.4351136383457466, + "grad_norm": 0.1279296875, + "learning_rate": 0.0005, + "loss": 0.9874, + "step": 49891 + }, + { + "epoch": 0.4351223596309152, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 0.9943, + "step": 49892 + }, + { + "epoch": 0.43513108091608377, + "grad_norm": 0.0947265625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 49893 + }, + { + "epoch": 0.43513980220125237, + "grad_norm": 0.1318359375, + "learning_rate": 0.0005, + "loss": 0.9971, + "step": 49894 + }, + { + "epoch": 0.43514852348642097, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0066, + "step": 49895 + }, + { + "epoch": 0.43515724477158957, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0083, + "step": 49896 + }, + { + "epoch": 0.4351659660567581, + "grad_norm": 0.09228515625, + "learning_rate": 0.0005, + "loss": 1.0145, + "step": 49897 + }, + { + "epoch": 0.4351746873419267, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 49898 + }, + { + "epoch": 0.4351834086270953, + "grad_norm": 0.087890625, + "learning_rate": 0.0005, + "loss": 1.0038, + "step": 49899 + }, + { + "epoch": 0.43519212991226386, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 49900 + }, + { + "epoch": 0.43520085119743246, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0173, + "step": 49901 + }, + { + "epoch": 0.43520957248260106, + "grad_norm": 0.146484375, + "learning_rate": 0.0005, + "loss": 0.9963, + "step": 49902 + }, + { + "epoch": 0.4352182937677696, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 49903 + }, + { + "epoch": 0.4352270150529382, + "grad_norm": 0.15234375, + "learning_rate": 0.0005, + "loss": 0.9941, + "step": 49904 + }, + { + "epoch": 0.4352357363381068, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 1.0182, + "step": 49905 + }, + { + "epoch": 0.43524445762327535, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 1.0137, + "step": 49906 + }, + { + "epoch": 0.43525317890844395, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0162, + "step": 49907 + }, + { + "epoch": 0.43526190019361255, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 0.998, + "step": 49908 + }, + { + "epoch": 0.4352706214787811, + "grad_norm": 0.1220703125, + "learning_rate": 0.0005, + "loss": 1.0007, + "step": 49909 + }, + { + "epoch": 0.4352793427639497, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0093, + "step": 49910 + }, + { + "epoch": 0.4352880640491183, + "grad_norm": 0.1494140625, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 49911 + }, + { + "epoch": 0.43529678533428684, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0283, + "step": 49912 + }, + { + "epoch": 0.43530550661945544, + "grad_norm": 0.2109375, + "learning_rate": 0.0005, + "loss": 1.0226, + "step": 49913 + }, + { + "epoch": 0.43531422790462404, + "grad_norm": 0.2392578125, + "learning_rate": 0.0005, + "loss": 1.0035, + "step": 49914 + }, + { + "epoch": 0.4353229491897926, + "grad_norm": 0.10791015625, + "learning_rate": 0.0005, + "loss": 1.0214, + "step": 49915 + }, + { + "epoch": 0.4353316704749612, + "grad_norm": 0.2060546875, + "learning_rate": 0.0005, + "loss": 1.0132, + "step": 49916 + }, + { + "epoch": 0.4353403917601298, + "grad_norm": 0.1044921875, + "learning_rate": 0.0005, + "loss": 1.0081, + "step": 49917 + }, + { + "epoch": 0.43534911304529833, + "grad_norm": 0.1064453125, + "learning_rate": 0.0005, + "loss": 1.0166, + "step": 49918 + }, + { + "epoch": 0.43535783433046693, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0006, + "step": 49919 + }, + { + "epoch": 0.43536655561563553, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 49920 + }, + { + "epoch": 0.4353752769008041, + "grad_norm": 0.08203125, + "learning_rate": 0.0005, + "loss": 0.9887, + "step": 49921 + }, + { + "epoch": 0.4353839981859727, + "grad_norm": 0.08984375, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 49922 + }, + { + "epoch": 0.4353927194711413, + "grad_norm": 0.1484375, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 49923 + }, + { + "epoch": 0.4354014407563099, + "grad_norm": 0.11669921875, + "learning_rate": 0.0005, + "loss": 1.0023, + "step": 49924 + }, + { + "epoch": 0.4354101620414784, + "grad_norm": 0.115234375, + "learning_rate": 0.0005, + "loss": 1.0147, + "step": 49925 + }, + { + "epoch": 0.435418883326647, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 0.9854, + "step": 49926 + }, + { + "epoch": 0.4354276046118156, + "grad_norm": 0.103515625, + "learning_rate": 0.0005, + "loss": 0.9994, + "step": 49927 + }, + { + "epoch": 0.43543632589698417, + "grad_norm": 0.11865234375, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 49928 + }, + { + "epoch": 0.43544504718215277, + "grad_norm": 0.173828125, + "learning_rate": 0.0005, + "loss": 0.9999, + "step": 49929 + }, + { + "epoch": 0.43545376846732137, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0004, + "step": 49930 + }, + { + "epoch": 0.4354624897524899, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 0.9844, + "step": 49931 + }, + { + "epoch": 0.4354712110376585, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 0.9782, + "step": 49932 + }, + { + "epoch": 0.4354799323228271, + "grad_norm": 0.099609375, + "learning_rate": 0.0005, + "loss": 1.0264, + "step": 49933 + }, + { + "epoch": 0.43548865360799566, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0255, + "step": 49934 + }, + { + "epoch": 0.43549737489316426, + "grad_norm": 0.0986328125, + "learning_rate": 0.0005, + "loss": 1.0096, + "step": 49935 + }, + { + "epoch": 0.43550609617833286, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.0178, + "step": 49936 + }, + { + "epoch": 0.4355148174635014, + "grad_norm": 0.10302734375, + "learning_rate": 0.0005, + "loss": 1.0064, + "step": 49937 + }, + { + "epoch": 0.43552353874867, + "grad_norm": 0.07861328125, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 49938 + }, + { + "epoch": 0.4355322600338386, + "grad_norm": 0.08642578125, + "learning_rate": 0.0005, + "loss": 1.0008, + "step": 49939 + }, + { + "epoch": 0.43554098131900715, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0087, + "step": 49940 + }, + { + "epoch": 0.43554970260417575, + "grad_norm": 0.095703125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 49941 + }, + { + "epoch": 0.43555842388934435, + "grad_norm": 0.1005859375, + "learning_rate": 0.0005, + "loss": 1.0142, + "step": 49942 + }, + { + "epoch": 0.4355671451745129, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 1.0194, + "step": 49943 + }, + { + "epoch": 0.4355758664596815, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 49944 + }, + { + "epoch": 0.4355845877448501, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005, + "loss": 0.9911, + "step": 49945 + }, + { + "epoch": 0.43559330903001864, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005, + "loss": 1.0217, + "step": 49946 + }, + { + "epoch": 0.43560203031518724, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 49947 + }, + { + "epoch": 0.43561075160035584, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.007, + "step": 49948 + }, + { + "epoch": 0.4356194728855244, + "grad_norm": 0.1513671875, + "learning_rate": 0.0005, + "loss": 1.0175, + "step": 49949 + }, + { + "epoch": 0.435628194170693, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0303, + "step": 49950 + }, + { + "epoch": 0.4356369154558616, + "grad_norm": 0.1181640625, + "learning_rate": 0.0005, + "loss": 0.9597, + "step": 49951 + }, + { + "epoch": 0.4356456367410302, + "grad_norm": 0.10009765625, + "learning_rate": 0.0005, + "loss": 1.0097, + "step": 49952 + }, + { + "epoch": 0.43565435802619873, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9948, + "step": 49953 + }, + { + "epoch": 0.43566307931136733, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 0.9983, + "step": 49954 + }, + { + "epoch": 0.43567180059653593, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005, + "loss": 1.0045, + "step": 49955 + }, + { + "epoch": 0.4356805218817045, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0122, + "step": 49956 + }, + { + "epoch": 0.4356892431668731, + "grad_norm": 0.142578125, + "learning_rate": 0.0005, + "loss": 1.0329, + "step": 49957 + }, + { + "epoch": 0.4356979644520417, + "grad_norm": 0.09130859375, + "learning_rate": 0.0005, + "loss": 0.9973, + "step": 49958 + }, + { + "epoch": 0.4357066857372102, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0112, + "step": 49959 + }, + { + "epoch": 0.4357154070223788, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 49960 + }, + { + "epoch": 0.4357241283075474, + "grad_norm": 0.111328125, + "learning_rate": 0.0005, + "loss": 0.9905, + "step": 49961 + }, + { + "epoch": 0.43573284959271597, + "grad_norm": 0.10546875, + "learning_rate": 0.0005, + "loss": 1.008, + "step": 49962 + }, + { + "epoch": 0.43574157087788457, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005, + "loss": 1.0086, + "step": 49963 + }, + { + "epoch": 0.43575029216305317, + "grad_norm": 0.08349609375, + "learning_rate": 0.0005, + "loss": 1.0235, + "step": 49964 + }, + { + "epoch": 0.4357590134482217, + "grad_norm": 0.1328125, + "learning_rate": 0.0005, + "loss": 1.0198, + "step": 49965 + }, + { + "epoch": 0.4357677347333903, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 49966 + }, + { + "epoch": 0.4357764560185589, + "grad_norm": 0.08935546875, + "learning_rate": 0.0005, + "loss": 0.9879, + "step": 49967 + }, + { + "epoch": 0.43578517730372746, + "grad_norm": 0.1943359375, + "learning_rate": 0.0005, + "loss": 0.9986, + "step": 49968 + }, + { + "epoch": 0.43579389858889606, + "grad_norm": 0.10107421875, + "learning_rate": 0.0005, + "loss": 1.0113, + "step": 49969 + }, + { + "epoch": 0.43580261987406466, + "grad_norm": 0.1611328125, + "learning_rate": 0.0005, + "loss": 1.0059, + "step": 49970 + }, + { + "epoch": 0.4358113411592332, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005, + "loss": 0.9964, + "step": 49971 + }, + { + "epoch": 0.4358200624444018, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 0.9895, + "step": 49972 + }, + { + "epoch": 0.4358287837295704, + "grad_norm": 0.2119140625, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 49973 + }, + { + "epoch": 0.43583750501473895, + "grad_norm": 0.12060546875, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 49974 + }, + { + "epoch": 0.43584622629990755, + "grad_norm": 0.12255859375, + "learning_rate": 0.0005, + "loss": 1.0009, + "step": 49975 + }, + { + "epoch": 0.43585494758507615, + "grad_norm": 0.09765625, + "learning_rate": 0.0005, + "loss": 1.0042, + "step": 49976 + }, + { + "epoch": 0.4358636688702447, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 1.009, + "step": 49977 + }, + { + "epoch": 0.4358723901554133, + "grad_norm": 0.08447265625, + "learning_rate": 0.0005, + "loss": 1.0094, + "step": 49978 + }, + { + "epoch": 0.4358811114405819, + "grad_norm": 0.0908203125, + "learning_rate": 0.0005, + "loss": 1.0199, + "step": 49979 + }, + { + "epoch": 0.4358898327257505, + "grad_norm": 0.0859375, + "learning_rate": 0.0005, + "loss": 0.9967, + "step": 49980 + }, + { + "epoch": 0.43589855401091904, + "grad_norm": 0.09033203125, + "learning_rate": 0.0005, + "loss": 0.9993, + "step": 49981 + }, + { + "epoch": 0.43590727529608764, + "grad_norm": 0.09423828125, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 49982 + }, + { + "epoch": 0.43591599658125624, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 0.9871, + "step": 49983 + }, + { + "epoch": 0.4359247178664248, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 1.0076, + "step": 49984 + }, + { + "epoch": 0.4359334391515934, + "grad_norm": 0.0810546875, + "learning_rate": 0.0005, + "loss": 0.9932, + "step": 49985 + }, + { + "epoch": 0.435942160436762, + "grad_norm": 0.1259765625, + "learning_rate": 0.0005, + "loss": 1.0172, + "step": 49986 + }, + { + "epoch": 0.4359508817219305, + "grad_norm": 0.11181640625, + "learning_rate": 0.0005, + "loss": 1.0082, + "step": 49987 + }, + { + "epoch": 0.43595960300709913, + "grad_norm": 0.1337890625, + "learning_rate": 0.0005, + "loss": 1.0021, + "step": 49988 + }, + { + "epoch": 0.43596832429226773, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005, + "loss": 0.9925, + "step": 49989 + }, + { + "epoch": 0.4359770455774363, + "grad_norm": 0.12109375, + "learning_rate": 0.0005, + "loss": 1.0054, + "step": 49990 + }, + { + "epoch": 0.4359857668626049, + "grad_norm": 0.10693359375, + "learning_rate": 0.0005, + "loss": 1.0241, + "step": 49991 + }, + { + "epoch": 0.4359944881477735, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 1.0139, + "step": 49992 + }, + { + "epoch": 0.436003209432942, + "grad_norm": 0.1015625, + "learning_rate": 0.0005, + "loss": 1.0028, + "step": 49993 + }, + { + "epoch": 0.4360119307181106, + "grad_norm": 0.169921875, + "learning_rate": 0.0005, + "loss": 1.016, + "step": 49994 + }, + { + "epoch": 0.4360206520032792, + "grad_norm": 0.091796875, + "learning_rate": 0.0005, + "loss": 1.0185, + "step": 49995 + }, + { + "epoch": 0.43602937328844776, + "grad_norm": 0.12890625, + "learning_rate": 0.0005, + "loss": 0.9878, + "step": 49996 + }, + { + "epoch": 0.43603809457361636, + "grad_norm": 0.140625, + "learning_rate": 0.0005, + "loss": 1.0052, + "step": 49997 + }, + { + "epoch": 0.43604681585878496, + "grad_norm": 0.134765625, + "learning_rate": 0.0005, + "loss": 1.0286, + "step": 49998 + }, + { + "epoch": 0.4360555371439535, + "grad_norm": 0.1025390625, + "learning_rate": 0.0005, + "loss": 0.9935, + "step": 49999 + }, + { + "epoch": 0.4360642584291221, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005, + "loss": 1.0085, + "step": 50000 + } + ], + "logging_steps": 1.0, + "max_steps": 1146620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.949596191949901e+19, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}