gpt_train_6_256 / trainer_state.json
gokulsrinivasagan's picture
End of training
f22b430 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.018308631211857017,
"eval_steps": 1,
"global_step": 336,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 5.448997384481256e-05,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 1
},
{
"epoch": 5.448997384481256e-05,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 283.3287,
"eval_samples_per_second": 119.18,
"eval_steps_per_second": 3.311,
"step": 1
},
{
"epoch": 0.00010897994768962511,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 2
},
{
"epoch": 0.00010897994768962511,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 283.1668,
"eval_samples_per_second": 119.248,
"eval_steps_per_second": 3.313,
"step": 2
},
{
"epoch": 0.00016346992153443767,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 3
},
{
"epoch": 0.00016346992153443767,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 282.1254,
"eval_samples_per_second": 119.688,
"eval_steps_per_second": 3.325,
"step": 3
},
{
"epoch": 0.00021795989537925023,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 4
},
{
"epoch": 0.00021795989537925023,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 281.3948,
"eval_samples_per_second": 119.999,
"eval_steps_per_second": 3.333,
"step": 4
},
{
"epoch": 0.00027244986922406276,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8594,
"step": 5
},
{
"epoch": 0.00027244986922406276,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 281.7028,
"eval_samples_per_second": 119.867,
"eval_steps_per_second": 3.33,
"step": 5
},
{
"epoch": 0.00032693984306887534,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 6
},
{
"epoch": 0.00032693984306887534,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 282.539,
"eval_samples_per_second": 119.513,
"eval_steps_per_second": 3.32,
"step": 6
},
{
"epoch": 0.00038142981691368787,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 7
},
{
"epoch": 0.00038142981691368787,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 282.3676,
"eval_samples_per_second": 119.585,
"eval_steps_per_second": 3.322,
"step": 7
},
{
"epoch": 0.00043591979075850045,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 8
},
{
"epoch": 0.00043591979075850045,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 281.6041,
"eval_samples_per_second": 119.909,
"eval_steps_per_second": 3.331,
"step": 8
},
{
"epoch": 0.000490409764603313,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 9
},
{
"epoch": 0.000490409764603313,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 281.1865,
"eval_samples_per_second": 120.088,
"eval_steps_per_second": 3.336,
"step": 9
},
{
"epoch": 0.0005448997384481255,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8594,
"step": 10
},
{
"epoch": 0.0005448997384481255,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 282.104,
"eval_samples_per_second": 119.697,
"eval_steps_per_second": 3.325,
"step": 10
},
{
"epoch": 0.0005993897122929382,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 11
},
{
"epoch": 0.0005993897122929382,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 280.8618,
"eval_samples_per_second": 120.226,
"eval_steps_per_second": 3.34,
"step": 11
},
{
"epoch": 0.0006538796861377507,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 12
},
{
"epoch": 0.0006538796861377507,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 280.475,
"eval_samples_per_second": 120.392,
"eval_steps_per_second": 3.344,
"step": 12
},
{
"epoch": 0.0007083696599825632,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8594,
"step": 13
},
{
"epoch": 0.0007083696599825632,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 279.8203,
"eval_samples_per_second": 120.674,
"eval_steps_per_second": 3.352,
"step": 13
},
{
"epoch": 0.0007628596338273757,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 14
},
{
"epoch": 0.0007628596338273757,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 279.8528,
"eval_samples_per_second": 120.66,
"eval_steps_per_second": 3.352,
"step": 14
},
{
"epoch": 0.0008173496076721883,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8594,
"step": 15
},
{
"epoch": 0.0008173496076721883,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 281.1019,
"eval_samples_per_second": 120.124,
"eval_steps_per_second": 3.337,
"step": 15
},
{
"epoch": 0.0008718395815170009,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8594,
"step": 16
},
{
"epoch": 0.0008718395815170009,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 280.3712,
"eval_samples_per_second": 120.437,
"eval_steps_per_second": 3.346,
"step": 16
},
{
"epoch": 0.0009263295553618134,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8672,
"step": 17
},
{
"epoch": 0.0009263295553618134,
"eval_accuracy": 0.004507457682298169,
"eval_loss": 10.8671875,
"eval_runtime": 281.4302,
"eval_samples_per_second": 119.984,
"eval_steps_per_second": 3.333,
"step": 17
},
{
"epoch": 0.000980819529206626,
"grad_norm": 3.3501086235046387,
"learning_rate": 9.999994551002616e-06,
"loss": 10.8672,
"step": 18
},
{
"epoch": 0.000980819529206626,
"eval_accuracy": 0.008603349021604294,
"eval_loss": 10.8359375,
"eval_runtime": 281.0275,
"eval_samples_per_second": 120.155,
"eval_steps_per_second": 3.338,
"step": 18
},
{
"epoch": 0.0010353095030514385,
"grad_norm": 3.1848981380462646,
"learning_rate": 9.999989102005233e-06,
"loss": 10.8359,
"step": 19
},
{
"epoch": 0.0010353095030514385,
"eval_accuracy": 0.010782302884632226,
"eval_loss": 10.8046875,
"eval_runtime": 280.8218,
"eval_samples_per_second": 120.244,
"eval_steps_per_second": 3.34,
"step": 19
},
{
"epoch": 0.001089799476896251,
"grad_norm": 3.103949546813965,
"learning_rate": 9.999983653007848e-06,
"loss": 10.8047,
"step": 20
},
{
"epoch": 0.001089799476896251,
"eval_accuracy": 0.011296145649498848,
"eval_loss": 10.7734375,
"eval_runtime": 280.7519,
"eval_samples_per_second": 120.273,
"eval_steps_per_second": 3.341,
"step": 20
},
{
"epoch": 0.0011442894507410636,
"grad_norm": 3.007913827896118,
"learning_rate": 9.999978204010463e-06,
"loss": 10.7891,
"step": 21
},
{
"epoch": 0.0011442894507410636,
"eval_accuracy": 0.011502753864307472,
"eval_loss": 10.75,
"eval_runtime": 282.1403,
"eval_samples_per_second": 119.682,
"eval_steps_per_second": 3.325,
"step": 21
},
{
"epoch": 0.0011987794245858763,
"grad_norm": 3.0112082958221436,
"learning_rate": 9.99997275501308e-06,
"loss": 10.7578,
"step": 22
},
{
"epoch": 0.0011987794245858763,
"eval_accuracy": 0.011935742384539025,
"eval_loss": 10.7265625,
"eval_runtime": 280.7974,
"eval_samples_per_second": 120.254,
"eval_steps_per_second": 3.34,
"step": 22
},
{
"epoch": 0.0012532693984306888,
"grad_norm": 3.0085818767547607,
"learning_rate": 9.999967306015694e-06,
"loss": 10.7188,
"step": 23
},
{
"epoch": 0.0012532693984306888,
"eval_accuracy": 0.012902490504692311,
"eval_loss": 10.703125,
"eval_runtime": 279.9507,
"eval_samples_per_second": 120.618,
"eval_steps_per_second": 3.351,
"step": 23
},
{
"epoch": 0.0013077593722755014,
"grad_norm": 2.6498820781707764,
"learning_rate": 9.99996185701831e-06,
"loss": 10.7188,
"step": 24
},
{
"epoch": 0.0013077593722755014,
"eval_accuracy": 0.014669617484734745,
"eval_loss": 10.6796875,
"eval_runtime": 284.0388,
"eval_samples_per_second": 118.882,
"eval_steps_per_second": 3.302,
"step": 24
},
{
"epoch": 0.0013622493461203139,
"grad_norm": 2.525411367416382,
"learning_rate": 9.999956408020926e-06,
"loss": 10.6953,
"step": 25
},
{
"epoch": 0.0013622493461203139,
"eval_accuracy": 0.017945705260195358,
"eval_loss": 10.6640625,
"eval_runtime": 279.0802,
"eval_samples_per_second": 120.994,
"eval_steps_per_second": 3.361,
"step": 25
},
{
"epoch": 0.0014167393199651264,
"grad_norm": 2.438088893890381,
"learning_rate": 9.99995095902354e-06,
"loss": 10.6719,
"step": 26
},
{
"epoch": 0.0014167393199651264,
"eval_accuracy": 0.023050523249706075,
"eval_loss": 10.640625,
"eval_runtime": 280.2346,
"eval_samples_per_second": 120.495,
"eval_steps_per_second": 3.347,
"step": 26
},
{
"epoch": 0.001471229293809939,
"grad_norm": 2.2894654273986816,
"learning_rate": 9.999945510026156e-06,
"loss": 10.6562,
"step": 27
},
{
"epoch": 0.001471229293809939,
"eval_accuracy": 0.02863128990948001,
"eval_loss": 10.625,
"eval_runtime": 280.5686,
"eval_samples_per_second": 120.352,
"eval_steps_per_second": 3.343,
"step": 27
},
{
"epoch": 0.0015257192676547515,
"grad_norm": 2.0723860263824463,
"learning_rate": 9.999940061028771e-06,
"loss": 10.6641,
"step": 28
},
{
"epoch": 0.0015257192676547515,
"eval_accuracy": 0.03466305129792195,
"eval_loss": 10.609375,
"eval_runtime": 278.9184,
"eval_samples_per_second": 121.064,
"eval_steps_per_second": 3.363,
"step": 28
},
{
"epoch": 0.001580209241499564,
"grad_norm": 2.0480406284332275,
"learning_rate": 9.999934612031386e-06,
"loss": 10.6328,
"step": 29
},
{
"epoch": 0.001580209241499564,
"eval_accuracy": 0.039908271395015946,
"eval_loss": 10.59375,
"eval_runtime": 278.4399,
"eval_samples_per_second": 121.272,
"eval_steps_per_second": 3.369,
"step": 29
},
{
"epoch": 0.0016346992153443765,
"grad_norm": 2.0142972469329834,
"learning_rate": 9.999929163034003e-06,
"loss": 10.6016,
"step": 30
},
{
"epoch": 0.0016346992153443765,
"eval_accuracy": 0.043642156887862514,
"eval_loss": 10.578125,
"eval_runtime": 277.7144,
"eval_samples_per_second": 121.589,
"eval_steps_per_second": 3.378,
"step": 30
},
{
"epoch": 0.0016891891891891893,
"grad_norm": 1.8250195980072021,
"learning_rate": 9.999923714036618e-06,
"loss": 10.6016,
"step": 31
},
{
"epoch": 0.0016891891891891893,
"eval_accuracy": 0.046290082739106744,
"eval_loss": 10.5703125,
"eval_runtime": 277.9514,
"eval_samples_per_second": 121.485,
"eval_steps_per_second": 3.375,
"step": 31
},
{
"epoch": 0.0017436791630340018,
"grad_norm": 1.7317852973937988,
"learning_rate": 9.999918265039233e-06,
"loss": 10.5938,
"step": 32
},
{
"epoch": 0.0017436791630340018,
"eval_accuracy": 0.04786212316182883,
"eval_loss": 10.5546875,
"eval_runtime": 278.188,
"eval_samples_per_second": 121.382,
"eval_steps_per_second": 3.372,
"step": 32
},
{
"epoch": 0.0017981691368788143,
"grad_norm": 1.6736972332000732,
"learning_rate": 9.99991281604185e-06,
"loss": 10.5781,
"step": 33
},
{
"epoch": 0.0017981691368788143,
"eval_accuracy": 0.048402714699356676,
"eval_loss": 10.546875,
"eval_runtime": 276.9089,
"eval_samples_per_second": 121.943,
"eval_steps_per_second": 3.387,
"step": 33
},
{
"epoch": 0.0018526591107236269,
"grad_norm": 1.7245711088180542,
"learning_rate": 9.999907367044465e-06,
"loss": 10.5547,
"step": 34
},
{
"epoch": 0.0018526591107236269,
"eval_accuracy": 0.04842535272989897,
"eval_loss": 10.53125,
"eval_runtime": 277.5457,
"eval_samples_per_second": 121.663,
"eval_steps_per_second": 3.38,
"step": 34
},
{
"epoch": 0.0019071490845684394,
"grad_norm": 1.6025965213775635,
"learning_rate": 9.99990191804708e-06,
"loss": 10.5469,
"step": 35
},
{
"epoch": 0.0019071490845684394,
"eval_accuracy": 0.04835280681616625,
"eval_loss": 10.5234375,
"eval_runtime": 280.2247,
"eval_samples_per_second": 120.5,
"eval_steps_per_second": 3.347,
"step": 35
},
{
"epoch": 0.001961639058413252,
"grad_norm": 1.5452033281326294,
"learning_rate": 9.999896469049695e-06,
"loss": 10.5391,
"step": 36
},
{
"epoch": 0.001961639058413252,
"eval_accuracy": 0.04815010670125943,
"eval_loss": 10.515625,
"eval_runtime": 278.5802,
"eval_samples_per_second": 121.211,
"eval_steps_per_second": 3.367,
"step": 36
},
{
"epoch": 0.0020161290322580645,
"grad_norm": 1.5433681011199951,
"learning_rate": 9.999891020052312e-06,
"loss": 10.5312,
"step": 37
},
{
"epoch": 0.0020161290322580645,
"eval_accuracy": 0.047523855403661705,
"eval_loss": 10.5078125,
"eval_runtime": 279.3479,
"eval_samples_per_second": 120.878,
"eval_steps_per_second": 3.358,
"step": 37
},
{
"epoch": 0.002070619006102877,
"grad_norm": 1.4813498258590698,
"learning_rate": 9.999885571054927e-06,
"loss": 10.5312,
"step": 38
},
{
"epoch": 0.002070619006102877,
"eval_accuracy": 0.047497569813211064,
"eval_loss": 10.4921875,
"eval_runtime": 279.8874,
"eval_samples_per_second": 120.645,
"eval_steps_per_second": 3.351,
"step": 38
},
{
"epoch": 0.0021251089799476895,
"grad_norm": 1.5266352891921997,
"learning_rate": 9.999880122057542e-06,
"loss": 10.4922,
"step": 39
},
{
"epoch": 0.0021251089799476895,
"eval_accuracy": 0.04756021520719255,
"eval_loss": 10.484375,
"eval_runtime": 278.6312,
"eval_samples_per_second": 121.189,
"eval_steps_per_second": 3.366,
"step": 39
},
{
"epoch": 0.002179598953792502,
"grad_norm": 1.433236837387085,
"learning_rate": 9.999874673060158e-06,
"loss": 10.5078,
"step": 40
},
{
"epoch": 0.002179598953792502,
"eval_accuracy": 0.047665212824554305,
"eval_loss": 10.484375,
"eval_runtime": 278.9837,
"eval_samples_per_second": 121.036,
"eval_steps_per_second": 3.362,
"step": 40
},
{
"epoch": 0.0022340889276373146,
"grad_norm": 1.4035075902938843,
"learning_rate": 9.999869224062774e-06,
"loss": 10.4922,
"step": 41
},
{
"epoch": 0.0022340889276373146,
"eval_accuracy": 0.048127468670717134,
"eval_loss": 10.4765625,
"eval_runtime": 280.2687,
"eval_samples_per_second": 120.481,
"eval_steps_per_second": 3.347,
"step": 41
},
{
"epoch": 0.002288578901482127,
"grad_norm": 1.392196774482727,
"learning_rate": 9.999863775065389e-06,
"loss": 10.4844,
"step": 42
},
{
"epoch": 0.002288578901482127,
"eval_accuracy": 0.048623971051575024,
"eval_loss": 10.46875,
"eval_runtime": 279.283,
"eval_samples_per_second": 120.906,
"eval_steps_per_second": 3.359,
"step": 42
},
{
"epoch": 0.0023430688753269396,
"grad_norm": 1.4084738492965698,
"learning_rate": 9.999858326068004e-06,
"loss": 10.4766,
"step": 43
},
{
"epoch": 0.0023430688753269396,
"eval_accuracy": 0.0492689812287014,
"eval_loss": 10.4609375,
"eval_runtime": 279.956,
"eval_samples_per_second": 120.615,
"eval_steps_per_second": 3.351,
"step": 43
},
{
"epoch": 0.0023975588491717526,
"grad_norm": 1.3411015272140503,
"learning_rate": 9.999852877070619e-06,
"loss": 10.4844,
"step": 44
},
{
"epoch": 0.0023975588491717526,
"eval_accuracy": 0.04953739531973483,
"eval_loss": 10.453125,
"eval_runtime": 278.5767,
"eval_samples_per_second": 121.213,
"eval_steps_per_second": 3.367,
"step": 44
},
{
"epoch": 0.002452048823016565,
"grad_norm": 1.381066083908081,
"learning_rate": 9.999847428073235e-06,
"loss": 10.4688,
"step": 45
},
{
"epoch": 0.002452048823016565,
"eval_accuracy": 0.05031273339136427,
"eval_loss": 10.4453125,
"eval_runtime": 278.3035,
"eval_samples_per_second": 121.332,
"eval_steps_per_second": 3.37,
"step": 45
},
{
"epoch": 0.0025065387968613777,
"grad_norm": 1.2761576175689697,
"learning_rate": 9.99984197907585e-06,
"loss": 10.4844,
"step": 46
},
{
"epoch": 0.0025065387968613777,
"eval_accuracy": 0.05126917570733207,
"eval_loss": 10.4453125,
"eval_runtime": 278.7818,
"eval_samples_per_second": 121.123,
"eval_steps_per_second": 3.365,
"step": 46
},
{
"epoch": 0.00256102877070619,
"grad_norm": 1.2938231229782104,
"learning_rate": 9.999836530078465e-06,
"loss": 10.4609,
"step": 47
},
{
"epoch": 0.00256102877070619,
"eval_accuracy": 0.05223774760743953,
"eval_loss": 10.4375,
"eval_runtime": 280.01,
"eval_samples_per_second": 120.592,
"eval_steps_per_second": 3.35,
"step": 47
},
{
"epoch": 0.0026155187445510027,
"grad_norm": 1.408036231994629,
"learning_rate": 9.999831081081082e-06,
"loss": 10.4453,
"step": 48
},
{
"epoch": 0.0026155187445510027,
"eval_accuracy": 0.05263593956410096,
"eval_loss": 10.4296875,
"eval_runtime": 279.456,
"eval_samples_per_second": 120.831,
"eval_steps_per_second": 3.357,
"step": 48
},
{
"epoch": 0.0026700087183958152,
"grad_norm": 1.3031139373779297,
"learning_rate": 9.999825632083697e-06,
"loss": 10.4453,
"step": 49
},
{
"epoch": 0.0026700087183958152,
"eval_accuracy": 0.05322177821382523,
"eval_loss": 10.4296875,
"eval_runtime": 279.6217,
"eval_samples_per_second": 120.76,
"eval_steps_per_second": 3.355,
"step": 49
},
{
"epoch": 0.0027244986922406278,
"grad_norm": 1.3556911945343018,
"learning_rate": 9.999820183086312e-06,
"loss": 10.4297,
"step": 50
},
{
"epoch": 0.0027244986922406278,
"eval_accuracy": 0.05367262819805243,
"eval_loss": 10.421875,
"eval_runtime": 278.4251,
"eval_samples_per_second": 121.279,
"eval_steps_per_second": 3.369,
"step": 50
},
{
"epoch": 0.0027789886660854403,
"grad_norm": 1.3359757661819458,
"learning_rate": 9.999814734088929e-06,
"loss": 10.4219,
"step": 51
},
{
"epoch": 0.0027789886660854403,
"eval_accuracy": 0.05440648251294645,
"eval_loss": 10.4140625,
"eval_runtime": 278.6487,
"eval_samples_per_second": 121.181,
"eval_steps_per_second": 3.366,
"step": 51
},
{
"epoch": 0.002833478639930253,
"grad_norm": 1.2961536645889282,
"learning_rate": 9.999809285091544e-06,
"loss": 10.4297,
"step": 52
},
{
"epoch": 0.002833478639930253,
"eval_accuracy": 0.05480528239625927,
"eval_loss": 10.4140625,
"eval_runtime": 278.1813,
"eval_samples_per_second": 121.385,
"eval_steps_per_second": 3.372,
"step": 52
},
{
"epoch": 0.0028879686137750654,
"grad_norm": 1.2359050512313843,
"learning_rate": 9.999803836094159e-06,
"loss": 10.4375,
"step": 53
},
{
"epoch": 0.0028879686137750654,
"eval_accuracy": 0.055358814086795306,
"eval_loss": 10.40625,
"eval_runtime": 279.3681,
"eval_samples_per_second": 120.869,
"eval_steps_per_second": 3.358,
"step": 53
},
{
"epoch": 0.002942458587619878,
"grad_norm": 1.2747548818588257,
"learning_rate": 9.999798387096776e-06,
"loss": 10.4219,
"step": 54
},
{
"epoch": 0.002942458587619878,
"eval_accuracy": 0.055845676487895415,
"eval_loss": 10.40625,
"eval_runtime": 280.0513,
"eval_samples_per_second": 120.574,
"eval_steps_per_second": 3.349,
"step": 54
},
{
"epoch": 0.0029969485614646904,
"grad_norm": 1.304934024810791,
"learning_rate": 9.99979293809939e-06,
"loss": 10.4141,
"step": 55
},
{
"epoch": 0.0029969485614646904,
"eval_accuracy": 0.05652635169523676,
"eval_loss": 10.3984375,
"eval_runtime": 280.159,
"eval_samples_per_second": 120.528,
"eval_steps_per_second": 3.348,
"step": 55
},
{
"epoch": 0.003051438535309503,
"grad_norm": 1.2796216011047363,
"learning_rate": 9.999787489102006e-06,
"loss": 10.4141,
"step": 56
},
{
"epoch": 0.003051438535309503,
"eval_accuracy": 0.05738361512036325,
"eval_loss": 10.390625,
"eval_runtime": 280.8546,
"eval_samples_per_second": 120.229,
"eval_steps_per_second": 3.34,
"step": 56
},
{
"epoch": 0.0031059285091543155,
"grad_norm": 1.273751974105835,
"learning_rate": 9.999782040104623e-06,
"loss": 10.4219,
"step": 57
},
{
"epoch": 0.0031059285091543155,
"eval_accuracy": 0.05828062536893549,
"eval_loss": 10.390625,
"eval_runtime": 280.5946,
"eval_samples_per_second": 120.341,
"eval_steps_per_second": 3.343,
"step": 57
},
{
"epoch": 0.003160418482999128,
"grad_norm": 1.2347089052200317,
"learning_rate": 9.999776591107238e-06,
"loss": 10.4219,
"step": 58
},
{
"epoch": 0.003160418482999128,
"eval_accuracy": 0.05910523444821581,
"eval_loss": 10.3828125,
"eval_runtime": 281.2657,
"eval_samples_per_second": 120.054,
"eval_steps_per_second": 3.335,
"step": 58
},
{
"epoch": 0.0032149084568439405,
"grad_norm": 1.2617988586425781,
"learning_rate": 9.999771142109853e-06,
"loss": 10.3984,
"step": 59
},
{
"epoch": 0.0032149084568439405,
"eval_accuracy": 0.059817348148100545,
"eval_loss": 10.3828125,
"eval_runtime": 279.7299,
"eval_samples_per_second": 120.713,
"eval_steps_per_second": 3.353,
"step": 59
},
{
"epoch": 0.003269398430688753,
"grad_norm": 1.2854173183441162,
"learning_rate": 9.999765693112468e-06,
"loss": 10.3984,
"step": 60
},
{
"epoch": 0.003269398430688753,
"eval_accuracy": 0.06033486742176367,
"eval_loss": 10.375,
"eval_runtime": 279.8061,
"eval_samples_per_second": 120.68,
"eval_steps_per_second": 3.352,
"step": 60
},
{
"epoch": 0.003323888404533566,
"grad_norm": 1.2649012804031372,
"learning_rate": 9.999760244115083e-06,
"loss": 10.3984,
"step": 61
},
{
"epoch": 0.003323888404533566,
"eval_accuracy": 0.06070726591907321,
"eval_loss": 10.375,
"eval_runtime": 279.7805,
"eval_samples_per_second": 120.691,
"eval_steps_per_second": 3.353,
"step": 61
},
{
"epoch": 0.0033783783783783786,
"grad_norm": 1.284862756729126,
"learning_rate": 9.9997547951177e-06,
"loss": 10.3906,
"step": 62
},
{
"epoch": 0.0033783783783783786,
"eval_accuracy": 0.061101260286951224,
"eval_loss": 10.3671875,
"eval_runtime": 280.6475,
"eval_samples_per_second": 120.318,
"eval_steps_per_second": 3.342,
"step": 62
},
{
"epoch": 0.003432868352223191,
"grad_norm": 1.3201900720596313,
"learning_rate": 9.999749346120314e-06,
"loss": 10.3672,
"step": 63
},
{
"epoch": 0.003432868352223191,
"eval_accuracy": 0.06146384511117401,
"eval_loss": 10.3671875,
"eval_runtime": 280.4398,
"eval_samples_per_second": 120.407,
"eval_steps_per_second": 3.345,
"step": 63
},
{
"epoch": 0.0034873583260680036,
"grad_norm": 1.265735387802124,
"learning_rate": 9.99974389712293e-06,
"loss": 10.3906,
"step": 64
},
{
"epoch": 0.0034873583260680036,
"eval_accuracy": 0.0615552078022117,
"eval_loss": 10.359375,
"eval_runtime": 280.4115,
"eval_samples_per_second": 120.419,
"eval_steps_per_second": 3.345,
"step": 64
},
{
"epoch": 0.003541848299912816,
"grad_norm": 1.2759215831756592,
"learning_rate": 9.999738448125546e-06,
"loss": 10.3828,
"step": 65
},
{
"epoch": 0.003541848299912816,
"eval_accuracy": 0.061487409506137465,
"eval_loss": 10.359375,
"eval_runtime": 280.0889,
"eval_samples_per_second": 120.558,
"eval_steps_per_second": 3.349,
"step": 65
},
{
"epoch": 0.0035963382737576287,
"grad_norm": 1.2972633838653564,
"learning_rate": 9.999732999128161e-06,
"loss": 10.3594,
"step": 66
},
{
"epoch": 0.0035963382737576287,
"eval_accuracy": 0.06136165553596391,
"eval_loss": 10.3515625,
"eval_runtime": 280.2984,
"eval_samples_per_second": 120.468,
"eval_steps_per_second": 3.346,
"step": 66
},
{
"epoch": 0.003650828247602441,
"grad_norm": 1.3436386585235596,
"learning_rate": 9.999727550130776e-06,
"loss": 10.3516,
"step": 67
},
{
"epoch": 0.003650828247602441,
"eval_accuracy": 0.06103861489296974,
"eval_loss": 10.34375,
"eval_runtime": 280.5172,
"eval_samples_per_second": 120.374,
"eval_steps_per_second": 3.344,
"step": 67
},
{
"epoch": 0.0037053182214472537,
"grad_norm": 1.3115813732147217,
"learning_rate": 9.999722101133393e-06,
"loss": 10.3516,
"step": 68
},
{
"epoch": 0.0037053182214472537,
"eval_accuracy": 0.060934138355594886,
"eval_loss": 10.34375,
"eval_runtime": 279.6787,
"eval_samples_per_second": 120.735,
"eval_steps_per_second": 3.354,
"step": 68
},
{
"epoch": 0.0037598081952920663,
"grad_norm": 1.3387259244918823,
"learning_rate": 9.999716652136008e-06,
"loss": 10.3438,
"step": 69
},
{
"epoch": 0.0037598081952920663,
"eval_accuracy": 0.061064147812328176,
"eval_loss": 10.3359375,
"eval_runtime": 279.3007,
"eval_samples_per_second": 120.898,
"eval_steps_per_second": 3.358,
"step": 69
},
{
"epoch": 0.003814298169136879,
"grad_norm": 1.507016897201538,
"learning_rate": 9.999711203138623e-06,
"loss": 10.3594,
"step": 70
},
{
"epoch": 0.003814298169136879,
"eval_accuracy": 0.061038383301864445,
"eval_loss": 10.3359375,
"eval_runtime": 280.6833,
"eval_samples_per_second": 120.303,
"eval_steps_per_second": 3.342,
"step": 70
},
{
"epoch": 0.0038687881429816913,
"grad_norm": 1.2702033519744873,
"learning_rate": 9.99970575414124e-06,
"loss": 10.3594,
"step": 71
},
{
"epoch": 0.0038687881429816913,
"eval_accuracy": 0.06097628793675803,
"eval_loss": 10.328125,
"eval_runtime": 279.2628,
"eval_samples_per_second": 120.915,
"eval_steps_per_second": 3.359,
"step": 71
},
{
"epoch": 0.003923278116826504,
"grad_norm": 1.3358463048934937,
"learning_rate": 9.999700305143855e-06,
"loss": 10.3203,
"step": 72
},
{
"epoch": 0.003923278116826504,
"eval_accuracy": 0.06098071711664674,
"eval_loss": 10.328125,
"eval_runtime": 280.3079,
"eval_samples_per_second": 120.464,
"eval_steps_per_second": 3.346,
"step": 72
},
{
"epoch": 0.003977768090671316,
"grad_norm": 1.259023904800415,
"learning_rate": 9.99969485614647e-06,
"loss": 10.3516,
"step": 73
},
{
"epoch": 0.003977768090671316,
"eval_accuracy": 0.06099933125173458,
"eval_loss": 10.3203125,
"eval_runtime": 280.2434,
"eval_samples_per_second": 120.492,
"eval_steps_per_second": 3.347,
"step": 73
},
{
"epoch": 0.004032258064516129,
"grad_norm": 1.458150029182434,
"learning_rate": 9.999689407149085e-06,
"loss": 10.3203,
"step": 74
},
{
"epoch": 0.004032258064516129,
"eval_accuracy": 0.061083225129626606,
"eval_loss": 10.3125,
"eval_runtime": 280.0404,
"eval_samples_per_second": 120.579,
"eval_steps_per_second": 3.35,
"step": 74
},
{
"epoch": 0.0040867480383609414,
"grad_norm": 1.2943834066390991,
"learning_rate": 9.999683958151702e-06,
"loss": 10.3281,
"step": 75
},
{
"epoch": 0.0040867480383609414,
"eval_accuracy": 0.06123138553923716,
"eval_loss": 10.3125,
"eval_runtime": 280.4851,
"eval_samples_per_second": 120.388,
"eval_steps_per_second": 3.344,
"step": 75
},
{
"epoch": 0.004141238012205754,
"grad_norm": 1.2712702751159668,
"learning_rate": 9.999678509154317e-06,
"loss": 10.3438,
"step": 76
},
{
"epoch": 0.004141238012205754,
"eval_accuracy": 0.061402212928278174,
"eval_loss": 10.3046875,
"eval_runtime": 279.7875,
"eval_samples_per_second": 120.688,
"eval_steps_per_second": 3.353,
"step": 76
},
{
"epoch": 0.0041957279860505665,
"grad_norm": 1.3315693140029907,
"learning_rate": 9.999673060156932e-06,
"loss": 10.2969,
"step": 77
},
{
"epoch": 0.0041957279860505665,
"eval_accuracy": 0.0617529576572429,
"eval_loss": 10.3046875,
"eval_runtime": 280.9468,
"eval_samples_per_second": 120.19,
"eval_steps_per_second": 3.339,
"step": 77
},
{
"epoch": 0.004250217959895379,
"grad_norm": 1.2693438529968262,
"learning_rate": 9.999667611159547e-06,
"loss": 10.3281,
"step": 78
},
{
"epoch": 0.004250217959895379,
"eval_accuracy": 0.06221338972345156,
"eval_loss": 10.296875,
"eval_runtime": 280.2001,
"eval_samples_per_second": 120.51,
"eval_steps_per_second": 3.348,
"step": 78
},
{
"epoch": 0.004304707933740192,
"grad_norm": 1.4125028848648071,
"learning_rate": 9.999662162162162e-06,
"loss": 10.2891,
"step": 79
},
{
"epoch": 0.004304707933740192,
"eval_accuracy": 0.0628054234352424,
"eval_loss": 10.296875,
"eval_runtime": 280.5046,
"eval_samples_per_second": 120.379,
"eval_steps_per_second": 3.344,
"step": 79
},
{
"epoch": 0.004359197907585004,
"grad_norm": 1.4127213954925537,
"learning_rate": 9.999656713164779e-06,
"loss": 10.3047,
"step": 80
},
{
"epoch": 0.004359197907585004,
"eval_accuracy": 0.0632026600785945,
"eval_loss": 10.2890625,
"eval_runtime": 281.1308,
"eval_samples_per_second": 120.111,
"eval_steps_per_second": 3.337,
"step": 80
},
{
"epoch": 0.004413687881429817,
"grad_norm": 1.2919211387634277,
"learning_rate": 9.999651264167394e-06,
"loss": 10.2969,
"step": 81
},
{
"epoch": 0.004413687881429817,
"eval_accuracy": 0.06365513120055874,
"eval_loss": 10.28125,
"eval_runtime": 280.4747,
"eval_samples_per_second": 120.392,
"eval_steps_per_second": 3.344,
"step": 81
},
{
"epoch": 0.004468177855274629,
"grad_norm": 1.3636354207992554,
"learning_rate": 9.999645815170009e-06,
"loss": 10.2891,
"step": 82
},
{
"epoch": 0.004468177855274629,
"eval_accuracy": 0.06432144775937197,
"eval_loss": 10.28125,
"eval_runtime": 280.6686,
"eval_samples_per_second": 120.309,
"eval_steps_per_second": 3.342,
"step": 82
},
{
"epoch": 0.004522667829119442,
"grad_norm": 1.2588390111923218,
"learning_rate": 9.999640366172625e-06,
"loss": 10.3125,
"step": 83
},
{
"epoch": 0.004522667829119442,
"eval_accuracy": 0.06491886596436085,
"eval_loss": 10.2734375,
"eval_runtime": 280.7018,
"eval_samples_per_second": 120.295,
"eval_steps_per_second": 3.342,
"step": 83
},
{
"epoch": 0.004577157802964254,
"grad_norm": 1.2841159105300903,
"learning_rate": 9.99963491717524e-06,
"loss": 10.2891,
"step": 84
},
{
"epoch": 0.004577157802964254,
"eval_accuracy": 0.0653847114726557,
"eval_loss": 10.2734375,
"eval_runtime": 280.4683,
"eval_samples_per_second": 120.395,
"eval_steps_per_second": 3.344,
"step": 84
},
{
"epoch": 0.004631647776809067,
"grad_norm": 1.2950408458709717,
"learning_rate": 9.999629468177855e-06,
"loss": 10.2812,
"step": 85
},
{
"epoch": 0.004631647776809067,
"eval_accuracy": 0.06567657416309995,
"eval_loss": 10.265625,
"eval_runtime": 279.8035,
"eval_samples_per_second": 120.681,
"eval_steps_per_second": 3.352,
"step": 85
},
{
"epoch": 0.004686137750653879,
"grad_norm": 1.284853458404541,
"learning_rate": 9.999624019180472e-06,
"loss": 10.3047,
"step": 86
},
{
"epoch": 0.004686137750653879,
"eval_accuracy": 0.06585295973866796,
"eval_loss": 10.265625,
"eval_runtime": 280.2049,
"eval_samples_per_second": 120.508,
"eval_steps_per_second": 3.348,
"step": 86
},
{
"epoch": 0.004740627724498693,
"grad_norm": 1.229882836341858,
"learning_rate": 9.999618570183087e-06,
"loss": 10.2969,
"step": 87
},
{
"epoch": 0.004740627724498693,
"eval_accuracy": 0.06600204651269968,
"eval_loss": 10.2578125,
"eval_runtime": 280.4096,
"eval_samples_per_second": 120.42,
"eval_steps_per_second": 3.345,
"step": 87
},
{
"epoch": 0.004795117698343505,
"grad_norm": 1.3210368156433105,
"learning_rate": 9.999613121185702e-06,
"loss": 10.2578,
"step": 88
},
{
"epoch": 0.004795117698343505,
"eval_accuracy": 0.06612053431194471,
"eval_loss": 10.25,
"eval_runtime": 280.0298,
"eval_samples_per_second": 120.584,
"eval_steps_per_second": 3.35,
"step": 88
},
{
"epoch": 0.004849607672188318,
"grad_norm": 1.2671563625335693,
"learning_rate": 9.999607672188319e-06,
"loss": 10.2812,
"step": 89
},
{
"epoch": 0.004849607672188318,
"eval_accuracy": 0.06618060325487982,
"eval_loss": 10.25,
"eval_runtime": 280.1841,
"eval_samples_per_second": 120.517,
"eval_steps_per_second": 3.348,
"step": 89
},
{
"epoch": 0.00490409764603313,
"grad_norm": 1.2486541271209717,
"learning_rate": 9.999602223190934e-06,
"loss": 10.2734,
"step": 90
},
{
"epoch": 0.00490409764603313,
"eval_accuracy": 0.06629969898077623,
"eval_loss": 10.2421875,
"eval_runtime": 279.9304,
"eval_samples_per_second": 120.626,
"eval_steps_per_second": 3.351,
"step": 90
},
{
"epoch": 0.004958587619877943,
"grad_norm": 1.219537615776062,
"learning_rate": 9.999596774193549e-06,
"loss": 10.2891,
"step": 91
},
{
"epoch": 0.004958587619877943,
"eval_accuracy": 0.06638553243417508,
"eval_loss": 10.2421875,
"eval_runtime": 280.2115,
"eval_samples_per_second": 120.505,
"eval_steps_per_second": 3.347,
"step": 91
},
{
"epoch": 0.005013077593722755,
"grad_norm": 1.3005180358886719,
"learning_rate": 9.999591325196166e-06,
"loss": 10.2578,
"step": 92
},
{
"epoch": 0.005013077593722755,
"eval_accuracy": 0.06655224908109715,
"eval_loss": 10.234375,
"eval_runtime": 280.5566,
"eval_samples_per_second": 120.357,
"eval_steps_per_second": 3.343,
"step": 92
},
{
"epoch": 0.005067567567567568,
"grad_norm": 1.2326613664627075,
"learning_rate": 9.99958587619878e-06,
"loss": 10.2734,
"step": 93
},
{
"epoch": 0.005067567567567568,
"eval_accuracy": 0.06677831094875031,
"eval_loss": 10.234375,
"eval_runtime": 279.0494,
"eval_samples_per_second": 121.007,
"eval_steps_per_second": 3.361,
"step": 93
},
{
"epoch": 0.00512205754141238,
"grad_norm": 1.343145728111267,
"learning_rate": 9.999580427201396e-06,
"loss": 10.2266,
"step": 94
},
{
"epoch": 0.00512205754141238,
"eval_accuracy": 0.0670887877742824,
"eval_loss": 10.2265625,
"eval_runtime": 278.9829,
"eval_samples_per_second": 121.036,
"eval_steps_per_second": 3.362,
"step": 94
},
{
"epoch": 0.005176547515257193,
"grad_norm": 1.2524234056472778,
"learning_rate": 9.999574978204011e-06,
"loss": 10.2578,
"step": 95
},
{
"epoch": 0.005176547515257193,
"eval_accuracy": 0.06735230950321652,
"eval_loss": 10.2265625,
"eval_runtime": 278.058,
"eval_samples_per_second": 121.439,
"eval_steps_per_second": 3.373,
"step": 95
},
{
"epoch": 0.0052310374891020054,
"grad_norm": 1.232895016670227,
"learning_rate": 9.999569529206626e-06,
"loss": 10.25,
"step": 96
},
{
"epoch": 0.0052310374891020054,
"eval_accuracy": 0.0675672260489275,
"eval_loss": 10.21875,
"eval_runtime": 280.2022,
"eval_samples_per_second": 120.509,
"eval_steps_per_second": 3.348,
"step": 96
},
{
"epoch": 0.005285527462946818,
"grad_norm": 1.2870153188705444,
"learning_rate": 9.999564080209243e-06,
"loss": 10.2266,
"step": 97
},
{
"epoch": 0.005285527462946818,
"eval_accuracy": 0.06775721760193143,
"eval_loss": 10.21875,
"eval_runtime": 279.5356,
"eval_samples_per_second": 120.797,
"eval_steps_per_second": 3.356,
"step": 97
},
{
"epoch": 0.0053400174367916305,
"grad_norm": 1.2629221677780151,
"learning_rate": 9.999558631211858e-06,
"loss": 10.2266,
"step": 98
},
{
"epoch": 0.0053400174367916305,
"eval_accuracy": 0.06793869818181586,
"eval_loss": 10.2109375,
"eval_runtime": 279.0624,
"eval_samples_per_second": 121.002,
"eval_steps_per_second": 3.361,
"step": 98
},
{
"epoch": 0.005394507410636443,
"grad_norm": 1.279175043106079,
"learning_rate": 9.999553182214473e-06,
"loss": 10.2344,
"step": 99
},
{
"epoch": 0.005394507410636443,
"eval_accuracy": 0.06805782285660043,
"eval_loss": 10.2109375,
"eval_runtime": 278.9598,
"eval_samples_per_second": 121.046,
"eval_steps_per_second": 3.362,
"step": 99
},
{
"epoch": 0.0054489973844812556,
"grad_norm": 1.21150803565979,
"learning_rate": 9.99954773321709e-06,
"loss": 10.2422,
"step": 100
},
{
"epoch": 0.0054489973844812556,
"eval_accuracy": 0.0681561332807969,
"eval_loss": 10.203125,
"eval_runtime": 278.9899,
"eval_samples_per_second": 121.033,
"eval_steps_per_second": 3.362,
"step": 100
},
{
"epoch": 0.005503487358326068,
"grad_norm": 1.2411643266677856,
"learning_rate": 9.999542284219704e-06,
"loss": 10.2422,
"step": 101
},
{
"epoch": 0.005503487358326068,
"eval_accuracy": 0.06829934343053183,
"eval_loss": 10.203125,
"eval_runtime": 278.3501,
"eval_samples_per_second": 121.311,
"eval_steps_per_second": 3.37,
"step": 101
},
{
"epoch": 0.005557977332170881,
"grad_norm": 1.2620774507522583,
"learning_rate": 9.99953683522232e-06,
"loss": 10.2266,
"step": 102
},
{
"epoch": 0.005557977332170881,
"eval_accuracy": 0.0684582438776503,
"eval_loss": 10.1953125,
"eval_runtime": 280.5409,
"eval_samples_per_second": 120.364,
"eval_steps_per_second": 3.344,
"step": 102
},
{
"epoch": 0.005612467306015693,
"grad_norm": 1.2730474472045898,
"learning_rate": 9.999531386224936e-06,
"loss": 10.2188,
"step": 103
},
{
"epoch": 0.005612467306015693,
"eval_accuracy": 0.06855808859291931,
"eval_loss": 10.1953125,
"eval_runtime": 279.3284,
"eval_samples_per_second": 120.886,
"eval_steps_per_second": 3.358,
"step": 103
},
{
"epoch": 0.005666957279860506,
"grad_norm": 1.250596046447754,
"learning_rate": 9.999525937227551e-06,
"loss": 10.2109,
"step": 104
},
{
"epoch": 0.005666957279860506,
"eval_accuracy": 0.06870066186711471,
"eval_loss": 10.1875,
"eval_runtime": 279.5974,
"eval_samples_per_second": 120.77,
"eval_steps_per_second": 3.355,
"step": 104
},
{
"epoch": 0.005721447253705318,
"grad_norm": 1.3434346914291382,
"learning_rate": 9.999520488230166e-06,
"loss": 10.1797,
"step": 105
},
{
"epoch": 0.005721447253705318,
"eval_accuracy": 0.06888807696907226,
"eval_loss": 10.1875,
"eval_runtime": 279.5391,
"eval_samples_per_second": 120.795,
"eval_steps_per_second": 3.356,
"step": 105
},
{
"epoch": 0.005775937227550131,
"grad_norm": 1.28876793384552,
"learning_rate": 9.999515039232783e-06,
"loss": 10.1797,
"step": 106
},
{
"epoch": 0.005775937227550131,
"eval_accuracy": 0.06907245243777284,
"eval_loss": 10.1796875,
"eval_runtime": 279.0416,
"eval_samples_per_second": 121.011,
"eval_steps_per_second": 3.362,
"step": 106
},
{
"epoch": 0.005830427201394943,
"grad_norm": 1.3488516807556152,
"learning_rate": 9.999509590235398e-06,
"loss": 10.1719,
"step": 107
},
{
"epoch": 0.005830427201394943,
"eval_accuracy": 0.06933892695329945,
"eval_loss": 10.1796875,
"eval_runtime": 279.7608,
"eval_samples_per_second": 120.7,
"eval_steps_per_second": 3.353,
"step": 107
},
{
"epoch": 0.005884917175239756,
"grad_norm": 1.272630214691162,
"learning_rate": 9.999504141238013e-06,
"loss": 10.1875,
"step": 108
},
{
"epoch": 0.005884917175239756,
"eval_accuracy": 0.06959364822023249,
"eval_loss": 10.171875,
"eval_runtime": 279.3313,
"eval_samples_per_second": 120.885,
"eval_steps_per_second": 3.358,
"step": 108
},
{
"epoch": 0.005939407149084568,
"grad_norm": 1.275840401649475,
"learning_rate": 9.999498692240628e-06,
"loss": 10.1797,
"step": 109
},
{
"epoch": 0.005939407149084568,
"eval_accuracy": 0.06980685678154193,
"eval_loss": 10.171875,
"eval_runtime": 279.6516,
"eval_samples_per_second": 120.747,
"eval_steps_per_second": 3.354,
"step": 109
},
{
"epoch": 0.005993897122929381,
"grad_norm": 1.2549141645431519,
"learning_rate": 9.999493243243245e-06,
"loss": 10.1797,
"step": 110
},
{
"epoch": 0.005993897122929381,
"eval_accuracy": 0.07002058642283829,
"eval_loss": 10.1640625,
"eval_runtime": 279.1415,
"eval_samples_per_second": 120.967,
"eval_steps_per_second": 3.36,
"step": 110
},
{
"epoch": 0.006048387096774193,
"grad_norm": 1.3456748723983765,
"learning_rate": 9.999487794245858e-06,
"loss": 10.1406,
"step": 111
},
{
"epoch": 0.006048387096774193,
"eval_accuracy": 0.07022299704886349,
"eval_loss": 10.1640625,
"eval_runtime": 279.1359,
"eval_samples_per_second": 120.97,
"eval_steps_per_second": 3.36,
"step": 111
},
{
"epoch": 0.006102877070619006,
"grad_norm": 1.2664400339126587,
"learning_rate": 9.999482345248475e-06,
"loss": 10.1719,
"step": 112
},
{
"epoch": 0.006102877070619006,
"eval_accuracy": 0.07042644983486251,
"eval_loss": 10.1640625,
"eval_runtime": 279.7876,
"eval_samples_per_second": 120.688,
"eval_steps_per_second": 3.353,
"step": 112
},
{
"epoch": 0.0061573670444638184,
"grad_norm": 1.226805329322815,
"learning_rate": 9.99947689625109e-06,
"loss": 10.1953,
"step": 113
},
{
"epoch": 0.0061573670444638184,
"eval_accuracy": 0.07062440233211086,
"eval_loss": 10.15625,
"eval_runtime": 280.1774,
"eval_samples_per_second": 120.52,
"eval_steps_per_second": 3.348,
"step": 113
},
{
"epoch": 0.006211857018308631,
"grad_norm": 1.2513021230697632,
"learning_rate": 9.999471447253705e-06,
"loss": 10.1719,
"step": 114
},
{
"epoch": 0.006211857018308631,
"eval_accuracy": 0.070792566423441,
"eval_loss": 10.15625,
"eval_runtime": 279.7452,
"eval_samples_per_second": 120.706,
"eval_steps_per_second": 3.353,
"step": 114
},
{
"epoch": 0.0062663469921534435,
"grad_norm": 1.2754563093185425,
"learning_rate": 9.999465998256322e-06,
"loss": 10.1641,
"step": 115
},
{
"epoch": 0.0062663469921534435,
"eval_accuracy": 0.0709808499920434,
"eval_loss": 10.1484375,
"eval_runtime": 279.6651,
"eval_samples_per_second": 120.741,
"eval_steps_per_second": 3.354,
"step": 115
},
{
"epoch": 0.006320836965998256,
"grad_norm": 1.2457561492919922,
"learning_rate": 9.999460549258937e-06,
"loss": 10.1719,
"step": 116
},
{
"epoch": 0.006320836965998256,
"eval_accuracy": 0.07117214424501458,
"eval_loss": 10.1484375,
"eval_runtime": 279.2608,
"eval_samples_per_second": 120.916,
"eval_steps_per_second": 3.359,
"step": 116
},
{
"epoch": 0.0063753269398430686,
"grad_norm": 1.2697603702545166,
"learning_rate": 9.999455100261552e-06,
"loss": 10.1484,
"step": 117
},
{
"epoch": 0.0063753269398430686,
"eval_accuracy": 0.07134343481626618,
"eval_loss": 10.140625,
"eval_runtime": 279.2939,
"eval_samples_per_second": 120.901,
"eval_steps_per_second": 3.358,
"step": 117
},
{
"epoch": 0.006429816913687881,
"grad_norm": 1.2824610471725464,
"learning_rate": 9.999449651264169e-06,
"loss": 10.1562,
"step": 118
},
{
"epoch": 0.006429816913687881,
"eval_accuracy": 0.07150007725010805,
"eval_loss": 10.140625,
"eval_runtime": 280.6459,
"eval_samples_per_second": 120.319,
"eval_steps_per_second": 3.342,
"step": 118
},
{
"epoch": 0.006484306887532694,
"grad_norm": 1.2311967611312866,
"learning_rate": 9.999444202266784e-06,
"loss": 10.1562,
"step": 119
},
{
"epoch": 0.006484306887532694,
"eval_accuracy": 0.07164412691759968,
"eval_loss": 10.1328125,
"eval_runtime": 278.8366,
"eval_samples_per_second": 121.1,
"eval_steps_per_second": 3.364,
"step": 119
},
{
"epoch": 0.006538796861377506,
"grad_norm": 1.260802984237671,
"learning_rate": 9.999438753269399e-06,
"loss": 10.1484,
"step": 120
},
{
"epoch": 0.006538796861377506,
"eval_accuracy": 0.07180059565811259,
"eval_loss": 10.1328125,
"eval_runtime": 279.3119,
"eval_samples_per_second": 120.894,
"eval_steps_per_second": 3.358,
"step": 120
},
{
"epoch": 0.006593286835222319,
"grad_norm": 1.2933627367019653,
"learning_rate": 9.999433304272015e-06,
"loss": 10.1406,
"step": 121
},
{
"epoch": 0.006593286835222319,
"eval_accuracy": 0.07193274733256984,
"eval_loss": 10.125,
"eval_runtime": 280.0499,
"eval_samples_per_second": 120.575,
"eval_steps_per_second": 3.349,
"step": 121
},
{
"epoch": 0.006647776809067132,
"grad_norm": 1.2777124643325806,
"learning_rate": 9.99942785527463e-06,
"loss": 10.1328,
"step": 122
},
{
"epoch": 0.006647776809067132,
"eval_accuracy": 0.07206449372259283,
"eval_loss": 10.125,
"eval_runtime": 278.3594,
"eval_samples_per_second": 121.307,
"eval_steps_per_second": 3.37,
"step": 122
},
{
"epoch": 0.006702266782911945,
"grad_norm": 1.225155234336853,
"learning_rate": 9.999422406277245e-06,
"loss": 10.1641,
"step": 123
},
{
"epoch": 0.006702266782911945,
"eval_accuracy": 0.07220330364132721,
"eval_loss": 10.1171875,
"eval_runtime": 279.7375,
"eval_samples_per_second": 120.71,
"eval_steps_per_second": 3.353,
"step": 123
},
{
"epoch": 0.006756756756756757,
"grad_norm": 1.2431164979934692,
"learning_rate": 9.999416957279862e-06,
"loss": 10.1328,
"step": 124
},
{
"epoch": 0.006756756756756757,
"eval_accuracy": 0.07233015766925091,
"eval_loss": 10.1171875,
"eval_runtime": 278.5065,
"eval_samples_per_second": 121.243,
"eval_steps_per_second": 3.368,
"step": 124
},
{
"epoch": 0.00681124673060157,
"grad_norm": 1.2372488975524902,
"learning_rate": 9.999411508282477e-06,
"loss": 10.1484,
"step": 125
},
{
"epoch": 0.00681124673060157,
"eval_accuracy": 0.07246410417477417,
"eval_loss": 10.109375,
"eval_runtime": 278.7434,
"eval_samples_per_second": 121.14,
"eval_steps_per_second": 3.365,
"step": 125
},
{
"epoch": 0.006865736704446382,
"grad_norm": 1.2289838790893555,
"learning_rate": 9.999406059285092e-06,
"loss": 10.1406,
"step": 126
},
{
"epoch": 0.006865736704446382,
"eval_accuracy": 0.07261460944432581,
"eval_loss": 10.109375,
"eval_runtime": 279.5991,
"eval_samples_per_second": 120.769,
"eval_steps_per_second": 3.355,
"step": 126
},
{
"epoch": 0.006920226678291195,
"grad_norm": 1.231143832206726,
"learning_rate": 9.999400610287709e-06,
"loss": 10.1406,
"step": 127
},
{
"epoch": 0.006920226678291195,
"eval_accuracy": 0.07277023866708203,
"eval_loss": 10.1015625,
"eval_runtime": 279.4963,
"eval_samples_per_second": 120.814,
"eval_steps_per_second": 3.356,
"step": 127
},
{
"epoch": 0.006974716652136007,
"grad_norm": 1.3046151399612427,
"learning_rate": 9.999395161290324e-06,
"loss": 10.125,
"step": 128
},
{
"epoch": 0.006974716652136007,
"eval_accuracy": 0.07291527259677114,
"eval_loss": 10.1015625,
"eval_runtime": 280.7304,
"eval_samples_per_second": 120.283,
"eval_steps_per_second": 3.341,
"step": 128
},
{
"epoch": 0.00702920662598082,
"grad_norm": 1.2855069637298584,
"learning_rate": 9.999389712292939e-06,
"loss": 10.1172,
"step": 129
},
{
"epoch": 0.00702920662598082,
"eval_accuracy": 0.07305231663332767,
"eval_loss": 10.09375,
"eval_runtime": 279.8824,
"eval_samples_per_second": 120.647,
"eval_steps_per_second": 3.351,
"step": 129
},
{
"epoch": 0.007083696599825632,
"grad_norm": 1.282074213027954,
"learning_rate": 9.999384263295554e-06,
"loss": 10.1016,
"step": 130
},
{
"epoch": 0.007083696599825632,
"eval_accuracy": 0.07317312034362562,
"eval_loss": 10.09375,
"eval_runtime": 279.9541,
"eval_samples_per_second": 120.616,
"eval_steps_per_second": 3.351,
"step": 130
},
{
"epoch": 0.007138186573670445,
"grad_norm": 1.218042254447937,
"learning_rate": 9.999378814298169e-06,
"loss": 10.1172,
"step": 131
},
{
"epoch": 0.007138186573670445,
"eval_accuracy": 0.07330454829587883,
"eval_loss": 10.0859375,
"eval_runtime": 280.2317,
"eval_samples_per_second": 120.497,
"eval_steps_per_second": 3.347,
"step": 131
},
{
"epoch": 0.007192676547515257,
"grad_norm": 1.2612570524215698,
"learning_rate": 9.999373365300786e-06,
"loss": 10.1172,
"step": 132
},
{
"epoch": 0.007192676547515257,
"eval_accuracy": 0.07344506619901475,
"eval_loss": 10.0859375,
"eval_runtime": 280.7991,
"eval_samples_per_second": 120.253,
"eval_steps_per_second": 3.34,
"step": 132
},
{
"epoch": 0.00724716652136007,
"grad_norm": 1.2458122968673706,
"learning_rate": 9.999367916303401e-06,
"loss": 10.1172,
"step": 133
},
{
"epoch": 0.00724716652136007,
"eval_accuracy": 0.07357432298465584,
"eval_loss": 10.0859375,
"eval_runtime": 281.5847,
"eval_samples_per_second": 119.918,
"eval_steps_per_second": 3.331,
"step": 133
},
{
"epoch": 0.007301656495204882,
"grad_norm": 1.2749879360198975,
"learning_rate": 9.999362467306016e-06,
"loss": 10.0938,
"step": 134
},
{
"epoch": 0.007301656495204882,
"eval_accuracy": 0.07369055277062427,
"eval_loss": 10.078125,
"eval_runtime": 279.1661,
"eval_samples_per_second": 120.957,
"eval_steps_per_second": 3.36,
"step": 134
},
{
"epoch": 0.007356146469049695,
"grad_norm": 1.2771598100662231,
"learning_rate": 9.999357018308633e-06,
"loss": 10.1094,
"step": 135
},
{
"epoch": 0.007356146469049695,
"eval_accuracy": 0.07383138911152996,
"eval_loss": 10.078125,
"eval_runtime": 280.6696,
"eval_samples_per_second": 120.309,
"eval_steps_per_second": 3.342,
"step": 135
},
{
"epoch": 0.0074106364428945075,
"grad_norm": 1.2409011125564575,
"learning_rate": 9.999351569311248e-06,
"loss": 10.1094,
"step": 136
},
{
"epoch": 0.0074106364428945075,
"eval_accuracy": 0.07399631092738602,
"eval_loss": 10.0703125,
"eval_runtime": 279.9187,
"eval_samples_per_second": 120.631,
"eval_steps_per_second": 3.351,
"step": 136
},
{
"epoch": 0.00746512641673932,
"grad_norm": 1.3785778284072876,
"learning_rate": 9.999346120313863e-06,
"loss": 10.0703,
"step": 137
},
{
"epoch": 0.00746512641673932,
"eval_accuracy": 0.07418262597159345,
"eval_loss": 10.0703125,
"eval_runtime": 278.8604,
"eval_samples_per_second": 121.089,
"eval_steps_per_second": 3.364,
"step": 137
},
{
"epoch": 0.0075196163905841325,
"grad_norm": 1.2543950080871582,
"learning_rate": 9.99934067131648e-06,
"loss": 10.0781,
"step": 138
},
{
"epoch": 0.0075196163905841325,
"eval_accuracy": 0.07433802360324437,
"eval_loss": 10.0625,
"eval_runtime": 278.9667,
"eval_samples_per_second": 121.043,
"eval_steps_per_second": 3.362,
"step": 138
},
{
"epoch": 0.007574106364428945,
"grad_norm": 1.2550749778747559,
"learning_rate": 9.999335222319094e-06,
"loss": 10.0781,
"step": 139
},
{
"epoch": 0.007574106364428945,
"eval_accuracy": 0.07445636665804858,
"eval_loss": 10.0625,
"eval_runtime": 280.3577,
"eval_samples_per_second": 120.443,
"eval_steps_per_second": 3.346,
"step": 139
},
{
"epoch": 0.007628596338273758,
"grad_norm": 1.2808057069778442,
"learning_rate": 9.99932977332171e-06,
"loss": 10.0781,
"step": 140
},
{
"epoch": 0.007628596338273758,
"eval_accuracy": 0.07457149638626687,
"eval_loss": 10.0546875,
"eval_runtime": 278.8572,
"eval_samples_per_second": 121.091,
"eval_steps_per_second": 3.364,
"step": 140
},
{
"epoch": 0.00768308631211857,
"grad_norm": 1.2947779893875122,
"learning_rate": 9.999324324324326e-06,
"loss": 10.0625,
"step": 141
},
{
"epoch": 0.00768308631211857,
"eval_accuracy": 0.0747105089472184,
"eval_loss": 10.0546875,
"eval_runtime": 278.2341,
"eval_samples_per_second": 121.362,
"eval_steps_per_second": 3.371,
"step": 141
},
{
"epoch": 0.007737576285963383,
"grad_norm": 1.2656594514846802,
"learning_rate": 9.999318875326941e-06,
"loss": 10.0781,
"step": 142
},
{
"epoch": 0.007737576285963383,
"eval_accuracy": 0.07485380594361782,
"eval_loss": 10.046875,
"eval_runtime": 279.1203,
"eval_samples_per_second": 120.976,
"eval_steps_per_second": 3.361,
"step": 142
},
{
"epoch": 0.007792066259808195,
"grad_norm": 1.3355940580368042,
"learning_rate": 9.999313426329556e-06,
"loss": 10.0391,
"step": 143
},
{
"epoch": 0.007792066259808195,
"eval_accuracy": 0.07498022573821908,
"eval_loss": 10.046875,
"eval_runtime": 279.5611,
"eval_samples_per_second": 120.786,
"eval_steps_per_second": 3.355,
"step": 143
},
{
"epoch": 0.007846556233653008,
"grad_norm": 1.2356702089309692,
"learning_rate": 9.999307977332171e-06,
"loss": 10.0703,
"step": 144
},
{
"epoch": 0.007846556233653008,
"eval_accuracy": 0.07511967253249303,
"eval_loss": 10.046875,
"eval_runtime": 279.9704,
"eval_samples_per_second": 120.609,
"eval_steps_per_second": 3.35,
"step": 144
},
{
"epoch": 0.007901046207497821,
"grad_norm": 1.2823630571365356,
"learning_rate": 9.999302528334788e-06,
"loss": 10.0391,
"step": 145
},
{
"epoch": 0.007901046207497821,
"eval_accuracy": 0.07525402432245056,
"eval_loss": 10.0390625,
"eval_runtime": 279.3818,
"eval_samples_per_second": 120.863,
"eval_steps_per_second": 3.357,
"step": 145
},
{
"epoch": 0.007955536181342633,
"grad_norm": 1.2719727754592896,
"learning_rate": 9.999297079337401e-06,
"loss": 10.0469,
"step": 146
},
{
"epoch": 0.007955536181342633,
"eval_accuracy": 0.07538672602578286,
"eval_loss": 10.0390625,
"eval_runtime": 280.0065,
"eval_samples_per_second": 120.594,
"eval_steps_per_second": 3.35,
"step": 146
},
{
"epoch": 0.008010026155187446,
"grad_norm": 1.2450186014175415,
"learning_rate": 9.999291630340018e-06,
"loss": 10.0547,
"step": 147
},
{
"epoch": 0.008010026155187446,
"eval_accuracy": 0.07551505644700279,
"eval_loss": 10.03125,
"eval_runtime": 279.889,
"eval_samples_per_second": 120.644,
"eval_steps_per_second": 3.351,
"step": 147
},
{
"epoch": 0.008064516129032258,
"grad_norm": 1.2184418439865112,
"learning_rate": 9.999286181342633e-06,
"loss": 10.0703,
"step": 148
},
{
"epoch": 0.008064516129032258,
"eval_accuracy": 0.07562749392862206,
"eval_loss": 10.03125,
"eval_runtime": 278.8225,
"eval_samples_per_second": 121.106,
"eval_steps_per_second": 3.364,
"step": 148
},
{
"epoch": 0.008119006102877071,
"grad_norm": 1.241905689239502,
"learning_rate": 9.999280732345248e-06,
"loss": 10.0469,
"step": 149
},
{
"epoch": 0.008119006102877071,
"eval_accuracy": 0.0757480660478147,
"eval_loss": 10.0234375,
"eval_runtime": 279.6408,
"eval_samples_per_second": 120.751,
"eval_steps_per_second": 3.354,
"step": 149
},
{
"epoch": 0.008173496076721883,
"grad_norm": 1.2825335264205933,
"learning_rate": 9.999275283347865e-06,
"loss": 10.0391,
"step": 150
},
{
"epoch": 0.008173496076721883,
"eval_accuracy": 0.07586571432930304,
"eval_loss": 10.0234375,
"eval_runtime": 279.96,
"eval_samples_per_second": 120.614,
"eval_steps_per_second": 3.35,
"step": 150
},
{
"epoch": 0.008227986050566696,
"grad_norm": 1.2704492807388306,
"learning_rate": 9.99926983435048e-06,
"loss": 10.0391,
"step": 151
},
{
"epoch": 0.008227986050566696,
"eval_accuracy": 0.07599531850160207,
"eval_loss": 10.015625,
"eval_runtime": 278.0769,
"eval_samples_per_second": 121.43,
"eval_steps_per_second": 3.373,
"step": 151
},
{
"epoch": 0.008282476024411508,
"grad_norm": 1.2123051881790161,
"learning_rate": 9.999264385353095e-06,
"loss": 10.0391,
"step": 152
},
{
"epoch": 0.008282476024411508,
"eval_accuracy": 0.07611594851857105,
"eval_loss": 10.015625,
"eval_runtime": 279.8596,
"eval_samples_per_second": 120.657,
"eval_steps_per_second": 3.352,
"step": 152
},
{
"epoch": 0.008336965998256321,
"grad_norm": 1.258999228477478,
"learning_rate": 9.999258936355712e-06,
"loss": 10.0391,
"step": 153
},
{
"epoch": 0.008336965998256321,
"eval_accuracy": 0.07623133878678279,
"eval_loss": 10.015625,
"eval_runtime": 279.3191,
"eval_samples_per_second": 120.89,
"eval_steps_per_second": 3.358,
"step": 153
},
{
"epoch": 0.008391455972101133,
"grad_norm": 1.194441795349121,
"learning_rate": 9.999253487358327e-06,
"loss": 10.0469,
"step": 154
},
{
"epoch": 0.008391455972101133,
"eval_accuracy": 0.07634800280607362,
"eval_loss": 10.0078125,
"eval_runtime": 279.5107,
"eval_samples_per_second": 120.808,
"eval_steps_per_second": 3.356,
"step": 154
},
{
"epoch": 0.008445945945945946,
"grad_norm": 1.289880633354187,
"learning_rate": 9.999248038360942e-06,
"loss": 10.0312,
"step": 155
},
{
"epoch": 0.008445945945945946,
"eval_accuracy": 0.07645618480113316,
"eval_loss": 10.0078125,
"eval_runtime": 279.6787,
"eval_samples_per_second": 120.735,
"eval_steps_per_second": 3.354,
"step": 155
},
{
"epoch": 0.008500435919790758,
"grad_norm": 1.318518042564392,
"learning_rate": 9.999242589363559e-06,
"loss": 9.9844,
"step": 156
},
{
"epoch": 0.008500435919790758,
"eval_accuracy": 0.07656841964053529,
"eval_loss": 10.0,
"eval_runtime": 279.7302,
"eval_samples_per_second": 120.713,
"eval_steps_per_second": 3.353,
"step": 156
},
{
"epoch": 0.008554925893635571,
"grad_norm": 1.3092557191848755,
"learning_rate": 9.999237140366174e-06,
"loss": 10.0,
"step": 157
},
{
"epoch": 0.008554925893635571,
"eval_accuracy": 0.0766785701599898,
"eval_loss": 10.0,
"eval_runtime": 279.5756,
"eval_samples_per_second": 120.779,
"eval_steps_per_second": 3.355,
"step": 157
},
{
"epoch": 0.008609415867480383,
"grad_norm": 1.2869155406951904,
"learning_rate": 9.999231691368789e-06,
"loss": 10.0078,
"step": 158
},
{
"epoch": 0.008609415867480383,
"eval_accuracy": 0.07677942808634446,
"eval_loss": 9.9921875,
"eval_runtime": 281.549,
"eval_samples_per_second": 119.933,
"eval_steps_per_second": 3.332,
"step": 158
},
{
"epoch": 0.008663905841325197,
"grad_norm": 1.2500320672988892,
"learning_rate": 9.999226242371405e-06,
"loss": 10.0078,
"step": 159
},
{
"epoch": 0.008663905841325197,
"eval_accuracy": 0.07687990967715302,
"eval_loss": 9.9921875,
"eval_runtime": 280.7605,
"eval_samples_per_second": 120.27,
"eval_steps_per_second": 3.341,
"step": 159
},
{
"epoch": 0.008718395815170008,
"grad_norm": 1.2038897275924683,
"learning_rate": 9.99922079337402e-06,
"loss": 10.0234,
"step": 160
},
{
"epoch": 0.008718395815170008,
"eval_accuracy": 0.07697283560815144,
"eval_loss": 9.9921875,
"eval_runtime": 278.5618,
"eval_samples_per_second": 121.219,
"eval_steps_per_second": 3.367,
"step": 160
},
{
"epoch": 0.008772885789014822,
"grad_norm": 1.2646440267562866,
"learning_rate": 9.999215344376635e-06,
"loss": 9.9922,
"step": 161
},
{
"epoch": 0.008772885789014822,
"eval_accuracy": 0.07707207239676907,
"eval_loss": 9.984375,
"eval_runtime": 279.0044,
"eval_samples_per_second": 121.027,
"eval_steps_per_second": 3.362,
"step": 161
},
{
"epoch": 0.008827375762859633,
"grad_norm": 1.2453374862670898,
"learning_rate": 9.999209895379252e-06,
"loss": 9.9922,
"step": 162
},
{
"epoch": 0.008827375762859633,
"eval_accuracy": 0.07716395616779366,
"eval_loss": 9.984375,
"eval_runtime": 280.4701,
"eval_samples_per_second": 120.394,
"eval_steps_per_second": 3.344,
"step": 162
},
{
"epoch": 0.008881865736704447,
"grad_norm": 1.3001788854599,
"learning_rate": 9.999204446381865e-06,
"loss": 9.9766,
"step": 163
},
{
"epoch": 0.008881865736704447,
"eval_accuracy": 0.07725401615886408,
"eval_loss": 9.9765625,
"eval_runtime": 280.3097,
"eval_samples_per_second": 120.463,
"eval_steps_per_second": 3.346,
"step": 163
},
{
"epoch": 0.008936355710549258,
"grad_norm": 1.2622265815734863,
"learning_rate": 9.999198997384482e-06,
"loss": 9.9922,
"step": 164
},
{
"epoch": 0.008936355710549258,
"eval_accuracy": 0.07734222342109218,
"eval_loss": 9.9765625,
"eval_runtime": 279.8973,
"eval_samples_per_second": 120.641,
"eval_steps_per_second": 3.351,
"step": 164
},
{
"epoch": 0.008990845684394072,
"grad_norm": 1.3013081550598145,
"learning_rate": 9.999193548387097e-06,
"loss": 9.9766,
"step": 165
},
{
"epoch": 0.008990845684394072,
"eval_accuracy": 0.07743468616988,
"eval_loss": 9.96875,
"eval_runtime": 280.2864,
"eval_samples_per_second": 120.473,
"eval_steps_per_second": 3.347,
"step": 165
},
{
"epoch": 0.009045335658238883,
"grad_norm": 1.258622407913208,
"learning_rate": 9.999188099389712e-06,
"loss": 9.9844,
"step": 166
},
{
"epoch": 0.009045335658238883,
"eval_accuracy": 0.07752159073214084,
"eval_loss": 9.96875,
"eval_runtime": 280.7557,
"eval_samples_per_second": 120.272,
"eval_steps_per_second": 3.341,
"step": 166
},
{
"epoch": 0.009099825632083697,
"grad_norm": 1.2523554563522339,
"learning_rate": 9.999182650392329e-06,
"loss": 9.9766,
"step": 167
},
{
"epoch": 0.009099825632083697,
"eval_accuracy": 0.07759642360803831,
"eval_loss": 9.96875,
"eval_runtime": 281.1831,
"eval_samples_per_second": 120.089,
"eval_steps_per_second": 3.336,
"step": 167
},
{
"epoch": 0.009154315605928508,
"grad_norm": 1.2290598154067993,
"learning_rate": 9.999177201394944e-06,
"loss": 9.9844,
"step": 168
},
{
"epoch": 0.009154315605928508,
"eval_accuracy": 0.07767707521045625,
"eval_loss": 9.9609375,
"eval_runtime": 281.1351,
"eval_samples_per_second": 120.11,
"eval_steps_per_second": 3.336,
"step": 168
},
{
"epoch": 0.009208805579773322,
"grad_norm": 1.2900922298431396,
"learning_rate": 9.999171752397559e-06,
"loss": 9.9609,
"step": 169
},
{
"epoch": 0.009208805579773322,
"eval_accuracy": 0.07775179229080108,
"eval_loss": 9.9609375,
"eval_runtime": 280.4797,
"eval_samples_per_second": 120.39,
"eval_steps_per_second": 3.344,
"step": 169
},
{
"epoch": 0.009263295553618133,
"grad_norm": 1.246096134185791,
"learning_rate": 9.999166303400176e-06,
"loss": 9.9766,
"step": 170
},
{
"epoch": 0.009263295553618133,
"eval_accuracy": 0.07782975164662,
"eval_loss": 9.953125,
"eval_runtime": 278.9976,
"eval_samples_per_second": 121.03,
"eval_steps_per_second": 3.362,
"step": 170
},
{
"epoch": 0.009317785527462947,
"grad_norm": 1.2970362901687622,
"learning_rate": 9.99916085440279e-06,
"loss": 9.9531,
"step": 171
},
{
"epoch": 0.009317785527462947,
"eval_accuracy": 0.07790701622912304,
"eval_loss": 9.953125,
"eval_runtime": 281.1285,
"eval_samples_per_second": 120.112,
"eval_steps_per_second": 3.337,
"step": 171
},
{
"epoch": 0.009372275501307759,
"grad_norm": 1.204124093055725,
"learning_rate": 9.999155405405406e-06,
"loss": 9.9922,
"step": 172
},
{
"epoch": 0.009372275501307759,
"eval_accuracy": 0.07798121222948096,
"eval_loss": 9.953125,
"eval_runtime": 280.5802,
"eval_samples_per_second": 120.347,
"eval_steps_per_second": 3.343,
"step": 172
},
{
"epoch": 0.009426765475152572,
"grad_norm": 1.29256010055542,
"learning_rate": 9.999149956408023e-06,
"loss": 9.9531,
"step": 173
},
{
"epoch": 0.009426765475152572,
"eval_accuracy": 0.07805448186541772,
"eval_loss": 9.9453125,
"eval_runtime": 279.9622,
"eval_samples_per_second": 120.613,
"eval_steps_per_second": 3.35,
"step": 173
},
{
"epoch": 0.009481255448997385,
"grad_norm": 1.3079378604888916,
"learning_rate": 9.999144507410638e-06,
"loss": 9.9375,
"step": 174
},
{
"epoch": 0.009481255448997385,
"eval_accuracy": 0.07812670934138066,
"eval_loss": 9.9453125,
"eval_runtime": 281.3535,
"eval_samples_per_second": 120.016,
"eval_steps_per_second": 3.334,
"step": 174
},
{
"epoch": 0.009535745422842197,
"grad_norm": 1.2296249866485596,
"learning_rate": 9.999139058413253e-06,
"loss": 9.9688,
"step": 175
},
{
"epoch": 0.009535745422842197,
"eval_accuracy": 0.07820545031717994,
"eval_loss": 9.9375,
"eval_runtime": 281.384,
"eval_samples_per_second": 120.003,
"eval_steps_per_second": 3.334,
"step": 175
},
{
"epoch": 0.00959023539668701,
"grad_norm": 1.2954541444778442,
"learning_rate": 9.99913360941587e-06,
"loss": 9.9453,
"step": 176
},
{
"epoch": 0.00959023539668701,
"eval_accuracy": 0.07826963000223398,
"eval_loss": 9.9375,
"eval_runtime": 281.0155,
"eval_samples_per_second": 120.161,
"eval_steps_per_second": 3.338,
"step": 176
},
{
"epoch": 0.009644725370531822,
"grad_norm": 1.2820461988449097,
"learning_rate": 9.999128160418484e-06,
"loss": 9.9375,
"step": 177
},
{
"epoch": 0.009644725370531822,
"eval_accuracy": 0.0783392520782624,
"eval_loss": 9.9375,
"eval_runtime": 280.6598,
"eval_samples_per_second": 120.313,
"eval_steps_per_second": 3.342,
"step": 177
},
{
"epoch": 0.009699215344376635,
"grad_norm": 1.278702974319458,
"learning_rate": 9.9991227114211e-06,
"loss": 9.9375,
"step": 178
},
{
"epoch": 0.009699215344376635,
"eval_accuracy": 0.07841200063421225,
"eval_loss": 9.9296875,
"eval_runtime": 279.7691,
"eval_samples_per_second": 120.696,
"eval_steps_per_second": 3.353,
"step": 178
},
{
"epoch": 0.009753705318221447,
"grad_norm": 1.253942847251892,
"learning_rate": 9.999117262423716e-06,
"loss": 9.9453,
"step": 179
},
{
"epoch": 0.009753705318221447,
"eval_accuracy": 0.07850298698970383,
"eval_loss": 9.9296875,
"eval_runtime": 279.9236,
"eval_samples_per_second": 120.629,
"eval_steps_per_second": 3.351,
"step": 179
},
{
"epoch": 0.00980819529206626,
"grad_norm": 1.2406857013702393,
"learning_rate": 9.99911181342633e-06,
"loss": 9.9453,
"step": 180
},
{
"epoch": 0.00980819529206626,
"eval_accuracy": 0.078591744280807,
"eval_loss": 9.921875,
"eval_runtime": 280.2024,
"eval_samples_per_second": 120.509,
"eval_steps_per_second": 3.348,
"step": 180
},
{
"epoch": 0.009862685265911072,
"grad_norm": 1.2425099611282349,
"learning_rate": 9.999106364428946e-06,
"loss": 9.9297,
"step": 181
},
{
"epoch": 0.009862685265911072,
"eval_accuracy": 0.07866854568109945,
"eval_loss": 9.921875,
"eval_runtime": 278.8945,
"eval_samples_per_second": 121.074,
"eval_steps_per_second": 3.363,
"step": 181
},
{
"epoch": 0.009917175239755886,
"grad_norm": 1.2617361545562744,
"learning_rate": 9.999100915431561e-06,
"loss": 9.9375,
"step": 182
},
{
"epoch": 0.009917175239755886,
"eval_accuracy": 0.07874769194133299,
"eval_loss": 9.9140625,
"eval_runtime": 281.1799,
"eval_samples_per_second": 120.09,
"eval_steps_per_second": 3.336,
"step": 182
},
{
"epoch": 0.009971665213600697,
"grad_norm": 1.2626757621765137,
"learning_rate": 9.999095466434176e-06,
"loss": 9.9375,
"step": 183
},
{
"epoch": 0.009971665213600697,
"eval_accuracy": 0.07882507231938868,
"eval_loss": 9.9140625,
"eval_runtime": 280.0578,
"eval_samples_per_second": 120.572,
"eval_steps_per_second": 3.349,
"step": 183
},
{
"epoch": 0.01002615518744551,
"grad_norm": 1.312477469444275,
"learning_rate": 9.999090017436791e-06,
"loss": 9.8984,
"step": 184
},
{
"epoch": 0.01002615518744551,
"eval_accuracy": 0.07890256849299702,
"eval_loss": 9.9140625,
"eval_runtime": 279.5807,
"eval_samples_per_second": 120.777,
"eval_steps_per_second": 3.355,
"step": 184
},
{
"epoch": 0.010080645161290322,
"grad_norm": 1.2198225259780884,
"learning_rate": 9.999084568439408e-06,
"loss": 9.9375,
"step": 185
},
{
"epoch": 0.010080645161290322,
"eval_accuracy": 0.07897815403998669,
"eval_loss": 9.90625,
"eval_runtime": 281.1433,
"eval_samples_per_second": 120.106,
"eval_steps_per_second": 3.336,
"step": 185
},
{
"epoch": 0.010135135135135136,
"grad_norm": 1.2455923557281494,
"learning_rate": 9.999079119442023e-06,
"loss": 9.9297,
"step": 186
},
{
"epoch": 0.010135135135135136,
"eval_accuracy": 0.07905443436029225,
"eval_loss": 9.90625,
"eval_runtime": 280.4858,
"eval_samples_per_second": 120.388,
"eval_steps_per_second": 3.344,
"step": 186
},
{
"epoch": 0.010189625108979947,
"grad_norm": 1.2409745454788208,
"learning_rate": 9.999073670444638e-06,
"loss": 9.9297,
"step": 187
},
{
"epoch": 0.010189625108979947,
"eval_accuracy": 0.07912625655182093,
"eval_loss": 9.8984375,
"eval_runtime": 280.6787,
"eval_samples_per_second": 120.305,
"eval_steps_per_second": 3.342,
"step": 187
},
{
"epoch": 0.01024411508282476,
"grad_norm": 1.2540324926376343,
"learning_rate": 9.999068221447255e-06,
"loss": 9.9141,
"step": 188
},
{
"epoch": 0.01024411508282476,
"eval_accuracy": 0.07919081257242107,
"eval_loss": 9.8984375,
"eval_runtime": 281.0078,
"eval_samples_per_second": 120.164,
"eval_steps_per_second": 3.338,
"step": 188
},
{
"epoch": 0.010298605056669572,
"grad_norm": 1.2172249555587769,
"learning_rate": 9.99906277244987e-06,
"loss": 9.9219,
"step": 189
},
{
"epoch": 0.010298605056669572,
"eval_accuracy": 0.07925064992425089,
"eval_loss": 9.8984375,
"eval_runtime": 280.6094,
"eval_samples_per_second": 120.335,
"eval_steps_per_second": 3.343,
"step": 189
},
{
"epoch": 0.010353095030514386,
"grad_norm": 1.2848864793777466,
"learning_rate": 9.999057323452485e-06,
"loss": 9.8984,
"step": 190
},
{
"epoch": 0.010353095030514386,
"eval_accuracy": 0.07933312530662301,
"eval_loss": 9.890625,
"eval_runtime": 280.1069,
"eval_samples_per_second": 120.55,
"eval_steps_per_second": 3.349,
"step": 190
},
{
"epoch": 0.010407585004359197,
"grad_norm": 1.3001790046691895,
"learning_rate": 9.999051874455102e-06,
"loss": 9.8828,
"step": 191
},
{
"epoch": 0.010407585004359197,
"eval_accuracy": 0.07942619598206223,
"eval_loss": 9.890625,
"eval_runtime": 279.216,
"eval_samples_per_second": 120.935,
"eval_steps_per_second": 3.359,
"step": 191
},
{
"epoch": 0.010462074978204011,
"grad_norm": 1.2808548212051392,
"learning_rate": 9.999046425457717e-06,
"loss": 9.8984,
"step": 192
},
{
"epoch": 0.010462074978204011,
"eval_accuracy": 0.07953382794824668,
"eval_loss": 9.8828125,
"eval_runtime": 279.7965,
"eval_samples_per_second": 120.684,
"eval_steps_per_second": 3.352,
"step": 192
},
{
"epoch": 0.010516564952048823,
"grad_norm": 1.2758278846740723,
"learning_rate": 9.999040976460332e-06,
"loss": 9.8906,
"step": 193
},
{
"epoch": 0.010516564952048823,
"eval_accuracy": 0.07961514537509233,
"eval_loss": 9.8828125,
"eval_runtime": 280.4068,
"eval_samples_per_second": 120.421,
"eval_steps_per_second": 3.345,
"step": 193
},
{
"epoch": 0.010571054925893636,
"grad_norm": 1.2733352184295654,
"learning_rate": 9.999035527462949e-06,
"loss": 9.9062,
"step": 194
},
{
"epoch": 0.010571054925893636,
"eval_accuracy": 0.07969660754637879,
"eval_loss": 9.8828125,
"eval_runtime": 280.0801,
"eval_samples_per_second": 120.562,
"eval_steps_per_second": 3.349,
"step": 194
},
{
"epoch": 0.010625544899738448,
"grad_norm": 1.3207552433013916,
"learning_rate": 9.999030078465564e-06,
"loss": 9.875,
"step": 195
},
{
"epoch": 0.010625544899738448,
"eval_accuracy": 0.07977876449098113,
"eval_loss": 9.875,
"eval_runtime": 279.7352,
"eval_samples_per_second": 120.711,
"eval_steps_per_second": 3.353,
"step": 195
},
{
"epoch": 0.010680034873583261,
"grad_norm": 1.3311365842819214,
"learning_rate": 9.999024629468179e-06,
"loss": 9.8594,
"step": 196
},
{
"epoch": 0.010680034873583261,
"eval_accuracy": 0.07984216255605482,
"eval_loss": 9.875,
"eval_runtime": 280.0513,
"eval_samples_per_second": 120.574,
"eval_steps_per_second": 3.349,
"step": 196
},
{
"epoch": 0.010734524847428073,
"grad_norm": 1.2795319557189941,
"learning_rate": 9.999019180470794e-06,
"loss": 9.8828,
"step": 197
},
{
"epoch": 0.010734524847428073,
"eval_accuracy": 0.07991207412096483,
"eval_loss": 9.875,
"eval_runtime": 279.4752,
"eval_samples_per_second": 120.823,
"eval_steps_per_second": 3.356,
"step": 197
},
{
"epoch": 0.010789014821272886,
"grad_norm": 1.2289413213729858,
"learning_rate": 9.999013731473409e-06,
"loss": 9.8984,
"step": 198
},
{
"epoch": 0.010789014821272886,
"eval_accuracy": 0.07998430159692778,
"eval_loss": 9.8671875,
"eval_runtime": 279.5837,
"eval_samples_per_second": 120.776,
"eval_steps_per_second": 3.355,
"step": 198
},
{
"epoch": 0.010843504795117698,
"grad_norm": 1.233830451965332,
"learning_rate": 9.999008282476025e-06,
"loss": 9.8906,
"step": 199
},
{
"epoch": 0.010843504795117698,
"eval_accuracy": 0.0800536920818509,
"eval_loss": 9.8671875,
"eval_runtime": 279.9047,
"eval_samples_per_second": 120.637,
"eval_steps_per_second": 3.351,
"step": 199
},
{
"epoch": 0.010897994768962511,
"grad_norm": 1.2242302894592285,
"learning_rate": 9.99900283347864e-06,
"loss": 9.9062,
"step": 200
},
{
"epoch": 0.010897994768962511,
"eval_accuracy": 0.08014829704836268,
"eval_loss": 9.859375,
"eval_runtime": 278.8987,
"eval_samples_per_second": 121.073,
"eval_steps_per_second": 3.363,
"step": 200
},
{
"epoch": 0.010952484742807323,
"grad_norm": 1.2602059841156006,
"learning_rate": 9.998997384481255e-06,
"loss": 9.8672,
"step": 201
},
{
"epoch": 0.010952484742807323,
"eval_accuracy": 0.08021710855552257,
"eval_loss": 9.859375,
"eval_runtime": 279.1708,
"eval_samples_per_second": 120.955,
"eval_steps_per_second": 3.36,
"step": 201
},
{
"epoch": 0.011006974716652136,
"grad_norm": 1.2747395038604736,
"learning_rate": 9.998991935483872e-06,
"loss": 9.8672,
"step": 202
},
{
"epoch": 0.011006974716652136,
"eval_accuracy": 0.08028279358276101,
"eval_loss": 9.859375,
"eval_runtime": 279.6011,
"eval_samples_per_second": 120.768,
"eval_steps_per_second": 3.355,
"step": 202
},
{
"epoch": 0.011061464690496948,
"grad_norm": 1.2887182235717773,
"learning_rate": 9.998986486486487e-06,
"loss": 9.8906,
"step": 203
},
{
"epoch": 0.011061464690496948,
"eval_accuracy": 0.08035612111647408,
"eval_loss": 9.8515625,
"eval_runtime": 278.7511,
"eval_samples_per_second": 121.137,
"eval_steps_per_second": 3.365,
"step": 203
},
{
"epoch": 0.011115954664341761,
"grad_norm": 1.2620630264282227,
"learning_rate": 9.998981037489102e-06,
"loss": 9.8828,
"step": 204
},
{
"epoch": 0.011115954664341761,
"eval_accuracy": 0.08042151665483091,
"eval_loss": 9.8515625,
"eval_runtime": 280.8141,
"eval_samples_per_second": 120.247,
"eval_steps_per_second": 3.34,
"step": 204
},
{
"epoch": 0.011170444638186573,
"grad_norm": 1.204312801361084,
"learning_rate": 9.998975588491719e-06,
"loss": 9.8906,
"step": 205
},
{
"epoch": 0.011170444638186573,
"eval_accuracy": 0.08048413309992423,
"eval_loss": 9.84375,
"eval_runtime": 278.2837,
"eval_samples_per_second": 121.34,
"eval_steps_per_second": 3.371,
"step": 205
},
{
"epoch": 0.011224934612031386,
"grad_norm": 1.2498817443847656,
"learning_rate": 9.998970139494334e-06,
"loss": 9.8828,
"step": 206
},
{
"epoch": 0.011224934612031386,
"eval_accuracy": 0.08053965706741799,
"eval_loss": 9.84375,
"eval_runtime": 280.6201,
"eval_samples_per_second": 120.33,
"eval_steps_per_second": 3.343,
"step": 206
},
{
"epoch": 0.011279424585876198,
"grad_norm": 1.2628120183944702,
"learning_rate": 9.998964690496949e-06,
"loss": 9.8594,
"step": 207
},
{
"epoch": 0.011279424585876198,
"eval_accuracy": 0.08057526419985664,
"eval_loss": 9.84375,
"eval_runtime": 281.1793,
"eval_samples_per_second": 120.091,
"eval_steps_per_second": 3.336,
"step": 207
},
{
"epoch": 0.011333914559721011,
"grad_norm": 1.286372184753418,
"learning_rate": 9.998959241499566e-06,
"loss": 9.875,
"step": 208
},
{
"epoch": 0.011333914559721011,
"eval_accuracy": 0.0806213797786979,
"eval_loss": 9.8359375,
"eval_runtime": 279.1646,
"eval_samples_per_second": 120.957,
"eval_steps_per_second": 3.36,
"step": 208
},
{
"epoch": 0.011388404533565825,
"grad_norm": 1.2716000080108643,
"learning_rate": 9.99895379250218e-06,
"loss": 9.8594,
"step": 209
},
{
"epoch": 0.011388404533565825,
"eval_accuracy": 0.0807168821607427,
"eval_loss": 9.8359375,
"eval_runtime": 279.4887,
"eval_samples_per_second": 120.817,
"eval_steps_per_second": 3.356,
"step": 209
},
{
"epoch": 0.011442894507410636,
"grad_norm": 1.2634248733520508,
"learning_rate": 9.998948343504796e-06,
"loss": 9.8516,
"step": 210
},
{
"epoch": 0.011442894507410636,
"eval_accuracy": 0.08081287667388623,
"eval_loss": 9.828125,
"eval_runtime": 280.2441,
"eval_samples_per_second": 120.491,
"eval_steps_per_second": 3.347,
"step": 210
},
{
"epoch": 0.01149738448125545,
"grad_norm": 1.2706140279769897,
"learning_rate": 9.998942894507413e-06,
"loss": 9.8359,
"step": 211
},
{
"epoch": 0.01149738448125545,
"eval_accuracy": 0.08091327141803031,
"eval_loss": 9.828125,
"eval_runtime": 279.7835,
"eval_samples_per_second": 120.69,
"eval_steps_per_second": 3.353,
"step": 211
},
{
"epoch": 0.011551874455100261,
"grad_norm": 1.3309648036956787,
"learning_rate": 9.998937445510028e-06,
"loss": 9.8281,
"step": 212
},
{
"epoch": 0.011551874455100261,
"eval_accuracy": 0.08096497413228675,
"eval_loss": 9.828125,
"eval_runtime": 279.8407,
"eval_samples_per_second": 120.665,
"eval_steps_per_second": 3.352,
"step": 212
},
{
"epoch": 0.011606364428945075,
"grad_norm": 1.228298544883728,
"learning_rate": 9.998931996512643e-06,
"loss": 9.8516,
"step": 213
},
{
"epoch": 0.011606364428945075,
"eval_accuracy": 0.08100848431119348,
"eval_loss": 9.8203125,
"eval_runtime": 278.724,
"eval_samples_per_second": 121.149,
"eval_steps_per_second": 3.365,
"step": 213
},
{
"epoch": 0.011660854402789887,
"grad_norm": 1.2184377908706665,
"learning_rate": 9.998926547515258e-06,
"loss": 9.8516,
"step": 214
},
{
"epoch": 0.011660854402789887,
"eval_accuracy": 0.0810596080476867,
"eval_loss": 9.8203125,
"eval_runtime": 279.3251,
"eval_samples_per_second": 120.888,
"eval_steps_per_second": 3.358,
"step": 214
},
{
"epoch": 0.0117153443766347,
"grad_norm": 1.311761498451233,
"learning_rate": 9.998921098517873e-06,
"loss": 9.8281,
"step": 215
},
{
"epoch": 0.0117153443766347,
"eval_accuracy": 0.08114680209882913,
"eval_loss": 9.8203125,
"eval_runtime": 279.8598,
"eval_samples_per_second": 120.657,
"eval_steps_per_second": 3.352,
"step": 215
},
{
"epoch": 0.011769834350479512,
"grad_norm": 1.2210040092468262,
"learning_rate": 9.99891564952049e-06,
"loss": 9.8438,
"step": 216
},
{
"epoch": 0.011769834350479512,
"eval_accuracy": 0.08124644417188101,
"eval_loss": 9.8125,
"eval_runtime": 280.305,
"eval_samples_per_second": 120.465,
"eval_steps_per_second": 3.346,
"step": 216
},
{
"epoch": 0.011824324324324325,
"grad_norm": 1.2239922285079956,
"learning_rate": 9.998910200523105e-06,
"loss": 9.8359,
"step": 217
},
{
"epoch": 0.011824324324324325,
"eval_accuracy": 0.08132301398106818,
"eval_loss": 9.8125,
"eval_runtime": 280.4951,
"eval_samples_per_second": 120.384,
"eval_steps_per_second": 3.344,
"step": 217
},
{
"epoch": 0.011878814298169137,
"grad_norm": 1.26374089717865,
"learning_rate": 9.99890475152572e-06,
"loss": 9.8281,
"step": 218
},
{
"epoch": 0.011878814298169137,
"eval_accuracy": 0.08139370716595856,
"eval_loss": 9.8046875,
"eval_runtime": 280.3905,
"eval_samples_per_second": 120.428,
"eval_steps_per_second": 3.345,
"step": 218
},
{
"epoch": 0.01193330427201395,
"grad_norm": 1.255954384803772,
"learning_rate": 9.998899302528335e-06,
"loss": 9.8281,
"step": 219
},
{
"epoch": 0.01193330427201395,
"eval_accuracy": 0.08145580253106498,
"eval_loss": 9.8046875,
"eval_runtime": 278.85,
"eval_samples_per_second": 121.094,
"eval_steps_per_second": 3.364,
"step": 219
},
{
"epoch": 0.011987794245858762,
"grad_norm": 1.2596890926361084,
"learning_rate": 9.998893853530951e-06,
"loss": 9.8281,
"step": 220
},
{
"epoch": 0.011987794245858762,
"eval_accuracy": 0.08151057382746654,
"eval_loss": 9.8046875,
"eval_runtime": 280.1905,
"eval_samples_per_second": 120.514,
"eval_steps_per_second": 3.348,
"step": 220
},
{
"epoch": 0.012042284219703575,
"grad_norm": 1.3286200761795044,
"learning_rate": 9.998888404533566e-06,
"loss": 9.7969,
"step": 221
},
{
"epoch": 0.012042284219703575,
"eval_accuracy": 0.08155735523073553,
"eval_loss": 9.796875,
"eval_runtime": 279.7838,
"eval_samples_per_second": 120.69,
"eval_steps_per_second": 3.353,
"step": 221
},
{
"epoch": 0.012096774193548387,
"grad_norm": 1.2406117916107178,
"learning_rate": 9.998882955536181e-06,
"loss": 9.8281,
"step": 222
},
{
"epoch": 0.012096774193548387,
"eval_accuracy": 0.08160451296955061,
"eval_loss": 9.796875,
"eval_runtime": 279.2237,
"eval_samples_per_second": 120.932,
"eval_steps_per_second": 3.359,
"step": 222
},
{
"epoch": 0.0121512641673932,
"grad_norm": 1.2828328609466553,
"learning_rate": 9.998877506538798e-06,
"loss": 9.8047,
"step": 223
},
{
"epoch": 0.0121512641673932,
"eval_accuracy": 0.08167708783217148,
"eval_loss": 9.7890625,
"eval_runtime": 280.1502,
"eval_samples_per_second": 120.532,
"eval_steps_per_second": 3.348,
"step": 223
},
{
"epoch": 0.012205754141238012,
"grad_norm": 1.3072834014892578,
"learning_rate": 9.998872057541413e-06,
"loss": 9.8047,
"step": 224
},
{
"epoch": 0.012205754141238012,
"eval_accuracy": 0.08175794207680655,
"eval_loss": 9.7890625,
"eval_runtime": 279.8278,
"eval_samples_per_second": 120.671,
"eval_steps_per_second": 3.352,
"step": 224
},
{
"epoch": 0.012260244115082825,
"grad_norm": 1.2602524757385254,
"learning_rate": 9.998866608544028e-06,
"loss": 9.8047,
"step": 225
},
{
"epoch": 0.012260244115082825,
"eval_accuracy": 0.0818193426685971,
"eval_loss": 9.7890625,
"eval_runtime": 279.4782,
"eval_samples_per_second": 120.822,
"eval_steps_per_second": 3.356,
"step": 225
},
{
"epoch": 0.012314734088927637,
"grad_norm": 1.2501016855239868,
"learning_rate": 9.998861159546645e-06,
"loss": 9.8047,
"step": 226
},
{
"epoch": 0.012314734088927637,
"eval_accuracy": 0.08185301022552892,
"eval_loss": 9.78125,
"eval_runtime": 279.6131,
"eval_samples_per_second": 120.763,
"eval_steps_per_second": 3.355,
"step": 226
},
{
"epoch": 0.01236922406277245,
"grad_norm": 1.2058792114257812,
"learning_rate": 9.99885571054926e-06,
"loss": 9.8281,
"step": 227
},
{
"epoch": 0.01236922406277245,
"eval_accuracy": 0.08187451924943291,
"eval_loss": 9.78125,
"eval_runtime": 279.6288,
"eval_samples_per_second": 120.757,
"eval_steps_per_second": 3.354,
"step": 227
},
{
"epoch": 0.012423714036617262,
"grad_norm": 1.2502729892730713,
"learning_rate": 9.998850261551875e-06,
"loss": 9.7812,
"step": 228
},
{
"epoch": 0.012423714036617262,
"eval_accuracy": 0.08191142908183882,
"eval_loss": 9.78125,
"eval_runtime": 279.6067,
"eval_samples_per_second": 120.766,
"eval_steps_per_second": 3.355,
"step": 228
},
{
"epoch": 0.012478204010462075,
"grad_norm": 1.2606781721115112,
"learning_rate": 9.998844812554492e-06,
"loss": 9.7891,
"step": 229
},
{
"epoch": 0.012478204010462075,
"eval_accuracy": 0.08197398762915582,
"eval_loss": 9.7734375,
"eval_runtime": 279.4306,
"eval_samples_per_second": 120.842,
"eval_steps_per_second": 3.357,
"step": 229
},
{
"epoch": 0.012532693984306887,
"grad_norm": 1.248340368270874,
"learning_rate": 9.998839363557107e-06,
"loss": 9.7969,
"step": 230
},
{
"epoch": 0.012532693984306887,
"eval_accuracy": 0.08206019741810078,
"eval_loss": 9.7734375,
"eval_runtime": 278.547,
"eval_samples_per_second": 121.226,
"eval_steps_per_second": 3.367,
"step": 230
},
{
"epoch": 0.0125871839581517,
"grad_norm": 1.3996437788009644,
"learning_rate": 9.998833914559722e-06,
"loss": 9.7578,
"step": 231
},
{
"epoch": 0.0125871839581517,
"eval_accuracy": 0.08214620456482859,
"eval_loss": 9.765625,
"eval_runtime": 279.48,
"eval_samples_per_second": 120.821,
"eval_steps_per_second": 3.356,
"step": 231
},
{
"epoch": 0.012641673931996512,
"grad_norm": 1.2275718450546265,
"learning_rate": 9.998828465562337e-06,
"loss": 9.8125,
"step": 232
},
{
"epoch": 0.012641673931996512,
"eval_accuracy": 0.08221081848320505,
"eval_loss": 9.765625,
"eval_runtime": 280.1352,
"eval_samples_per_second": 120.538,
"eval_steps_per_second": 3.348,
"step": 232
},
{
"epoch": 0.012696163905841325,
"grad_norm": 1.294257640838623,
"learning_rate": 9.998823016564952e-06,
"loss": 9.7734,
"step": 233
},
{
"epoch": 0.012696163905841325,
"eval_accuracy": 0.0822595105130927,
"eval_loss": 9.765625,
"eval_runtime": 278.3308,
"eval_samples_per_second": 121.32,
"eval_steps_per_second": 3.37,
"step": 233
},
{
"epoch": 0.012750653879686137,
"grad_norm": 1.302538275718689,
"learning_rate": 9.998817567567569e-06,
"loss": 9.7656,
"step": 234
},
{
"epoch": 0.012750653879686137,
"eval_accuracy": 0.0823088394185199,
"eval_loss": 9.7578125,
"eval_runtime": 279.0657,
"eval_samples_per_second": 121.0,
"eval_steps_per_second": 3.361,
"step": 234
},
{
"epoch": 0.01280514385353095,
"grad_norm": 1.2675853967666626,
"learning_rate": 9.998812118570184e-06,
"loss": 9.7578,
"step": 235
},
{
"epoch": 0.01280514385353095,
"eval_accuracy": 0.0823553023840191,
"eval_loss": 9.7578125,
"eval_runtime": 279.2742,
"eval_samples_per_second": 120.91,
"eval_steps_per_second": 3.359,
"step": 235
},
{
"epoch": 0.012859633827375762,
"grad_norm": 1.2685391902923584,
"learning_rate": 9.998806669572799e-06,
"loss": 9.7891,
"step": 236
},
{
"epoch": 0.012859633827375762,
"eval_accuracy": 0.08240431285167651,
"eval_loss": 9.7578125,
"eval_runtime": 279.3255,
"eval_samples_per_second": 120.888,
"eval_steps_per_second": 3.358,
"step": 236
},
{
"epoch": 0.012914123801220576,
"grad_norm": 1.279767632484436,
"learning_rate": 9.998801220575415e-06,
"loss": 9.7812,
"step": 237
},
{
"epoch": 0.012914123801220576,
"eval_accuracy": 0.08243551975311462,
"eval_loss": 9.75,
"eval_runtime": 278.6385,
"eval_samples_per_second": 121.186,
"eval_steps_per_second": 3.366,
"step": 237
},
{
"epoch": 0.012968613775065387,
"grad_norm": 1.2205240726470947,
"learning_rate": 9.99879577157803e-06,
"loss": 9.7656,
"step": 238
},
{
"epoch": 0.012968613775065387,
"eval_accuracy": 0.08245219431269564,
"eval_loss": 9.75,
"eval_runtime": 278.8375,
"eval_samples_per_second": 121.099,
"eval_steps_per_second": 3.364,
"step": 238
},
{
"epoch": 0.0130231037489102,
"grad_norm": 1.1911712884902954,
"learning_rate": 9.998790322580645e-06,
"loss": 9.7969,
"step": 239
},
{
"epoch": 0.0130231037489102,
"eval_accuracy": 0.08246684245010535,
"eval_loss": 9.75,
"eval_runtime": 279.7565,
"eval_samples_per_second": 120.701,
"eval_steps_per_second": 3.353,
"step": 239
},
{
"epoch": 0.013077593722755012,
"grad_norm": 1.250552773475647,
"learning_rate": 9.998784873583262e-06,
"loss": 9.75,
"step": 240
},
{
"epoch": 0.013077593722755012,
"eval_accuracy": 0.0824862092562854,
"eval_loss": 9.7421875,
"eval_runtime": 279.3878,
"eval_samples_per_second": 120.861,
"eval_steps_per_second": 3.357,
"step": 240
},
{
"epoch": 0.013132083696599826,
"grad_norm": 1.1946778297424316,
"learning_rate": 9.998779424585877e-06,
"loss": 9.7734,
"step": 241
},
{
"epoch": 0.013132083696599826,
"eval_accuracy": 0.08250629978466949,
"eval_loss": 9.7421875,
"eval_runtime": 278.1923,
"eval_samples_per_second": 121.38,
"eval_steps_per_second": 3.372,
"step": 241
},
{
"epoch": 0.013186573670444637,
"grad_norm": 1.2602076530456543,
"learning_rate": 9.998773975588492e-06,
"loss": 9.7578,
"step": 242
},
{
"epoch": 0.013186573670444637,
"eval_accuracy": 0.08251411598447309,
"eval_loss": 9.734375,
"eval_runtime": 278.443,
"eval_samples_per_second": 121.271,
"eval_steps_per_second": 3.369,
"step": 242
},
{
"epoch": 0.01324106364428945,
"grad_norm": 1.2365894317626953,
"learning_rate": 9.998768526591109e-06,
"loss": 9.7656,
"step": 243
},
{
"epoch": 0.01324106364428945,
"eval_accuracy": 0.08253744878833126,
"eval_loss": 9.734375,
"eval_runtime": 278.0592,
"eval_samples_per_second": 121.438,
"eval_steps_per_second": 3.373,
"step": 243
},
{
"epoch": 0.013295553618134264,
"grad_norm": 1.2928210496902466,
"learning_rate": 9.998763077593724e-06,
"loss": 9.7266,
"step": 244
},
{
"epoch": 0.013295553618134264,
"eval_accuracy": 0.0825691478208681,
"eval_loss": 9.734375,
"eval_runtime": 277.8043,
"eval_samples_per_second": 121.55,
"eval_steps_per_second": 3.376,
"step": 244
},
{
"epoch": 0.013350043591979076,
"grad_norm": 1.2674510478973389,
"learning_rate": 9.998757628596339e-06,
"loss": 9.75,
"step": 245
},
{
"epoch": 0.013350043591979076,
"eval_accuracy": 0.08259832830013489,
"eval_loss": 9.7265625,
"eval_runtime": 278.1561,
"eval_samples_per_second": 121.396,
"eval_steps_per_second": 3.372,
"step": 245
},
{
"epoch": 0.01340453356582389,
"grad_norm": 1.2813974618911743,
"learning_rate": 9.998752179598956e-06,
"loss": 9.7422,
"step": 246
},
{
"epoch": 0.01340453356582389,
"eval_accuracy": 0.08266126318299799,
"eval_loss": 9.7265625,
"eval_runtime": 280.0345,
"eval_samples_per_second": 120.582,
"eval_steps_per_second": 3.35,
"step": 246
},
{
"epoch": 0.0134590235396687,
"grad_norm": 1.2468206882476807,
"learning_rate": 9.99874673060157e-06,
"loss": 9.75,
"step": 247
},
{
"epoch": 0.0134590235396687,
"eval_accuracy": 0.08272399542364396,
"eval_loss": 9.7265625,
"eval_runtime": 277.7706,
"eval_samples_per_second": 121.564,
"eval_steps_per_second": 3.377,
"step": 247
},
{
"epoch": 0.013513513513513514,
"grad_norm": 1.1757469177246094,
"learning_rate": 9.998741281604186e-06,
"loss": 9.7656,
"step": 248
},
{
"epoch": 0.013513513513513514,
"eval_accuracy": 0.08277827458894678,
"eval_loss": 9.71875,
"eval_runtime": 278.8949,
"eval_samples_per_second": 121.074,
"eval_steps_per_second": 3.363,
"step": 248
},
{
"epoch": 0.013568003487358326,
"grad_norm": 1.2581287622451782,
"learning_rate": 9.998735832606801e-06,
"loss": 9.7266,
"step": 249
},
{
"epoch": 0.013568003487358326,
"eval_accuracy": 0.08281637132576732,
"eval_loss": 9.71875,
"eval_runtime": 277.5935,
"eval_samples_per_second": 121.642,
"eval_steps_per_second": 3.379,
"step": 249
},
{
"epoch": 0.01362249346120314,
"grad_norm": 1.2253607511520386,
"learning_rate": 9.998730383609416e-06,
"loss": 9.75,
"step": 250
},
{
"epoch": 0.01362249346120314,
"eval_accuracy": 0.08284589919169204,
"eval_loss": 9.7109375,
"eval_runtime": 277.5385,
"eval_samples_per_second": 121.666,
"eval_steps_per_second": 3.38,
"step": 250
},
{
"epoch": 0.013676983435047951,
"grad_norm": 1.2590970993041992,
"learning_rate": 9.998724934612033e-06,
"loss": 9.7266,
"step": 251
},
{
"epoch": 0.013676983435047951,
"eval_accuracy": 0.08286613446451693,
"eval_loss": 9.7109375,
"eval_runtime": 278.6492,
"eval_samples_per_second": 121.181,
"eval_steps_per_second": 3.366,
"step": 251
},
{
"epoch": 0.013731473408892764,
"grad_norm": 1.2588908672332764,
"learning_rate": 9.998719485614648e-06,
"loss": 9.7266,
"step": 252
},
{
"epoch": 0.013731473408892764,
"eval_accuracy": 0.08288451700849948,
"eval_loss": 9.7109375,
"eval_runtime": 278.1724,
"eval_samples_per_second": 121.389,
"eval_steps_per_second": 3.372,
"step": 252
},
{
"epoch": 0.013785963382737576,
"grad_norm": 1.2583962678909302,
"learning_rate": 9.998714036617263e-06,
"loss": 9.7266,
"step": 253
},
{
"epoch": 0.013785963382737576,
"eval_accuracy": 0.0829049549235415,
"eval_loss": 9.703125,
"eval_runtime": 277.9162,
"eval_samples_per_second": 121.501,
"eval_steps_per_second": 3.375,
"step": 253
},
{
"epoch": 0.01384045335658239,
"grad_norm": 1.2444450855255127,
"learning_rate": 9.998708587619878e-06,
"loss": 9.7266,
"step": 254
},
{
"epoch": 0.01384045335658239,
"eval_accuracy": 0.08292394539417544,
"eval_loss": 9.703125,
"eval_runtime": 279.5035,
"eval_samples_per_second": 120.811,
"eval_steps_per_second": 3.356,
"step": 254
},
{
"epoch": 0.013894943330427201,
"grad_norm": 1.201521396636963,
"learning_rate": 9.998703138622494e-06,
"loss": 9.7344,
"step": 255
},
{
"epoch": 0.013894943330427201,
"eval_accuracy": 0.08293364327170955,
"eval_loss": 9.703125,
"eval_runtime": 278.5449,
"eval_samples_per_second": 121.226,
"eval_steps_per_second": 3.368,
"step": 255
},
{
"epoch": 0.013949433304272014,
"grad_norm": 1.2879900932312012,
"learning_rate": 9.99869768962511e-06,
"loss": 9.7109,
"step": 256
},
{
"epoch": 0.013949433304272014,
"eval_accuracy": 0.08292825877851151,
"eval_loss": 9.6953125,
"eval_runtime": 278.9219,
"eval_samples_per_second": 121.063,
"eval_steps_per_second": 3.363,
"step": 256
},
{
"epoch": 0.014003923278116826,
"grad_norm": 1.2371270656585693,
"learning_rate": 9.998692240627725e-06,
"loss": 9.7109,
"step": 257
},
{
"epoch": 0.014003923278116826,
"eval_accuracy": 0.08293071943400523,
"eval_loss": 9.6953125,
"eval_runtime": 280.4149,
"eval_samples_per_second": 120.418,
"eval_steps_per_second": 3.345,
"step": 257
},
{
"epoch": 0.01405841325196164,
"grad_norm": 1.2438043355941772,
"learning_rate": 9.998686791630341e-06,
"loss": 9.7109,
"step": 258
},
{
"epoch": 0.01405841325196164,
"eval_accuracy": 0.08297243478184596,
"eval_loss": 9.6953125,
"eval_runtime": 279.9479,
"eval_samples_per_second": 120.619,
"eval_steps_per_second": 3.351,
"step": 258
},
{
"epoch": 0.014112903225806451,
"grad_norm": 1.2312678098678589,
"learning_rate": 9.998681342632956e-06,
"loss": 9.7031,
"step": 259
},
{
"epoch": 0.014112903225806451,
"eval_accuracy": 0.08300974989868613,
"eval_loss": 9.6875,
"eval_runtime": 278.9806,
"eval_samples_per_second": 121.037,
"eval_steps_per_second": 3.362,
"step": 259
},
{
"epoch": 0.014167393199651265,
"grad_norm": 1.289109706878662,
"learning_rate": 9.998675893635571e-06,
"loss": 9.7109,
"step": 260
},
{
"epoch": 0.014167393199651265,
"eval_accuracy": 0.0830896198811237,
"eval_loss": 9.6875,
"eval_runtime": 280.8942,
"eval_samples_per_second": 120.213,
"eval_steps_per_second": 3.339,
"step": 260
},
{
"epoch": 0.014221883173496076,
"grad_norm": 1.2733935117721558,
"learning_rate": 9.998670444638188e-06,
"loss": 9.6953,
"step": 261
},
{
"epoch": 0.014221883173496076,
"eval_accuracy": 0.08316147102154055,
"eval_loss": 9.6796875,
"eval_runtime": 279.4498,
"eval_samples_per_second": 120.834,
"eval_steps_per_second": 3.357,
"step": 261
},
{
"epoch": 0.01427637314734089,
"grad_norm": 1.2618088722229004,
"learning_rate": 9.998664995640803e-06,
"loss": 9.7031,
"step": 262
},
{
"epoch": 0.01427637314734089,
"eval_accuracy": 0.08321233421804031,
"eval_loss": 9.6796875,
"eval_runtime": 278.1291,
"eval_samples_per_second": 121.408,
"eval_steps_per_second": 3.373,
"step": 262
},
{
"epoch": 0.014330863121185701,
"grad_norm": 1.2773399353027344,
"learning_rate": 9.998659546643418e-06,
"loss": 9.6953,
"step": 263
},
{
"epoch": 0.014330863121185701,
"eval_accuracy": 0.08324872297045931,
"eval_loss": 9.6796875,
"eval_runtime": 278.6034,
"eval_samples_per_second": 121.201,
"eval_steps_per_second": 3.367,
"step": 263
},
{
"epoch": 0.014385353095030515,
"grad_norm": 1.246199131011963,
"learning_rate": 9.998654097646035e-06,
"loss": 9.6875,
"step": 264
},
{
"epoch": 0.014385353095030515,
"eval_accuracy": 0.08326855295884994,
"eval_loss": 9.671875,
"eval_runtime": 278.4105,
"eval_samples_per_second": 121.285,
"eval_steps_per_second": 3.369,
"step": 264
},
{
"epoch": 0.014439843068875326,
"grad_norm": 1.2552728652954102,
"learning_rate": 9.998648648648648e-06,
"loss": 9.6719,
"step": 265
},
{
"epoch": 0.014439843068875326,
"eval_accuracy": 0.08325928931463826,
"eval_loss": 9.671875,
"eval_runtime": 279.4834,
"eval_samples_per_second": 120.819,
"eval_steps_per_second": 3.356,
"step": 265
},
{
"epoch": 0.01449433304272014,
"grad_norm": 1.2612943649291992,
"learning_rate": 9.998643199651265e-06,
"loss": 9.6797,
"step": 266
},
{
"epoch": 0.01449433304272014,
"eval_accuracy": 0.08324947564155151,
"eval_loss": 9.671875,
"eval_runtime": 277.8081,
"eval_samples_per_second": 121.548,
"eval_steps_per_second": 3.376,
"step": 266
},
{
"epoch": 0.014548823016564951,
"grad_norm": 1.1741077899932861,
"learning_rate": 9.99863775065388e-06,
"loss": 9.7188,
"step": 267
},
{
"epoch": 0.014548823016564951,
"eval_accuracy": 0.08325022831264371,
"eval_loss": 9.6640625,
"eval_runtime": 278.7089,
"eval_samples_per_second": 121.155,
"eval_steps_per_second": 3.366,
"step": 267
},
{
"epoch": 0.014603312990409765,
"grad_norm": 1.2685978412628174,
"learning_rate": 9.998632301656495e-06,
"loss": 9.6953,
"step": 268
},
{
"epoch": 0.014603312990409765,
"eval_accuracy": 0.08326976881215271,
"eval_loss": 9.6640625,
"eval_runtime": 278.6178,
"eval_samples_per_second": 121.195,
"eval_steps_per_second": 3.367,
"step": 268
},
{
"epoch": 0.014657802964254577,
"grad_norm": 1.329729437828064,
"learning_rate": 9.998626852659112e-06,
"loss": 9.6797,
"step": 269
},
{
"epoch": 0.014657802964254577,
"eval_accuracy": 0.08330664969567048,
"eval_loss": 9.6640625,
"eval_runtime": 276.7799,
"eval_samples_per_second": 121.999,
"eval_steps_per_second": 3.389,
"step": 269
},
{
"epoch": 0.01471229293809939,
"grad_norm": 1.2404329776763916,
"learning_rate": 9.998621403661727e-06,
"loss": 9.6719,
"step": 270
},
{
"epoch": 0.01471229293809939,
"eval_accuracy": 0.08336324477202621,
"eval_loss": 9.65625,
"eval_runtime": 277.6213,
"eval_samples_per_second": 121.63,
"eval_steps_per_second": 3.379,
"step": 270
},
{
"epoch": 0.014766782911944202,
"grad_norm": 1.2414337396621704,
"learning_rate": 9.998615954664342e-06,
"loss": 9.6875,
"step": 271
},
{
"epoch": 0.014766782911944202,
"eval_accuracy": 0.0834355011968773,
"eval_loss": 9.65625,
"eval_runtime": 278.0197,
"eval_samples_per_second": 121.455,
"eval_steps_per_second": 3.374,
"step": 271
},
{
"epoch": 0.014821272885789015,
"grad_norm": 1.241061806678772,
"learning_rate": 9.998610505666959e-06,
"loss": 9.6641,
"step": 272
},
{
"epoch": 0.014821272885789015,
"eval_accuracy": 0.083517542345927,
"eval_loss": 9.6484375,
"eval_runtime": 278.7783,
"eval_samples_per_second": 121.125,
"eval_steps_per_second": 3.365,
"step": 272
},
{
"epoch": 0.014875762859633827,
"grad_norm": 1.2449415922164917,
"learning_rate": 9.998605056669574e-06,
"loss": 9.6719,
"step": 273
},
{
"epoch": 0.014875762859633827,
"eval_accuracy": 0.08358869871302796,
"eval_loss": 9.6484375,
"eval_runtime": 278.069,
"eval_samples_per_second": 121.434,
"eval_steps_per_second": 3.373,
"step": 273
},
{
"epoch": 0.01493025283347864,
"grad_norm": 1.2785589694976807,
"learning_rate": 9.998599607672189e-06,
"loss": 9.6719,
"step": 274
},
{
"epoch": 0.01493025283347864,
"eval_accuracy": 0.08364344106054136,
"eval_loss": 9.6484375,
"eval_runtime": 278.0063,
"eval_samples_per_second": 121.461,
"eval_steps_per_second": 3.374,
"step": 274
},
{
"epoch": 0.014984742807323452,
"grad_norm": 1.2922983169555664,
"learning_rate": 9.998594158674805e-06,
"loss": 9.6406,
"step": 275
},
{
"epoch": 0.014984742807323452,
"eval_accuracy": 0.08369166990821841,
"eval_loss": 9.640625,
"eval_runtime": 278.9804,
"eval_samples_per_second": 121.037,
"eval_steps_per_second": 3.362,
"step": 275
},
{
"epoch": 0.015039232781168265,
"grad_norm": 1.2422248125076294,
"learning_rate": 9.99858870967742e-06,
"loss": 9.6641,
"step": 276
},
{
"epoch": 0.015039232781168265,
"eval_accuracy": 0.08373764074261889,
"eval_loss": 9.640625,
"eval_runtime": 279.0599,
"eval_samples_per_second": 121.003,
"eval_steps_per_second": 3.361,
"step": 276
},
{
"epoch": 0.015093722755013077,
"grad_norm": 1.3546416759490967,
"learning_rate": 9.998583260680035e-06,
"loss": 9.6328,
"step": 277
},
{
"epoch": 0.015093722755013077,
"eval_accuracy": 0.08375952610206898,
"eval_loss": 9.640625,
"eval_runtime": 279.3312,
"eval_samples_per_second": 120.885,
"eval_steps_per_second": 3.358,
"step": 277
},
{
"epoch": 0.01514821272885789,
"grad_norm": 1.3069231510162354,
"learning_rate": 9.998577811682652e-06,
"loss": 9.6328,
"step": 278
},
{
"epoch": 0.01514821272885789,
"eval_accuracy": 0.08376647383522773,
"eval_loss": 9.6328125,
"eval_runtime": 277.9079,
"eval_samples_per_second": 121.504,
"eval_steps_per_second": 3.375,
"step": 278
},
{
"epoch": 0.015202702702702704,
"grad_norm": 1.2460886240005493,
"learning_rate": 9.998572362685267e-06,
"loss": 9.6484,
"step": 279
},
{
"epoch": 0.015202702702702704,
"eval_accuracy": 0.08376496849304334,
"eval_loss": 9.6328125,
"eval_runtime": 277.6965,
"eval_samples_per_second": 121.597,
"eval_steps_per_second": 3.378,
"step": 279
},
{
"epoch": 0.015257192676547515,
"grad_norm": 1.2432016134262085,
"learning_rate": 9.998566913687882e-06,
"loss": 9.6484,
"step": 280
},
{
"epoch": 0.015257192676547515,
"eval_accuracy": 0.08377012139513608,
"eval_loss": 9.6328125,
"eval_runtime": 278.8575,
"eval_samples_per_second": 121.091,
"eval_steps_per_second": 3.364,
"step": 280
},
{
"epoch": 0.015311682650392329,
"grad_norm": 1.1862634420394897,
"learning_rate": 9.998561464690499e-06,
"loss": 9.6875,
"step": 281
},
{
"epoch": 0.015311682650392329,
"eval_accuracy": 0.08376899238849779,
"eval_loss": 9.625,
"eval_runtime": 277.3994,
"eval_samples_per_second": 121.727,
"eval_steps_per_second": 3.381,
"step": 281
},
{
"epoch": 0.01536617262423714,
"grad_norm": 1.2821177244186401,
"learning_rate": 9.998556015693112e-06,
"loss": 9.6328,
"step": 282
},
{
"epoch": 0.01536617262423714,
"eval_accuracy": 0.08377649015053161,
"eval_loss": 9.625,
"eval_runtime": 277.0927,
"eval_samples_per_second": 121.862,
"eval_steps_per_second": 3.385,
"step": 282
},
{
"epoch": 0.015420662598081954,
"grad_norm": 1.2484102249145508,
"learning_rate": 9.998550566695729e-06,
"loss": 9.6562,
"step": 283
},
{
"epoch": 0.015420662598081954,
"eval_accuracy": 0.08380494690759437,
"eval_loss": 9.6171875,
"eval_runtime": 277.7775,
"eval_samples_per_second": 121.561,
"eval_steps_per_second": 3.377,
"step": 283
},
{
"epoch": 0.015475152571926765,
"grad_norm": 1.188653588294983,
"learning_rate": 9.998545117698344e-06,
"loss": 9.6719,
"step": 284
},
{
"epoch": 0.015475152571926765,
"eval_accuracy": 0.08383511164905864,
"eval_loss": 9.6171875,
"eval_runtime": 278.6388,
"eval_samples_per_second": 121.186,
"eval_steps_per_second": 3.366,
"step": 284
},
{
"epoch": 0.015529642545771579,
"grad_norm": 1.2269041538238525,
"learning_rate": 9.998539668700959e-06,
"loss": 9.6641,
"step": 285
},
{
"epoch": 0.015529642545771579,
"eval_accuracy": 0.08384709648875752,
"eval_loss": 9.6171875,
"eval_runtime": 278.3677,
"eval_samples_per_second": 121.304,
"eval_steps_per_second": 3.37,
"step": 285
},
{
"epoch": 0.01558413251961639,
"grad_norm": 1.266385555267334,
"learning_rate": 9.998534219703576e-06,
"loss": 9.6328,
"step": 286
},
{
"epoch": 0.01558413251961639,
"eval_accuracy": 0.08384223307554638,
"eval_loss": 9.609375,
"eval_runtime": 278.3739,
"eval_samples_per_second": 121.301,
"eval_steps_per_second": 3.37,
"step": 286
},
{
"epoch": 0.015638622493461204,
"grad_norm": 1.28980553150177,
"learning_rate": 9.998528770706191e-06,
"loss": 9.6328,
"step": 287
},
{
"epoch": 0.015638622493461204,
"eval_accuracy": 0.08388331154784755,
"eval_loss": 9.609375,
"eval_runtime": 279.4156,
"eval_samples_per_second": 120.849,
"eval_steps_per_second": 3.357,
"step": 287
},
{
"epoch": 0.015693112467306015,
"grad_norm": 1.2614293098449707,
"learning_rate": 9.998523321708806e-06,
"loss": 9.625,
"step": 288
},
{
"epoch": 0.015693112467306015,
"eval_accuracy": 0.08391469214261461,
"eval_loss": 9.609375,
"eval_runtime": 278.4116,
"eval_samples_per_second": 121.284,
"eval_steps_per_second": 3.369,
"step": 288
},
{
"epoch": 0.015747602441150827,
"grad_norm": 1.226367473602295,
"learning_rate": 9.998517872711423e-06,
"loss": 9.6328,
"step": 289
},
{
"epoch": 0.015747602441150827,
"eval_accuracy": 0.08397444264777995,
"eval_loss": 9.6015625,
"eval_runtime": 277.9542,
"eval_samples_per_second": 121.484,
"eval_steps_per_second": 3.375,
"step": 289
},
{
"epoch": 0.015802092414995642,
"grad_norm": 1.2424851655960083,
"learning_rate": 9.998512423714038e-06,
"loss": 9.6172,
"step": 290
},
{
"epoch": 0.015802092414995642,
"eval_accuracy": 0.08402672433979962,
"eval_loss": 9.6015625,
"eval_runtime": 277.2733,
"eval_samples_per_second": 121.782,
"eval_steps_per_second": 3.383,
"step": 290
},
{
"epoch": 0.015856582388840454,
"grad_norm": 1.2392754554748535,
"learning_rate": 9.998506974716653e-06,
"loss": 9.6172,
"step": 291
},
{
"epoch": 0.015856582388840454,
"eval_accuracy": 0.08409255411147887,
"eval_loss": 9.6015625,
"eval_runtime": 276.0303,
"eval_samples_per_second": 122.331,
"eval_steps_per_second": 3.398,
"step": 291
},
{
"epoch": 0.015911072362685266,
"grad_norm": 1.2615457773208618,
"learning_rate": 9.998501525719268e-06,
"loss": 9.6094,
"step": 292
},
{
"epoch": 0.015911072362685266,
"eval_accuracy": 0.08412827703947016,
"eval_loss": 9.59375,
"eval_runtime": 278.5317,
"eval_samples_per_second": 121.232,
"eval_steps_per_second": 3.368,
"step": 292
},
{
"epoch": 0.015965562336530077,
"grad_norm": 1.2395169734954834,
"learning_rate": 9.998496076721884e-06,
"loss": 9.6172,
"step": 293
},
{
"epoch": 0.015965562336530077,
"eval_accuracy": 0.0841603524075531,
"eval_loss": 9.59375,
"eval_runtime": 278.5884,
"eval_samples_per_second": 121.207,
"eval_steps_per_second": 3.367,
"step": 293
},
{
"epoch": 0.016020052310374892,
"grad_norm": 1.2416415214538574,
"learning_rate": 9.9984906277245e-06,
"loss": 9.6094,
"step": 294
},
{
"epoch": 0.016020052310374892,
"eval_accuracy": 0.08417905338930542,
"eval_loss": 9.59375,
"eval_runtime": 279.278,
"eval_samples_per_second": 120.908,
"eval_steps_per_second": 3.359,
"step": 294
},
{
"epoch": 0.016074542284219704,
"grad_norm": 1.2306995391845703,
"learning_rate": 9.998485178727115e-06,
"loss": 9.6328,
"step": 295
},
{
"epoch": 0.016074542284219704,
"eval_accuracy": 0.08419016976235945,
"eval_loss": 9.5859375,
"eval_runtime": 279.0203,
"eval_samples_per_second": 121.02,
"eval_steps_per_second": 3.362,
"step": 295
},
{
"epoch": 0.016129032258064516,
"grad_norm": 1.2327263355255127,
"learning_rate": 9.998479729729731e-06,
"loss": 9.5938,
"step": 296
},
{
"epoch": 0.016129032258064516,
"eval_accuracy": 0.08418545109358912,
"eval_loss": 9.5859375,
"eval_runtime": 279.0896,
"eval_samples_per_second": 120.99,
"eval_steps_per_second": 3.361,
"step": 296
},
{
"epoch": 0.016183522231909327,
"grad_norm": 1.322943091392517,
"learning_rate": 9.998474280732346e-06,
"loss": 9.5938,
"step": 297
},
{
"epoch": 0.016183522231909327,
"eval_accuracy": 0.08417552162494972,
"eval_loss": 9.578125,
"eval_runtime": 278.5612,
"eval_samples_per_second": 121.219,
"eval_steps_per_second": 3.367,
"step": 297
},
{
"epoch": 0.016238012205754142,
"grad_norm": 1.264687180519104,
"learning_rate": 9.998468831734961e-06,
"loss": 9.6016,
"step": 298
},
{
"epoch": 0.016238012205754142,
"eval_accuracy": 0.08418681169133271,
"eval_loss": 9.578125,
"eval_runtime": 276.7181,
"eval_samples_per_second": 122.027,
"eval_steps_per_second": 3.39,
"step": 298
},
{
"epoch": 0.016292502179598954,
"grad_norm": 1.2931830883026123,
"learning_rate": 9.998463382737576e-06,
"loss": 9.5781,
"step": 299
},
{
"epoch": 0.016292502179598954,
"eval_accuracy": 0.08422736908364697,
"eval_loss": 9.578125,
"eval_runtime": 278.6618,
"eval_samples_per_second": 121.176,
"eval_steps_per_second": 3.366,
"step": 299
},
{
"epoch": 0.016346992153443766,
"grad_norm": 1.247689962387085,
"learning_rate": 9.998457933740191e-06,
"loss": 9.5938,
"step": 300
},
{
"epoch": 0.016346992153443766,
"eval_accuracy": 0.08427264514473155,
"eval_loss": 9.5703125,
"eval_runtime": 278.3198,
"eval_samples_per_second": 121.324,
"eval_steps_per_second": 3.37,
"step": 300
},
{
"epoch": 0.016401482127288577,
"grad_norm": 1.269108772277832,
"learning_rate": 9.998452484742808e-06,
"loss": 9.5938,
"step": 301
},
{
"epoch": 0.016401482127288577,
"eval_accuracy": 0.08434733327618822,
"eval_loss": 9.5703125,
"eval_runtime": 277.9862,
"eval_samples_per_second": 121.47,
"eval_steps_per_second": 3.374,
"step": 301
},
{
"epoch": 0.016455972101133393,
"grad_norm": 1.2147337198257446,
"learning_rate": 9.998447035745423e-06,
"loss": 9.6016,
"step": 302
},
{
"epoch": 0.016455972101133393,
"eval_accuracy": 0.08440334937478074,
"eval_loss": 9.5703125,
"eval_runtime": 278.4928,
"eval_samples_per_second": 121.249,
"eval_steps_per_second": 3.368,
"step": 302
},
{
"epoch": 0.016510462074978204,
"grad_norm": 1.2472692728042603,
"learning_rate": 9.998441586748038e-06,
"loss": 9.5781,
"step": 303
},
{
"epoch": 0.016510462074978204,
"eval_accuracy": 0.08445693376676766,
"eval_loss": 9.5625,
"eval_runtime": 277.2562,
"eval_samples_per_second": 121.79,
"eval_steps_per_second": 3.383,
"step": 303
},
{
"epoch": 0.016564952048823016,
"grad_norm": 1.2579458951950073,
"learning_rate": 9.998436137750655e-06,
"loss": 9.6016,
"step": 304
},
{
"epoch": 0.016564952048823016,
"eval_accuracy": 0.08448973285705465,
"eval_loss": 9.5625,
"eval_runtime": 277.2268,
"eval_samples_per_second": 121.803,
"eval_steps_per_second": 3.384,
"step": 304
},
{
"epoch": 0.016619442022667828,
"grad_norm": 1.2708834409713745,
"learning_rate": 9.99843068875327e-06,
"loss": 9.5703,
"step": 305
},
{
"epoch": 0.016619442022667828,
"eval_accuracy": 0.084512920916472,
"eval_loss": 9.5625,
"eval_runtime": 277.9135,
"eval_samples_per_second": 121.502,
"eval_steps_per_second": 3.375,
"step": 305
},
{
"epoch": 0.016673931996512643,
"grad_norm": 1.2249526977539062,
"learning_rate": 9.998425239755885e-06,
"loss": 9.5781,
"step": 306
},
{
"epoch": 0.016673931996512643,
"eval_accuracy": 0.08454019076912014,
"eval_loss": 9.5546875,
"eval_runtime": 277.7743,
"eval_samples_per_second": 121.563,
"eval_steps_per_second": 3.377,
"step": 306
},
{
"epoch": 0.016728421970357454,
"grad_norm": 1.214102029800415,
"learning_rate": 9.998419790758502e-06,
"loss": 9.5938,
"step": 307
},
{
"epoch": 0.016728421970357454,
"eval_accuracy": 0.08456963178838038,
"eval_loss": 9.5546875,
"eval_runtime": 277.7599,
"eval_samples_per_second": 121.569,
"eval_steps_per_second": 3.377,
"step": 307
},
{
"epoch": 0.016782911944202266,
"grad_norm": 1.2691110372543335,
"learning_rate": 9.998414341761117e-06,
"loss": 9.5391,
"step": 308
},
{
"epoch": 0.016782911944202266,
"eval_accuracy": 0.08458882490123146,
"eval_loss": 9.5546875,
"eval_runtime": 278.1571,
"eval_samples_per_second": 121.395,
"eval_steps_per_second": 3.372,
"step": 308
},
{
"epoch": 0.016837401918047078,
"grad_norm": 1.2669826745986938,
"learning_rate": 9.998408892763732e-06,
"loss": 9.5625,
"step": 309
},
{
"epoch": 0.016837401918047078,
"eval_accuracy": 0.08460961020293142,
"eval_loss": 9.546875,
"eval_runtime": 277.952,
"eval_samples_per_second": 121.485,
"eval_steps_per_second": 3.375,
"step": 309
},
{
"epoch": 0.016891891891891893,
"grad_norm": 1.2808586359024048,
"learning_rate": 9.998403443766349e-06,
"loss": 9.5547,
"step": 310
},
{
"epoch": 0.016891891891891893,
"eval_accuracy": 0.08461615265165591,
"eval_loss": 9.546875,
"eval_runtime": 277.1252,
"eval_samples_per_second": 121.847,
"eval_steps_per_second": 3.385,
"step": 310
},
{
"epoch": 0.016946381865736704,
"grad_norm": 1.252959132194519,
"learning_rate": 9.998397994768964e-06,
"loss": 9.5703,
"step": 311
},
{
"epoch": 0.016946381865736704,
"eval_accuracy": 0.08458865120790249,
"eval_loss": 9.546875,
"eval_runtime": 278.3797,
"eval_samples_per_second": 121.298,
"eval_steps_per_second": 3.369,
"step": 311
},
{
"epoch": 0.017000871839581516,
"grad_norm": 1.2392700910568237,
"learning_rate": 9.998392545771579e-06,
"loss": 9.5625,
"step": 312
},
{
"epoch": 0.017000871839581516,
"eval_accuracy": 0.08457846119926964,
"eval_loss": 9.5390625,
"eval_runtime": 278.5979,
"eval_samples_per_second": 121.203,
"eval_steps_per_second": 3.367,
"step": 312
},
{
"epoch": 0.01705536181342633,
"grad_norm": 1.363663673400879,
"learning_rate": 9.998387096774195e-06,
"loss": 9.5469,
"step": 313
},
{
"epoch": 0.01705536181342633,
"eval_accuracy": 0.0846050362786019,
"eval_loss": 9.5390625,
"eval_runtime": 279.48,
"eval_samples_per_second": 120.821,
"eval_steps_per_second": 3.356,
"step": 313
},
{
"epoch": 0.017109851787271143,
"grad_norm": 1.258362889289856,
"learning_rate": 9.99838164777681e-06,
"loss": 9.5469,
"step": 314
},
{
"epoch": 0.017109851787271143,
"eval_accuracy": 0.08462159504263028,
"eval_loss": 9.5390625,
"eval_runtime": 280.9528,
"eval_samples_per_second": 120.187,
"eval_steps_per_second": 3.339,
"step": 314
},
{
"epoch": 0.017164341761115955,
"grad_norm": 1.2624891996383667,
"learning_rate": 9.998376198779425e-06,
"loss": 9.5391,
"step": 315
},
{
"epoch": 0.017164341761115955,
"eval_accuracy": 0.08465442308180542,
"eval_loss": 9.53125,
"eval_runtime": 280.6538,
"eval_samples_per_second": 120.316,
"eval_steps_per_second": 3.342,
"step": 315
},
{
"epoch": 0.017218831734960766,
"grad_norm": 1.2132024765014648,
"learning_rate": 9.998370749782042e-06,
"loss": 9.5781,
"step": 316
},
{
"epoch": 0.017218831734960766,
"eval_accuracy": 0.08470062550731117,
"eval_loss": 9.53125,
"eval_runtime": 279.7295,
"eval_samples_per_second": 120.713,
"eval_steps_per_second": 3.353,
"step": 316
},
{
"epoch": 0.01727332170880558,
"grad_norm": 1.2480347156524658,
"learning_rate": 9.998365300784656e-06,
"loss": 9.5469,
"step": 317
},
{
"epoch": 0.01727332170880558,
"eval_accuracy": 0.08472815589995276,
"eval_loss": 9.53125,
"eval_runtime": 278.8002,
"eval_samples_per_second": 121.115,
"eval_steps_per_second": 3.364,
"step": 317
},
{
"epoch": 0.017327811682650393,
"grad_norm": 1.304702639579773,
"learning_rate": 9.998359851787272e-06,
"loss": 9.5312,
"step": 318
},
{
"epoch": 0.017327811682650393,
"eval_accuracy": 0.0847501570549555,
"eval_loss": 9.5234375,
"eval_runtime": 279.8383,
"eval_samples_per_second": 120.666,
"eval_steps_per_second": 3.352,
"step": 318
},
{
"epoch": 0.017382301656495205,
"grad_norm": 1.221549153327942,
"learning_rate": 9.998354402789887e-06,
"loss": 9.5703,
"step": 319
},
{
"epoch": 0.017382301656495205,
"eval_accuracy": 0.08476648422787858,
"eval_loss": 9.5234375,
"eval_runtime": 278.8177,
"eval_samples_per_second": 121.108,
"eval_steps_per_second": 3.364,
"step": 319
},
{
"epoch": 0.017436791630340016,
"grad_norm": 1.262052297592163,
"learning_rate": 9.998348953792502e-06,
"loss": 9.5312,
"step": 320
},
{
"epoch": 0.017436791630340016,
"eval_accuracy": 0.08480269928696862,
"eval_loss": 9.5234375,
"eval_runtime": 278.2907,
"eval_samples_per_second": 121.337,
"eval_steps_per_second": 3.371,
"step": 320
},
{
"epoch": 0.01749128160418483,
"grad_norm": 1.20883047580719,
"learning_rate": 9.998343504795119e-06,
"loss": 9.5703,
"step": 321
},
{
"epoch": 0.01749128160418483,
"eval_accuracy": 0.08483929068160476,
"eval_loss": 9.515625,
"eval_runtime": 278.7692,
"eval_samples_per_second": 121.129,
"eval_steps_per_second": 3.365,
"step": 321
},
{
"epoch": 0.017545771578029643,
"grad_norm": 1.2607730627059937,
"learning_rate": 9.998338055797734e-06,
"loss": 9.5312,
"step": 322
},
{
"epoch": 0.017545771578029643,
"eval_accuracy": 0.08487046863415469,
"eval_loss": 9.515625,
"eval_runtime": 278.4626,
"eval_samples_per_second": 121.262,
"eval_steps_per_second": 3.368,
"step": 322
},
{
"epoch": 0.017600261551874455,
"grad_norm": 1.2543152570724487,
"learning_rate": 9.998332606800349e-06,
"loss": 9.5391,
"step": 323
},
{
"epoch": 0.017600261551874455,
"eval_accuracy": 0.08489484359798667,
"eval_loss": 9.5078125,
"eval_runtime": 278.034,
"eval_samples_per_second": 121.449,
"eval_steps_per_second": 3.374,
"step": 323
},
{
"epoch": 0.017654751525719267,
"grad_norm": 1.261744737625122,
"learning_rate": 9.998327157802966e-06,
"loss": 9.5156,
"step": 324
},
{
"epoch": 0.017654751525719267,
"eval_accuracy": 0.08491759742408161,
"eval_loss": 9.5078125,
"eval_runtime": 278.3626,
"eval_samples_per_second": 121.306,
"eval_steps_per_second": 3.37,
"step": 324
},
{
"epoch": 0.01770924149956408,
"grad_norm": 1.261576771736145,
"learning_rate": 9.998321708805581e-06,
"loss": 9.5234,
"step": 325
},
{
"epoch": 0.01770924149956408,
"eval_accuracy": 0.08492923487712253,
"eval_loss": 9.5078125,
"eval_runtime": 278.7706,
"eval_samples_per_second": 121.128,
"eval_steps_per_second": 3.365,
"step": 325
},
{
"epoch": 0.017763731473408893,
"grad_norm": 1.2752763032913208,
"learning_rate": 9.998316259808196e-06,
"loss": 9.5391,
"step": 326
},
{
"epoch": 0.017763731473408893,
"eval_accuracy": 0.08490972332650168,
"eval_loss": 9.5,
"eval_runtime": 277.4247,
"eval_samples_per_second": 121.716,
"eval_steps_per_second": 3.381,
"step": 326
},
{
"epoch": 0.017818221447253705,
"grad_norm": 1.2762752771377563,
"learning_rate": 9.998310810810811e-06,
"loss": 9.5078,
"step": 327
},
{
"epoch": 0.017818221447253705,
"eval_accuracy": 0.08489041441809797,
"eval_loss": 9.5,
"eval_runtime": 278.791,
"eval_samples_per_second": 121.119,
"eval_steps_per_second": 3.365,
"step": 327
},
{
"epoch": 0.017872711421098517,
"grad_norm": 1.2357895374298096,
"learning_rate": 9.998305361813428e-06,
"loss": 9.5312,
"step": 328
},
{
"epoch": 0.017872711421098517,
"eval_accuracy": 0.08484099866600628,
"eval_loss": 9.5,
"eval_runtime": 278.1228,
"eval_samples_per_second": 121.41,
"eval_steps_per_second": 3.373,
"step": 328
},
{
"epoch": 0.017927201394943332,
"grad_norm": 1.2696473598480225,
"learning_rate": 9.998299912816043e-06,
"loss": 9.5078,
"step": 329
},
{
"epoch": 0.017927201394943332,
"eval_accuracy": 0.08476593419900351,
"eval_loss": 9.4921875,
"eval_runtime": 279.1969,
"eval_samples_per_second": 120.943,
"eval_steps_per_second": 3.36,
"step": 329
},
{
"epoch": 0.017981691368788143,
"grad_norm": 1.2310556173324585,
"learning_rate": 9.998294463818658e-06,
"loss": 9.5234,
"step": 330
},
{
"epoch": 0.017981691368788143,
"eval_accuracy": 0.08473466939978909,
"eval_loss": 9.4921875,
"eval_runtime": 278.5891,
"eval_samples_per_second": 121.207,
"eval_steps_per_second": 3.367,
"step": 330
},
{
"epoch": 0.018036181342632955,
"grad_norm": 1.2741543054580688,
"learning_rate": 9.998289014821274e-06,
"loss": 9.5078,
"step": 331
},
{
"epoch": 0.018036181342632955,
"eval_accuracy": 0.08475229927267945,
"eval_loss": 9.4921875,
"eval_runtime": 278.1693,
"eval_samples_per_second": 121.39,
"eval_steps_per_second": 3.372,
"step": 331
},
{
"epoch": 0.018090671316477767,
"grad_norm": 1.2879000902175903,
"learning_rate": 9.99828356582389e-06,
"loss": 9.4922,
"step": 332
},
{
"epoch": 0.018090671316477767,
"eval_accuracy": 0.08481100761787097,
"eval_loss": 9.484375,
"eval_runtime": 279.2612,
"eval_samples_per_second": 120.915,
"eval_steps_per_second": 3.359,
"step": 332
},
{
"epoch": 0.018145161290322582,
"grad_norm": 1.2958968877792358,
"learning_rate": 9.998278116826505e-06,
"loss": 9.5,
"step": 333
},
{
"epoch": 0.018145161290322582,
"eval_accuracy": 0.08491096812869263,
"eval_loss": 9.484375,
"eval_runtime": 278.8921,
"eval_samples_per_second": 121.076,
"eval_steps_per_second": 3.363,
"step": 333
},
{
"epoch": 0.018199651264167394,
"grad_norm": 1.2337111234664917,
"learning_rate": 9.99827266782912e-06,
"loss": 9.5078,
"step": 334
},
{
"epoch": 0.018199651264167394,
"eval_accuracy": 0.08498186395580014,
"eval_loss": 9.484375,
"eval_runtime": 279.3346,
"eval_samples_per_second": 120.884,
"eval_steps_per_second": 3.358,
"step": 334
},
{
"epoch": 0.018254141238012205,
"grad_norm": 1.3057183027267456,
"learning_rate": 9.998267218831735e-06,
"loss": 9.4766,
"step": 335
},
{
"epoch": 0.018254141238012205,
"eval_accuracy": 0.0850502412296376,
"eval_loss": 9.4765625,
"eval_runtime": 278.5471,
"eval_samples_per_second": 121.225,
"eval_steps_per_second": 3.367,
"step": 335
},
{
"epoch": 0.018308631211857017,
"grad_norm": 1.2413270473480225,
"learning_rate": 9.998261769834351e-06,
"loss": 9.5,
"step": 336
},
{
"epoch": 0.018308631211857017,
"eval_accuracy": 0.08509852797509099,
"eval_loss": 9.4765625,
"eval_runtime": 277.2384,
"eval_samples_per_second": 121.798,
"eval_steps_per_second": 3.383,
"step": 336
},
{
"epoch": 0.018308631211857017,
"step": 336,
"total_flos": 352197518819328.0,
"train_loss": 10.035667782738095,
"train_runtime": 94125.7847,
"train_samples_per_second": 701.873,
"train_steps_per_second": 19.497
}
],
"logging_steps": 1,
"max_steps": 1835200,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 352197518819328.0,
"train_batch_size": 36,
"trial_name": null,
"trial_params": null
}