|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.018308631211857017, |
|
"eval_steps": 1, |
|
"global_step": 336, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 5.448997384481256e-05, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 5.448997384481256e-05, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 283.3287, |
|
"eval_samples_per_second": 119.18, |
|
"eval_steps_per_second": 3.311, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00010897994768962511, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00010897994768962511, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 283.1668, |
|
"eval_samples_per_second": 119.248, |
|
"eval_steps_per_second": 3.313, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00016346992153443767, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00016346992153443767, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 282.1254, |
|
"eval_samples_per_second": 119.688, |
|
"eval_steps_per_second": 3.325, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00021795989537925023, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00021795989537925023, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 281.3948, |
|
"eval_samples_per_second": 119.999, |
|
"eval_steps_per_second": 3.333, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00027244986922406276, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8594, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00027244986922406276, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 281.7028, |
|
"eval_samples_per_second": 119.867, |
|
"eval_steps_per_second": 3.33, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00032693984306887534, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.00032693984306887534, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 282.539, |
|
"eval_samples_per_second": 119.513, |
|
"eval_steps_per_second": 3.32, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.00038142981691368787, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.00038142981691368787, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 282.3676, |
|
"eval_samples_per_second": 119.585, |
|
"eval_steps_per_second": 3.322, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.00043591979075850045, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.00043591979075850045, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 281.6041, |
|
"eval_samples_per_second": 119.909, |
|
"eval_steps_per_second": 3.331, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.000490409764603313, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.000490409764603313, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 281.1865, |
|
"eval_samples_per_second": 120.088, |
|
"eval_steps_per_second": 3.336, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0005448997384481255, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8594, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0005448997384481255, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 282.104, |
|
"eval_samples_per_second": 119.697, |
|
"eval_steps_per_second": 3.325, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0005993897122929382, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0005993897122929382, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 280.8618, |
|
"eval_samples_per_second": 120.226, |
|
"eval_steps_per_second": 3.34, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0006538796861377507, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0006538796861377507, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 280.475, |
|
"eval_samples_per_second": 120.392, |
|
"eval_steps_per_second": 3.344, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0007083696599825632, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8594, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0007083696599825632, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 279.8203, |
|
"eval_samples_per_second": 120.674, |
|
"eval_steps_per_second": 3.352, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0007628596338273757, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0007628596338273757, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 279.8528, |
|
"eval_samples_per_second": 120.66, |
|
"eval_steps_per_second": 3.352, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0008173496076721883, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8594, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0008173496076721883, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 281.1019, |
|
"eval_samples_per_second": 120.124, |
|
"eval_steps_per_second": 3.337, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0008718395815170009, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8594, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0008718395815170009, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 280.3712, |
|
"eval_samples_per_second": 120.437, |
|
"eval_steps_per_second": 3.346, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0009263295553618134, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 10.8672, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0009263295553618134, |
|
"eval_accuracy": 0.004507457682298169, |
|
"eval_loss": 10.8671875, |
|
"eval_runtime": 281.4302, |
|
"eval_samples_per_second": 119.984, |
|
"eval_steps_per_second": 3.333, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.000980819529206626, |
|
"grad_norm": 3.3501086235046387, |
|
"learning_rate": 9.999994551002616e-06, |
|
"loss": 10.8672, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.000980819529206626, |
|
"eval_accuracy": 0.008603349021604294, |
|
"eval_loss": 10.8359375, |
|
"eval_runtime": 281.0275, |
|
"eval_samples_per_second": 120.155, |
|
"eval_steps_per_second": 3.338, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0010353095030514385, |
|
"grad_norm": 3.1848981380462646, |
|
"learning_rate": 9.999989102005233e-06, |
|
"loss": 10.8359, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0010353095030514385, |
|
"eval_accuracy": 0.010782302884632226, |
|
"eval_loss": 10.8046875, |
|
"eval_runtime": 280.8218, |
|
"eval_samples_per_second": 120.244, |
|
"eval_steps_per_second": 3.34, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.001089799476896251, |
|
"grad_norm": 3.103949546813965, |
|
"learning_rate": 9.999983653007848e-06, |
|
"loss": 10.8047, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.001089799476896251, |
|
"eval_accuracy": 0.011296145649498848, |
|
"eval_loss": 10.7734375, |
|
"eval_runtime": 280.7519, |
|
"eval_samples_per_second": 120.273, |
|
"eval_steps_per_second": 3.341, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0011442894507410636, |
|
"grad_norm": 3.007913827896118, |
|
"learning_rate": 9.999978204010463e-06, |
|
"loss": 10.7891, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0011442894507410636, |
|
"eval_accuracy": 0.011502753864307472, |
|
"eval_loss": 10.75, |
|
"eval_runtime": 282.1403, |
|
"eval_samples_per_second": 119.682, |
|
"eval_steps_per_second": 3.325, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0011987794245858763, |
|
"grad_norm": 3.0112082958221436, |
|
"learning_rate": 9.99997275501308e-06, |
|
"loss": 10.7578, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0011987794245858763, |
|
"eval_accuracy": 0.011935742384539025, |
|
"eval_loss": 10.7265625, |
|
"eval_runtime": 280.7974, |
|
"eval_samples_per_second": 120.254, |
|
"eval_steps_per_second": 3.34, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0012532693984306888, |
|
"grad_norm": 3.0085818767547607, |
|
"learning_rate": 9.999967306015694e-06, |
|
"loss": 10.7188, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0012532693984306888, |
|
"eval_accuracy": 0.012902490504692311, |
|
"eval_loss": 10.703125, |
|
"eval_runtime": 279.9507, |
|
"eval_samples_per_second": 120.618, |
|
"eval_steps_per_second": 3.351, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0013077593722755014, |
|
"grad_norm": 2.6498820781707764, |
|
"learning_rate": 9.99996185701831e-06, |
|
"loss": 10.7188, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0013077593722755014, |
|
"eval_accuracy": 0.014669617484734745, |
|
"eval_loss": 10.6796875, |
|
"eval_runtime": 284.0388, |
|
"eval_samples_per_second": 118.882, |
|
"eval_steps_per_second": 3.302, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0013622493461203139, |
|
"grad_norm": 2.525411367416382, |
|
"learning_rate": 9.999956408020926e-06, |
|
"loss": 10.6953, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0013622493461203139, |
|
"eval_accuracy": 0.017945705260195358, |
|
"eval_loss": 10.6640625, |
|
"eval_runtime": 279.0802, |
|
"eval_samples_per_second": 120.994, |
|
"eval_steps_per_second": 3.361, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0014167393199651264, |
|
"grad_norm": 2.438088893890381, |
|
"learning_rate": 9.99995095902354e-06, |
|
"loss": 10.6719, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0014167393199651264, |
|
"eval_accuracy": 0.023050523249706075, |
|
"eval_loss": 10.640625, |
|
"eval_runtime": 280.2346, |
|
"eval_samples_per_second": 120.495, |
|
"eval_steps_per_second": 3.347, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.001471229293809939, |
|
"grad_norm": 2.2894654273986816, |
|
"learning_rate": 9.999945510026156e-06, |
|
"loss": 10.6562, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.001471229293809939, |
|
"eval_accuracy": 0.02863128990948001, |
|
"eval_loss": 10.625, |
|
"eval_runtime": 280.5686, |
|
"eval_samples_per_second": 120.352, |
|
"eval_steps_per_second": 3.343, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0015257192676547515, |
|
"grad_norm": 2.0723860263824463, |
|
"learning_rate": 9.999940061028771e-06, |
|
"loss": 10.6641, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0015257192676547515, |
|
"eval_accuracy": 0.03466305129792195, |
|
"eval_loss": 10.609375, |
|
"eval_runtime": 278.9184, |
|
"eval_samples_per_second": 121.064, |
|
"eval_steps_per_second": 3.363, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.001580209241499564, |
|
"grad_norm": 2.0480406284332275, |
|
"learning_rate": 9.999934612031386e-06, |
|
"loss": 10.6328, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.001580209241499564, |
|
"eval_accuracy": 0.039908271395015946, |
|
"eval_loss": 10.59375, |
|
"eval_runtime": 278.4399, |
|
"eval_samples_per_second": 121.272, |
|
"eval_steps_per_second": 3.369, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0016346992153443765, |
|
"grad_norm": 2.0142972469329834, |
|
"learning_rate": 9.999929163034003e-06, |
|
"loss": 10.6016, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0016346992153443765, |
|
"eval_accuracy": 0.043642156887862514, |
|
"eval_loss": 10.578125, |
|
"eval_runtime": 277.7144, |
|
"eval_samples_per_second": 121.589, |
|
"eval_steps_per_second": 3.378, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0016891891891891893, |
|
"grad_norm": 1.8250195980072021, |
|
"learning_rate": 9.999923714036618e-06, |
|
"loss": 10.6016, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0016891891891891893, |
|
"eval_accuracy": 0.046290082739106744, |
|
"eval_loss": 10.5703125, |
|
"eval_runtime": 277.9514, |
|
"eval_samples_per_second": 121.485, |
|
"eval_steps_per_second": 3.375, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0017436791630340018, |
|
"grad_norm": 1.7317852973937988, |
|
"learning_rate": 9.999918265039233e-06, |
|
"loss": 10.5938, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0017436791630340018, |
|
"eval_accuracy": 0.04786212316182883, |
|
"eval_loss": 10.5546875, |
|
"eval_runtime": 278.188, |
|
"eval_samples_per_second": 121.382, |
|
"eval_steps_per_second": 3.372, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0017981691368788143, |
|
"grad_norm": 1.6736972332000732, |
|
"learning_rate": 9.99991281604185e-06, |
|
"loss": 10.5781, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0017981691368788143, |
|
"eval_accuracy": 0.048402714699356676, |
|
"eval_loss": 10.546875, |
|
"eval_runtime": 276.9089, |
|
"eval_samples_per_second": 121.943, |
|
"eval_steps_per_second": 3.387, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0018526591107236269, |
|
"grad_norm": 1.7245711088180542, |
|
"learning_rate": 9.999907367044465e-06, |
|
"loss": 10.5547, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0018526591107236269, |
|
"eval_accuracy": 0.04842535272989897, |
|
"eval_loss": 10.53125, |
|
"eval_runtime": 277.5457, |
|
"eval_samples_per_second": 121.663, |
|
"eval_steps_per_second": 3.38, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0019071490845684394, |
|
"grad_norm": 1.6025965213775635, |
|
"learning_rate": 9.99990191804708e-06, |
|
"loss": 10.5469, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0019071490845684394, |
|
"eval_accuracy": 0.04835280681616625, |
|
"eval_loss": 10.5234375, |
|
"eval_runtime": 280.2247, |
|
"eval_samples_per_second": 120.5, |
|
"eval_steps_per_second": 3.347, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.001961639058413252, |
|
"grad_norm": 1.5452033281326294, |
|
"learning_rate": 9.999896469049695e-06, |
|
"loss": 10.5391, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.001961639058413252, |
|
"eval_accuracy": 0.04815010670125943, |
|
"eval_loss": 10.515625, |
|
"eval_runtime": 278.5802, |
|
"eval_samples_per_second": 121.211, |
|
"eval_steps_per_second": 3.367, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0020161290322580645, |
|
"grad_norm": 1.5433681011199951, |
|
"learning_rate": 9.999891020052312e-06, |
|
"loss": 10.5312, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0020161290322580645, |
|
"eval_accuracy": 0.047523855403661705, |
|
"eval_loss": 10.5078125, |
|
"eval_runtime": 279.3479, |
|
"eval_samples_per_second": 120.878, |
|
"eval_steps_per_second": 3.358, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.002070619006102877, |
|
"grad_norm": 1.4813498258590698, |
|
"learning_rate": 9.999885571054927e-06, |
|
"loss": 10.5312, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.002070619006102877, |
|
"eval_accuracy": 0.047497569813211064, |
|
"eval_loss": 10.4921875, |
|
"eval_runtime": 279.8874, |
|
"eval_samples_per_second": 120.645, |
|
"eval_steps_per_second": 3.351, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0021251089799476895, |
|
"grad_norm": 1.5266352891921997, |
|
"learning_rate": 9.999880122057542e-06, |
|
"loss": 10.4922, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0021251089799476895, |
|
"eval_accuracy": 0.04756021520719255, |
|
"eval_loss": 10.484375, |
|
"eval_runtime": 278.6312, |
|
"eval_samples_per_second": 121.189, |
|
"eval_steps_per_second": 3.366, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.002179598953792502, |
|
"grad_norm": 1.433236837387085, |
|
"learning_rate": 9.999874673060158e-06, |
|
"loss": 10.5078, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.002179598953792502, |
|
"eval_accuracy": 0.047665212824554305, |
|
"eval_loss": 10.484375, |
|
"eval_runtime": 278.9837, |
|
"eval_samples_per_second": 121.036, |
|
"eval_steps_per_second": 3.362, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0022340889276373146, |
|
"grad_norm": 1.4035075902938843, |
|
"learning_rate": 9.999869224062774e-06, |
|
"loss": 10.4922, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0022340889276373146, |
|
"eval_accuracy": 0.048127468670717134, |
|
"eval_loss": 10.4765625, |
|
"eval_runtime": 280.2687, |
|
"eval_samples_per_second": 120.481, |
|
"eval_steps_per_second": 3.347, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.002288578901482127, |
|
"grad_norm": 1.392196774482727, |
|
"learning_rate": 9.999863775065389e-06, |
|
"loss": 10.4844, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.002288578901482127, |
|
"eval_accuracy": 0.048623971051575024, |
|
"eval_loss": 10.46875, |
|
"eval_runtime": 279.283, |
|
"eval_samples_per_second": 120.906, |
|
"eval_steps_per_second": 3.359, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0023430688753269396, |
|
"grad_norm": 1.4084738492965698, |
|
"learning_rate": 9.999858326068004e-06, |
|
"loss": 10.4766, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0023430688753269396, |
|
"eval_accuracy": 0.0492689812287014, |
|
"eval_loss": 10.4609375, |
|
"eval_runtime": 279.956, |
|
"eval_samples_per_second": 120.615, |
|
"eval_steps_per_second": 3.351, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0023975588491717526, |
|
"grad_norm": 1.3411015272140503, |
|
"learning_rate": 9.999852877070619e-06, |
|
"loss": 10.4844, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0023975588491717526, |
|
"eval_accuracy": 0.04953739531973483, |
|
"eval_loss": 10.453125, |
|
"eval_runtime": 278.5767, |
|
"eval_samples_per_second": 121.213, |
|
"eval_steps_per_second": 3.367, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.002452048823016565, |
|
"grad_norm": 1.381066083908081, |
|
"learning_rate": 9.999847428073235e-06, |
|
"loss": 10.4688, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.002452048823016565, |
|
"eval_accuracy": 0.05031273339136427, |
|
"eval_loss": 10.4453125, |
|
"eval_runtime": 278.3035, |
|
"eval_samples_per_second": 121.332, |
|
"eval_steps_per_second": 3.37, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0025065387968613777, |
|
"grad_norm": 1.2761576175689697, |
|
"learning_rate": 9.99984197907585e-06, |
|
"loss": 10.4844, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.0025065387968613777, |
|
"eval_accuracy": 0.05126917570733207, |
|
"eval_loss": 10.4453125, |
|
"eval_runtime": 278.7818, |
|
"eval_samples_per_second": 121.123, |
|
"eval_steps_per_second": 3.365, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.00256102877070619, |
|
"grad_norm": 1.2938231229782104, |
|
"learning_rate": 9.999836530078465e-06, |
|
"loss": 10.4609, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.00256102877070619, |
|
"eval_accuracy": 0.05223774760743953, |
|
"eval_loss": 10.4375, |
|
"eval_runtime": 280.01, |
|
"eval_samples_per_second": 120.592, |
|
"eval_steps_per_second": 3.35, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0026155187445510027, |
|
"grad_norm": 1.408036231994629, |
|
"learning_rate": 9.999831081081082e-06, |
|
"loss": 10.4453, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0026155187445510027, |
|
"eval_accuracy": 0.05263593956410096, |
|
"eval_loss": 10.4296875, |
|
"eval_runtime": 279.456, |
|
"eval_samples_per_second": 120.831, |
|
"eval_steps_per_second": 3.357, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0026700087183958152, |
|
"grad_norm": 1.3031139373779297, |
|
"learning_rate": 9.999825632083697e-06, |
|
"loss": 10.4453, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0026700087183958152, |
|
"eval_accuracy": 0.05322177821382523, |
|
"eval_loss": 10.4296875, |
|
"eval_runtime": 279.6217, |
|
"eval_samples_per_second": 120.76, |
|
"eval_steps_per_second": 3.355, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0027244986922406278, |
|
"grad_norm": 1.3556911945343018, |
|
"learning_rate": 9.999820183086312e-06, |
|
"loss": 10.4297, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0027244986922406278, |
|
"eval_accuracy": 0.05367262819805243, |
|
"eval_loss": 10.421875, |
|
"eval_runtime": 278.4251, |
|
"eval_samples_per_second": 121.279, |
|
"eval_steps_per_second": 3.369, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0027789886660854403, |
|
"grad_norm": 1.3359757661819458, |
|
"learning_rate": 9.999814734088929e-06, |
|
"loss": 10.4219, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0027789886660854403, |
|
"eval_accuracy": 0.05440648251294645, |
|
"eval_loss": 10.4140625, |
|
"eval_runtime": 278.6487, |
|
"eval_samples_per_second": 121.181, |
|
"eval_steps_per_second": 3.366, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.002833478639930253, |
|
"grad_norm": 1.2961536645889282, |
|
"learning_rate": 9.999809285091544e-06, |
|
"loss": 10.4297, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.002833478639930253, |
|
"eval_accuracy": 0.05480528239625927, |
|
"eval_loss": 10.4140625, |
|
"eval_runtime": 278.1813, |
|
"eval_samples_per_second": 121.385, |
|
"eval_steps_per_second": 3.372, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0028879686137750654, |
|
"grad_norm": 1.2359050512313843, |
|
"learning_rate": 9.999803836094159e-06, |
|
"loss": 10.4375, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.0028879686137750654, |
|
"eval_accuracy": 0.055358814086795306, |
|
"eval_loss": 10.40625, |
|
"eval_runtime": 279.3681, |
|
"eval_samples_per_second": 120.869, |
|
"eval_steps_per_second": 3.358, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.002942458587619878, |
|
"grad_norm": 1.2747548818588257, |
|
"learning_rate": 9.999798387096776e-06, |
|
"loss": 10.4219, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.002942458587619878, |
|
"eval_accuracy": 0.055845676487895415, |
|
"eval_loss": 10.40625, |
|
"eval_runtime": 280.0513, |
|
"eval_samples_per_second": 120.574, |
|
"eval_steps_per_second": 3.349, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0029969485614646904, |
|
"grad_norm": 1.304934024810791, |
|
"learning_rate": 9.99979293809939e-06, |
|
"loss": 10.4141, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0029969485614646904, |
|
"eval_accuracy": 0.05652635169523676, |
|
"eval_loss": 10.3984375, |
|
"eval_runtime": 280.159, |
|
"eval_samples_per_second": 120.528, |
|
"eval_steps_per_second": 3.348, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.003051438535309503, |
|
"grad_norm": 1.2796216011047363, |
|
"learning_rate": 9.999787489102006e-06, |
|
"loss": 10.4141, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.003051438535309503, |
|
"eval_accuracy": 0.05738361512036325, |
|
"eval_loss": 10.390625, |
|
"eval_runtime": 280.8546, |
|
"eval_samples_per_second": 120.229, |
|
"eval_steps_per_second": 3.34, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0031059285091543155, |
|
"grad_norm": 1.273751974105835, |
|
"learning_rate": 9.999782040104623e-06, |
|
"loss": 10.4219, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0031059285091543155, |
|
"eval_accuracy": 0.05828062536893549, |
|
"eval_loss": 10.390625, |
|
"eval_runtime": 280.5946, |
|
"eval_samples_per_second": 120.341, |
|
"eval_steps_per_second": 3.343, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.003160418482999128, |
|
"grad_norm": 1.2347089052200317, |
|
"learning_rate": 9.999776591107238e-06, |
|
"loss": 10.4219, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.003160418482999128, |
|
"eval_accuracy": 0.05910523444821581, |
|
"eval_loss": 10.3828125, |
|
"eval_runtime": 281.2657, |
|
"eval_samples_per_second": 120.054, |
|
"eval_steps_per_second": 3.335, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.0032149084568439405, |
|
"grad_norm": 1.2617988586425781, |
|
"learning_rate": 9.999771142109853e-06, |
|
"loss": 10.3984, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0032149084568439405, |
|
"eval_accuracy": 0.059817348148100545, |
|
"eval_loss": 10.3828125, |
|
"eval_runtime": 279.7299, |
|
"eval_samples_per_second": 120.713, |
|
"eval_steps_per_second": 3.353, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.003269398430688753, |
|
"grad_norm": 1.2854173183441162, |
|
"learning_rate": 9.999765693112468e-06, |
|
"loss": 10.3984, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.003269398430688753, |
|
"eval_accuracy": 0.06033486742176367, |
|
"eval_loss": 10.375, |
|
"eval_runtime": 279.8061, |
|
"eval_samples_per_second": 120.68, |
|
"eval_steps_per_second": 3.352, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.003323888404533566, |
|
"grad_norm": 1.2649012804031372, |
|
"learning_rate": 9.999760244115083e-06, |
|
"loss": 10.3984, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.003323888404533566, |
|
"eval_accuracy": 0.06070726591907321, |
|
"eval_loss": 10.375, |
|
"eval_runtime": 279.7805, |
|
"eval_samples_per_second": 120.691, |
|
"eval_steps_per_second": 3.353, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0033783783783783786, |
|
"grad_norm": 1.284862756729126, |
|
"learning_rate": 9.9997547951177e-06, |
|
"loss": 10.3906, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.0033783783783783786, |
|
"eval_accuracy": 0.061101260286951224, |
|
"eval_loss": 10.3671875, |
|
"eval_runtime": 280.6475, |
|
"eval_samples_per_second": 120.318, |
|
"eval_steps_per_second": 3.342, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.003432868352223191, |
|
"grad_norm": 1.3201900720596313, |
|
"learning_rate": 9.999749346120314e-06, |
|
"loss": 10.3672, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.003432868352223191, |
|
"eval_accuracy": 0.06146384511117401, |
|
"eval_loss": 10.3671875, |
|
"eval_runtime": 280.4398, |
|
"eval_samples_per_second": 120.407, |
|
"eval_steps_per_second": 3.345, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.0034873583260680036, |
|
"grad_norm": 1.265735387802124, |
|
"learning_rate": 9.99974389712293e-06, |
|
"loss": 10.3906, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.0034873583260680036, |
|
"eval_accuracy": 0.0615552078022117, |
|
"eval_loss": 10.359375, |
|
"eval_runtime": 280.4115, |
|
"eval_samples_per_second": 120.419, |
|
"eval_steps_per_second": 3.345, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.003541848299912816, |
|
"grad_norm": 1.2759215831756592, |
|
"learning_rate": 9.999738448125546e-06, |
|
"loss": 10.3828, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.003541848299912816, |
|
"eval_accuracy": 0.061487409506137465, |
|
"eval_loss": 10.359375, |
|
"eval_runtime": 280.0889, |
|
"eval_samples_per_second": 120.558, |
|
"eval_steps_per_second": 3.349, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0035963382737576287, |
|
"grad_norm": 1.2972633838653564, |
|
"learning_rate": 9.999732999128161e-06, |
|
"loss": 10.3594, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.0035963382737576287, |
|
"eval_accuracy": 0.06136165553596391, |
|
"eval_loss": 10.3515625, |
|
"eval_runtime": 280.2984, |
|
"eval_samples_per_second": 120.468, |
|
"eval_steps_per_second": 3.346, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.003650828247602441, |
|
"grad_norm": 1.3436386585235596, |
|
"learning_rate": 9.999727550130776e-06, |
|
"loss": 10.3516, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.003650828247602441, |
|
"eval_accuracy": 0.06103861489296974, |
|
"eval_loss": 10.34375, |
|
"eval_runtime": 280.5172, |
|
"eval_samples_per_second": 120.374, |
|
"eval_steps_per_second": 3.344, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.0037053182214472537, |
|
"grad_norm": 1.3115813732147217, |
|
"learning_rate": 9.999722101133393e-06, |
|
"loss": 10.3516, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.0037053182214472537, |
|
"eval_accuracy": 0.060934138355594886, |
|
"eval_loss": 10.34375, |
|
"eval_runtime": 279.6787, |
|
"eval_samples_per_second": 120.735, |
|
"eval_steps_per_second": 3.354, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.0037598081952920663, |
|
"grad_norm": 1.3387259244918823, |
|
"learning_rate": 9.999716652136008e-06, |
|
"loss": 10.3438, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0037598081952920663, |
|
"eval_accuracy": 0.061064147812328176, |
|
"eval_loss": 10.3359375, |
|
"eval_runtime": 279.3007, |
|
"eval_samples_per_second": 120.898, |
|
"eval_steps_per_second": 3.358, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.003814298169136879, |
|
"grad_norm": 1.507016897201538, |
|
"learning_rate": 9.999711203138623e-06, |
|
"loss": 10.3594, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.003814298169136879, |
|
"eval_accuracy": 0.061038383301864445, |
|
"eval_loss": 10.3359375, |
|
"eval_runtime": 280.6833, |
|
"eval_samples_per_second": 120.303, |
|
"eval_steps_per_second": 3.342, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0038687881429816913, |
|
"grad_norm": 1.2702033519744873, |
|
"learning_rate": 9.99970575414124e-06, |
|
"loss": 10.3594, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.0038687881429816913, |
|
"eval_accuracy": 0.06097628793675803, |
|
"eval_loss": 10.328125, |
|
"eval_runtime": 279.2628, |
|
"eval_samples_per_second": 120.915, |
|
"eval_steps_per_second": 3.359, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.003923278116826504, |
|
"grad_norm": 1.3358463048934937, |
|
"learning_rate": 9.999700305143855e-06, |
|
"loss": 10.3203, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.003923278116826504, |
|
"eval_accuracy": 0.06098071711664674, |
|
"eval_loss": 10.328125, |
|
"eval_runtime": 280.3079, |
|
"eval_samples_per_second": 120.464, |
|
"eval_steps_per_second": 3.346, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.003977768090671316, |
|
"grad_norm": 1.259023904800415, |
|
"learning_rate": 9.99969485614647e-06, |
|
"loss": 10.3516, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.003977768090671316, |
|
"eval_accuracy": 0.06099933125173458, |
|
"eval_loss": 10.3203125, |
|
"eval_runtime": 280.2434, |
|
"eval_samples_per_second": 120.492, |
|
"eval_steps_per_second": 3.347, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.004032258064516129, |
|
"grad_norm": 1.458150029182434, |
|
"learning_rate": 9.999689407149085e-06, |
|
"loss": 10.3203, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.004032258064516129, |
|
"eval_accuracy": 0.061083225129626606, |
|
"eval_loss": 10.3125, |
|
"eval_runtime": 280.0404, |
|
"eval_samples_per_second": 120.579, |
|
"eval_steps_per_second": 3.35, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.0040867480383609414, |
|
"grad_norm": 1.2943834066390991, |
|
"learning_rate": 9.999683958151702e-06, |
|
"loss": 10.3281, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0040867480383609414, |
|
"eval_accuracy": 0.06123138553923716, |
|
"eval_loss": 10.3125, |
|
"eval_runtime": 280.4851, |
|
"eval_samples_per_second": 120.388, |
|
"eval_steps_per_second": 3.344, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.004141238012205754, |
|
"grad_norm": 1.2712702751159668, |
|
"learning_rate": 9.999678509154317e-06, |
|
"loss": 10.3438, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.004141238012205754, |
|
"eval_accuracy": 0.061402212928278174, |
|
"eval_loss": 10.3046875, |
|
"eval_runtime": 279.7875, |
|
"eval_samples_per_second": 120.688, |
|
"eval_steps_per_second": 3.353, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.0041957279860505665, |
|
"grad_norm": 1.3315693140029907, |
|
"learning_rate": 9.999673060156932e-06, |
|
"loss": 10.2969, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0041957279860505665, |
|
"eval_accuracy": 0.0617529576572429, |
|
"eval_loss": 10.3046875, |
|
"eval_runtime": 280.9468, |
|
"eval_samples_per_second": 120.19, |
|
"eval_steps_per_second": 3.339, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.004250217959895379, |
|
"grad_norm": 1.2693438529968262, |
|
"learning_rate": 9.999667611159547e-06, |
|
"loss": 10.3281, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.004250217959895379, |
|
"eval_accuracy": 0.06221338972345156, |
|
"eval_loss": 10.296875, |
|
"eval_runtime": 280.2001, |
|
"eval_samples_per_second": 120.51, |
|
"eval_steps_per_second": 3.348, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.004304707933740192, |
|
"grad_norm": 1.4125028848648071, |
|
"learning_rate": 9.999662162162162e-06, |
|
"loss": 10.2891, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.004304707933740192, |
|
"eval_accuracy": 0.0628054234352424, |
|
"eval_loss": 10.296875, |
|
"eval_runtime": 280.5046, |
|
"eval_samples_per_second": 120.379, |
|
"eval_steps_per_second": 3.344, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.004359197907585004, |
|
"grad_norm": 1.4127213954925537, |
|
"learning_rate": 9.999656713164779e-06, |
|
"loss": 10.3047, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.004359197907585004, |
|
"eval_accuracy": 0.0632026600785945, |
|
"eval_loss": 10.2890625, |
|
"eval_runtime": 281.1308, |
|
"eval_samples_per_second": 120.111, |
|
"eval_steps_per_second": 3.337, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.004413687881429817, |
|
"grad_norm": 1.2919211387634277, |
|
"learning_rate": 9.999651264167394e-06, |
|
"loss": 10.2969, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.004413687881429817, |
|
"eval_accuracy": 0.06365513120055874, |
|
"eval_loss": 10.28125, |
|
"eval_runtime": 280.4747, |
|
"eval_samples_per_second": 120.392, |
|
"eval_steps_per_second": 3.344, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.004468177855274629, |
|
"grad_norm": 1.3636354207992554, |
|
"learning_rate": 9.999645815170009e-06, |
|
"loss": 10.2891, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.004468177855274629, |
|
"eval_accuracy": 0.06432144775937197, |
|
"eval_loss": 10.28125, |
|
"eval_runtime": 280.6686, |
|
"eval_samples_per_second": 120.309, |
|
"eval_steps_per_second": 3.342, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.004522667829119442, |
|
"grad_norm": 1.2588390111923218, |
|
"learning_rate": 9.999640366172625e-06, |
|
"loss": 10.3125, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.004522667829119442, |
|
"eval_accuracy": 0.06491886596436085, |
|
"eval_loss": 10.2734375, |
|
"eval_runtime": 280.7018, |
|
"eval_samples_per_second": 120.295, |
|
"eval_steps_per_second": 3.342, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.004577157802964254, |
|
"grad_norm": 1.2841159105300903, |
|
"learning_rate": 9.99963491717524e-06, |
|
"loss": 10.2891, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.004577157802964254, |
|
"eval_accuracy": 0.0653847114726557, |
|
"eval_loss": 10.2734375, |
|
"eval_runtime": 280.4683, |
|
"eval_samples_per_second": 120.395, |
|
"eval_steps_per_second": 3.344, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.004631647776809067, |
|
"grad_norm": 1.2950408458709717, |
|
"learning_rate": 9.999629468177855e-06, |
|
"loss": 10.2812, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.004631647776809067, |
|
"eval_accuracy": 0.06567657416309995, |
|
"eval_loss": 10.265625, |
|
"eval_runtime": 279.8035, |
|
"eval_samples_per_second": 120.681, |
|
"eval_steps_per_second": 3.352, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.004686137750653879, |
|
"grad_norm": 1.284853458404541, |
|
"learning_rate": 9.999624019180472e-06, |
|
"loss": 10.3047, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.004686137750653879, |
|
"eval_accuracy": 0.06585295973866796, |
|
"eval_loss": 10.265625, |
|
"eval_runtime": 280.2049, |
|
"eval_samples_per_second": 120.508, |
|
"eval_steps_per_second": 3.348, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.004740627724498693, |
|
"grad_norm": 1.229882836341858, |
|
"learning_rate": 9.999618570183087e-06, |
|
"loss": 10.2969, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.004740627724498693, |
|
"eval_accuracy": 0.06600204651269968, |
|
"eval_loss": 10.2578125, |
|
"eval_runtime": 280.4096, |
|
"eval_samples_per_second": 120.42, |
|
"eval_steps_per_second": 3.345, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.004795117698343505, |
|
"grad_norm": 1.3210368156433105, |
|
"learning_rate": 9.999613121185702e-06, |
|
"loss": 10.2578, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.004795117698343505, |
|
"eval_accuracy": 0.06612053431194471, |
|
"eval_loss": 10.25, |
|
"eval_runtime": 280.0298, |
|
"eval_samples_per_second": 120.584, |
|
"eval_steps_per_second": 3.35, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.004849607672188318, |
|
"grad_norm": 1.2671563625335693, |
|
"learning_rate": 9.999607672188319e-06, |
|
"loss": 10.2812, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.004849607672188318, |
|
"eval_accuracy": 0.06618060325487982, |
|
"eval_loss": 10.25, |
|
"eval_runtime": 280.1841, |
|
"eval_samples_per_second": 120.517, |
|
"eval_steps_per_second": 3.348, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.00490409764603313, |
|
"grad_norm": 1.2486541271209717, |
|
"learning_rate": 9.999602223190934e-06, |
|
"loss": 10.2734, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.00490409764603313, |
|
"eval_accuracy": 0.06629969898077623, |
|
"eval_loss": 10.2421875, |
|
"eval_runtime": 279.9304, |
|
"eval_samples_per_second": 120.626, |
|
"eval_steps_per_second": 3.351, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.004958587619877943, |
|
"grad_norm": 1.219537615776062, |
|
"learning_rate": 9.999596774193549e-06, |
|
"loss": 10.2891, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.004958587619877943, |
|
"eval_accuracy": 0.06638553243417508, |
|
"eval_loss": 10.2421875, |
|
"eval_runtime": 280.2115, |
|
"eval_samples_per_second": 120.505, |
|
"eval_steps_per_second": 3.347, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.005013077593722755, |
|
"grad_norm": 1.3005180358886719, |
|
"learning_rate": 9.999591325196166e-06, |
|
"loss": 10.2578, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.005013077593722755, |
|
"eval_accuracy": 0.06655224908109715, |
|
"eval_loss": 10.234375, |
|
"eval_runtime": 280.5566, |
|
"eval_samples_per_second": 120.357, |
|
"eval_steps_per_second": 3.343, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.005067567567567568, |
|
"grad_norm": 1.2326613664627075, |
|
"learning_rate": 9.99958587619878e-06, |
|
"loss": 10.2734, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.005067567567567568, |
|
"eval_accuracy": 0.06677831094875031, |
|
"eval_loss": 10.234375, |
|
"eval_runtime": 279.0494, |
|
"eval_samples_per_second": 121.007, |
|
"eval_steps_per_second": 3.361, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.00512205754141238, |
|
"grad_norm": 1.343145728111267, |
|
"learning_rate": 9.999580427201396e-06, |
|
"loss": 10.2266, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.00512205754141238, |
|
"eval_accuracy": 0.0670887877742824, |
|
"eval_loss": 10.2265625, |
|
"eval_runtime": 278.9829, |
|
"eval_samples_per_second": 121.036, |
|
"eval_steps_per_second": 3.362, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.005176547515257193, |
|
"grad_norm": 1.2524234056472778, |
|
"learning_rate": 9.999574978204011e-06, |
|
"loss": 10.2578, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.005176547515257193, |
|
"eval_accuracy": 0.06735230950321652, |
|
"eval_loss": 10.2265625, |
|
"eval_runtime": 278.058, |
|
"eval_samples_per_second": 121.439, |
|
"eval_steps_per_second": 3.373, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.0052310374891020054, |
|
"grad_norm": 1.232895016670227, |
|
"learning_rate": 9.999569529206626e-06, |
|
"loss": 10.25, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.0052310374891020054, |
|
"eval_accuracy": 0.0675672260489275, |
|
"eval_loss": 10.21875, |
|
"eval_runtime": 280.2022, |
|
"eval_samples_per_second": 120.509, |
|
"eval_steps_per_second": 3.348, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.005285527462946818, |
|
"grad_norm": 1.2870153188705444, |
|
"learning_rate": 9.999564080209243e-06, |
|
"loss": 10.2266, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.005285527462946818, |
|
"eval_accuracy": 0.06775721760193143, |
|
"eval_loss": 10.21875, |
|
"eval_runtime": 279.5356, |
|
"eval_samples_per_second": 120.797, |
|
"eval_steps_per_second": 3.356, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.0053400174367916305, |
|
"grad_norm": 1.2629221677780151, |
|
"learning_rate": 9.999558631211858e-06, |
|
"loss": 10.2266, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.0053400174367916305, |
|
"eval_accuracy": 0.06793869818181586, |
|
"eval_loss": 10.2109375, |
|
"eval_runtime": 279.0624, |
|
"eval_samples_per_second": 121.002, |
|
"eval_steps_per_second": 3.361, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.005394507410636443, |
|
"grad_norm": 1.279175043106079, |
|
"learning_rate": 9.999553182214473e-06, |
|
"loss": 10.2344, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.005394507410636443, |
|
"eval_accuracy": 0.06805782285660043, |
|
"eval_loss": 10.2109375, |
|
"eval_runtime": 278.9598, |
|
"eval_samples_per_second": 121.046, |
|
"eval_steps_per_second": 3.362, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.0054489973844812556, |
|
"grad_norm": 1.21150803565979, |
|
"learning_rate": 9.99954773321709e-06, |
|
"loss": 10.2422, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0054489973844812556, |
|
"eval_accuracy": 0.0681561332807969, |
|
"eval_loss": 10.203125, |
|
"eval_runtime": 278.9899, |
|
"eval_samples_per_second": 121.033, |
|
"eval_steps_per_second": 3.362, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.005503487358326068, |
|
"grad_norm": 1.2411643266677856, |
|
"learning_rate": 9.999542284219704e-06, |
|
"loss": 10.2422, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.005503487358326068, |
|
"eval_accuracy": 0.06829934343053183, |
|
"eval_loss": 10.203125, |
|
"eval_runtime": 278.3501, |
|
"eval_samples_per_second": 121.311, |
|
"eval_steps_per_second": 3.37, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.005557977332170881, |
|
"grad_norm": 1.2620774507522583, |
|
"learning_rate": 9.99953683522232e-06, |
|
"loss": 10.2266, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.005557977332170881, |
|
"eval_accuracy": 0.0684582438776503, |
|
"eval_loss": 10.1953125, |
|
"eval_runtime": 280.5409, |
|
"eval_samples_per_second": 120.364, |
|
"eval_steps_per_second": 3.344, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.005612467306015693, |
|
"grad_norm": 1.2730474472045898, |
|
"learning_rate": 9.999531386224936e-06, |
|
"loss": 10.2188, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.005612467306015693, |
|
"eval_accuracy": 0.06855808859291931, |
|
"eval_loss": 10.1953125, |
|
"eval_runtime": 279.3284, |
|
"eval_samples_per_second": 120.886, |
|
"eval_steps_per_second": 3.358, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.005666957279860506, |
|
"grad_norm": 1.250596046447754, |
|
"learning_rate": 9.999525937227551e-06, |
|
"loss": 10.2109, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.005666957279860506, |
|
"eval_accuracy": 0.06870066186711471, |
|
"eval_loss": 10.1875, |
|
"eval_runtime": 279.5974, |
|
"eval_samples_per_second": 120.77, |
|
"eval_steps_per_second": 3.355, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.005721447253705318, |
|
"grad_norm": 1.3434346914291382, |
|
"learning_rate": 9.999520488230166e-06, |
|
"loss": 10.1797, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.005721447253705318, |
|
"eval_accuracy": 0.06888807696907226, |
|
"eval_loss": 10.1875, |
|
"eval_runtime": 279.5391, |
|
"eval_samples_per_second": 120.795, |
|
"eval_steps_per_second": 3.356, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.005775937227550131, |
|
"grad_norm": 1.28876793384552, |
|
"learning_rate": 9.999515039232783e-06, |
|
"loss": 10.1797, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.005775937227550131, |
|
"eval_accuracy": 0.06907245243777284, |
|
"eval_loss": 10.1796875, |
|
"eval_runtime": 279.0416, |
|
"eval_samples_per_second": 121.011, |
|
"eval_steps_per_second": 3.362, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.005830427201394943, |
|
"grad_norm": 1.3488516807556152, |
|
"learning_rate": 9.999509590235398e-06, |
|
"loss": 10.1719, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.005830427201394943, |
|
"eval_accuracy": 0.06933892695329945, |
|
"eval_loss": 10.1796875, |
|
"eval_runtime": 279.7608, |
|
"eval_samples_per_second": 120.7, |
|
"eval_steps_per_second": 3.353, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.005884917175239756, |
|
"grad_norm": 1.272630214691162, |
|
"learning_rate": 9.999504141238013e-06, |
|
"loss": 10.1875, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.005884917175239756, |
|
"eval_accuracy": 0.06959364822023249, |
|
"eval_loss": 10.171875, |
|
"eval_runtime": 279.3313, |
|
"eval_samples_per_second": 120.885, |
|
"eval_steps_per_second": 3.358, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.005939407149084568, |
|
"grad_norm": 1.275840401649475, |
|
"learning_rate": 9.999498692240628e-06, |
|
"loss": 10.1797, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.005939407149084568, |
|
"eval_accuracy": 0.06980685678154193, |
|
"eval_loss": 10.171875, |
|
"eval_runtime": 279.6516, |
|
"eval_samples_per_second": 120.747, |
|
"eval_steps_per_second": 3.354, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.005993897122929381, |
|
"grad_norm": 1.2549141645431519, |
|
"learning_rate": 9.999493243243245e-06, |
|
"loss": 10.1797, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.005993897122929381, |
|
"eval_accuracy": 0.07002058642283829, |
|
"eval_loss": 10.1640625, |
|
"eval_runtime": 279.1415, |
|
"eval_samples_per_second": 120.967, |
|
"eval_steps_per_second": 3.36, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.006048387096774193, |
|
"grad_norm": 1.3456748723983765, |
|
"learning_rate": 9.999487794245858e-06, |
|
"loss": 10.1406, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.006048387096774193, |
|
"eval_accuracy": 0.07022299704886349, |
|
"eval_loss": 10.1640625, |
|
"eval_runtime": 279.1359, |
|
"eval_samples_per_second": 120.97, |
|
"eval_steps_per_second": 3.36, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.006102877070619006, |
|
"grad_norm": 1.2664400339126587, |
|
"learning_rate": 9.999482345248475e-06, |
|
"loss": 10.1719, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.006102877070619006, |
|
"eval_accuracy": 0.07042644983486251, |
|
"eval_loss": 10.1640625, |
|
"eval_runtime": 279.7876, |
|
"eval_samples_per_second": 120.688, |
|
"eval_steps_per_second": 3.353, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.0061573670444638184, |
|
"grad_norm": 1.226805329322815, |
|
"learning_rate": 9.99947689625109e-06, |
|
"loss": 10.1953, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.0061573670444638184, |
|
"eval_accuracy": 0.07062440233211086, |
|
"eval_loss": 10.15625, |
|
"eval_runtime": 280.1774, |
|
"eval_samples_per_second": 120.52, |
|
"eval_steps_per_second": 3.348, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.006211857018308631, |
|
"grad_norm": 1.2513021230697632, |
|
"learning_rate": 9.999471447253705e-06, |
|
"loss": 10.1719, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.006211857018308631, |
|
"eval_accuracy": 0.070792566423441, |
|
"eval_loss": 10.15625, |
|
"eval_runtime": 279.7452, |
|
"eval_samples_per_second": 120.706, |
|
"eval_steps_per_second": 3.353, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.0062663469921534435, |
|
"grad_norm": 1.2754563093185425, |
|
"learning_rate": 9.999465998256322e-06, |
|
"loss": 10.1641, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0062663469921534435, |
|
"eval_accuracy": 0.0709808499920434, |
|
"eval_loss": 10.1484375, |
|
"eval_runtime": 279.6651, |
|
"eval_samples_per_second": 120.741, |
|
"eval_steps_per_second": 3.354, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.006320836965998256, |
|
"grad_norm": 1.2457561492919922, |
|
"learning_rate": 9.999460549258937e-06, |
|
"loss": 10.1719, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.006320836965998256, |
|
"eval_accuracy": 0.07117214424501458, |
|
"eval_loss": 10.1484375, |
|
"eval_runtime": 279.2608, |
|
"eval_samples_per_second": 120.916, |
|
"eval_steps_per_second": 3.359, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.0063753269398430686, |
|
"grad_norm": 1.2697603702545166, |
|
"learning_rate": 9.999455100261552e-06, |
|
"loss": 10.1484, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.0063753269398430686, |
|
"eval_accuracy": 0.07134343481626618, |
|
"eval_loss": 10.140625, |
|
"eval_runtime": 279.2939, |
|
"eval_samples_per_second": 120.901, |
|
"eval_steps_per_second": 3.358, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.006429816913687881, |
|
"grad_norm": 1.2824610471725464, |
|
"learning_rate": 9.999449651264169e-06, |
|
"loss": 10.1562, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.006429816913687881, |
|
"eval_accuracy": 0.07150007725010805, |
|
"eval_loss": 10.140625, |
|
"eval_runtime": 280.6459, |
|
"eval_samples_per_second": 120.319, |
|
"eval_steps_per_second": 3.342, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.006484306887532694, |
|
"grad_norm": 1.2311967611312866, |
|
"learning_rate": 9.999444202266784e-06, |
|
"loss": 10.1562, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.006484306887532694, |
|
"eval_accuracy": 0.07164412691759968, |
|
"eval_loss": 10.1328125, |
|
"eval_runtime": 278.8366, |
|
"eval_samples_per_second": 121.1, |
|
"eval_steps_per_second": 3.364, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.006538796861377506, |
|
"grad_norm": 1.260802984237671, |
|
"learning_rate": 9.999438753269399e-06, |
|
"loss": 10.1484, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.006538796861377506, |
|
"eval_accuracy": 0.07180059565811259, |
|
"eval_loss": 10.1328125, |
|
"eval_runtime": 279.3119, |
|
"eval_samples_per_second": 120.894, |
|
"eval_steps_per_second": 3.358, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.006593286835222319, |
|
"grad_norm": 1.2933627367019653, |
|
"learning_rate": 9.999433304272015e-06, |
|
"loss": 10.1406, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.006593286835222319, |
|
"eval_accuracy": 0.07193274733256984, |
|
"eval_loss": 10.125, |
|
"eval_runtime": 280.0499, |
|
"eval_samples_per_second": 120.575, |
|
"eval_steps_per_second": 3.349, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.006647776809067132, |
|
"grad_norm": 1.2777124643325806, |
|
"learning_rate": 9.99942785527463e-06, |
|
"loss": 10.1328, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.006647776809067132, |
|
"eval_accuracy": 0.07206449372259283, |
|
"eval_loss": 10.125, |
|
"eval_runtime": 278.3594, |
|
"eval_samples_per_second": 121.307, |
|
"eval_steps_per_second": 3.37, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.006702266782911945, |
|
"grad_norm": 1.225155234336853, |
|
"learning_rate": 9.999422406277245e-06, |
|
"loss": 10.1641, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.006702266782911945, |
|
"eval_accuracy": 0.07220330364132721, |
|
"eval_loss": 10.1171875, |
|
"eval_runtime": 279.7375, |
|
"eval_samples_per_second": 120.71, |
|
"eval_steps_per_second": 3.353, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.006756756756756757, |
|
"grad_norm": 1.2431164979934692, |
|
"learning_rate": 9.999416957279862e-06, |
|
"loss": 10.1328, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.006756756756756757, |
|
"eval_accuracy": 0.07233015766925091, |
|
"eval_loss": 10.1171875, |
|
"eval_runtime": 278.5065, |
|
"eval_samples_per_second": 121.243, |
|
"eval_steps_per_second": 3.368, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.00681124673060157, |
|
"grad_norm": 1.2372488975524902, |
|
"learning_rate": 9.999411508282477e-06, |
|
"loss": 10.1484, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.00681124673060157, |
|
"eval_accuracy": 0.07246410417477417, |
|
"eval_loss": 10.109375, |
|
"eval_runtime": 278.7434, |
|
"eval_samples_per_second": 121.14, |
|
"eval_steps_per_second": 3.365, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.006865736704446382, |
|
"grad_norm": 1.2289838790893555, |
|
"learning_rate": 9.999406059285092e-06, |
|
"loss": 10.1406, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.006865736704446382, |
|
"eval_accuracy": 0.07261460944432581, |
|
"eval_loss": 10.109375, |
|
"eval_runtime": 279.5991, |
|
"eval_samples_per_second": 120.769, |
|
"eval_steps_per_second": 3.355, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.006920226678291195, |
|
"grad_norm": 1.231143832206726, |
|
"learning_rate": 9.999400610287709e-06, |
|
"loss": 10.1406, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.006920226678291195, |
|
"eval_accuracy": 0.07277023866708203, |
|
"eval_loss": 10.1015625, |
|
"eval_runtime": 279.4963, |
|
"eval_samples_per_second": 120.814, |
|
"eval_steps_per_second": 3.356, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.006974716652136007, |
|
"grad_norm": 1.3046151399612427, |
|
"learning_rate": 9.999395161290324e-06, |
|
"loss": 10.125, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.006974716652136007, |
|
"eval_accuracy": 0.07291527259677114, |
|
"eval_loss": 10.1015625, |
|
"eval_runtime": 280.7304, |
|
"eval_samples_per_second": 120.283, |
|
"eval_steps_per_second": 3.341, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.00702920662598082, |
|
"grad_norm": 1.2855069637298584, |
|
"learning_rate": 9.999389712292939e-06, |
|
"loss": 10.1172, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.00702920662598082, |
|
"eval_accuracy": 0.07305231663332767, |
|
"eval_loss": 10.09375, |
|
"eval_runtime": 279.8824, |
|
"eval_samples_per_second": 120.647, |
|
"eval_steps_per_second": 3.351, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.007083696599825632, |
|
"grad_norm": 1.282074213027954, |
|
"learning_rate": 9.999384263295554e-06, |
|
"loss": 10.1016, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.007083696599825632, |
|
"eval_accuracy": 0.07317312034362562, |
|
"eval_loss": 10.09375, |
|
"eval_runtime": 279.9541, |
|
"eval_samples_per_second": 120.616, |
|
"eval_steps_per_second": 3.351, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.007138186573670445, |
|
"grad_norm": 1.218042254447937, |
|
"learning_rate": 9.999378814298169e-06, |
|
"loss": 10.1172, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.007138186573670445, |
|
"eval_accuracy": 0.07330454829587883, |
|
"eval_loss": 10.0859375, |
|
"eval_runtime": 280.2317, |
|
"eval_samples_per_second": 120.497, |
|
"eval_steps_per_second": 3.347, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.007192676547515257, |
|
"grad_norm": 1.2612570524215698, |
|
"learning_rate": 9.999373365300786e-06, |
|
"loss": 10.1172, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.007192676547515257, |
|
"eval_accuracy": 0.07344506619901475, |
|
"eval_loss": 10.0859375, |
|
"eval_runtime": 280.7991, |
|
"eval_samples_per_second": 120.253, |
|
"eval_steps_per_second": 3.34, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.00724716652136007, |
|
"grad_norm": 1.2458122968673706, |
|
"learning_rate": 9.999367916303401e-06, |
|
"loss": 10.1172, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.00724716652136007, |
|
"eval_accuracy": 0.07357432298465584, |
|
"eval_loss": 10.0859375, |
|
"eval_runtime": 281.5847, |
|
"eval_samples_per_second": 119.918, |
|
"eval_steps_per_second": 3.331, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.007301656495204882, |
|
"grad_norm": 1.2749879360198975, |
|
"learning_rate": 9.999362467306016e-06, |
|
"loss": 10.0938, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.007301656495204882, |
|
"eval_accuracy": 0.07369055277062427, |
|
"eval_loss": 10.078125, |
|
"eval_runtime": 279.1661, |
|
"eval_samples_per_second": 120.957, |
|
"eval_steps_per_second": 3.36, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.007356146469049695, |
|
"grad_norm": 1.2771598100662231, |
|
"learning_rate": 9.999357018308633e-06, |
|
"loss": 10.1094, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.007356146469049695, |
|
"eval_accuracy": 0.07383138911152996, |
|
"eval_loss": 10.078125, |
|
"eval_runtime": 280.6696, |
|
"eval_samples_per_second": 120.309, |
|
"eval_steps_per_second": 3.342, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0074106364428945075, |
|
"grad_norm": 1.2409011125564575, |
|
"learning_rate": 9.999351569311248e-06, |
|
"loss": 10.1094, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.0074106364428945075, |
|
"eval_accuracy": 0.07399631092738602, |
|
"eval_loss": 10.0703125, |
|
"eval_runtime": 279.9187, |
|
"eval_samples_per_second": 120.631, |
|
"eval_steps_per_second": 3.351, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.00746512641673932, |
|
"grad_norm": 1.3785778284072876, |
|
"learning_rate": 9.999346120313863e-06, |
|
"loss": 10.0703, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.00746512641673932, |
|
"eval_accuracy": 0.07418262597159345, |
|
"eval_loss": 10.0703125, |
|
"eval_runtime": 278.8604, |
|
"eval_samples_per_second": 121.089, |
|
"eval_steps_per_second": 3.364, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.0075196163905841325, |
|
"grad_norm": 1.2543950080871582, |
|
"learning_rate": 9.99934067131648e-06, |
|
"loss": 10.0781, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.0075196163905841325, |
|
"eval_accuracy": 0.07433802360324437, |
|
"eval_loss": 10.0625, |
|
"eval_runtime": 278.9667, |
|
"eval_samples_per_second": 121.043, |
|
"eval_steps_per_second": 3.362, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.007574106364428945, |
|
"grad_norm": 1.2550749778747559, |
|
"learning_rate": 9.999335222319094e-06, |
|
"loss": 10.0781, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.007574106364428945, |
|
"eval_accuracy": 0.07445636665804858, |
|
"eval_loss": 10.0625, |
|
"eval_runtime": 280.3577, |
|
"eval_samples_per_second": 120.443, |
|
"eval_steps_per_second": 3.346, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.007628596338273758, |
|
"grad_norm": 1.2808057069778442, |
|
"learning_rate": 9.99932977332171e-06, |
|
"loss": 10.0781, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.007628596338273758, |
|
"eval_accuracy": 0.07457149638626687, |
|
"eval_loss": 10.0546875, |
|
"eval_runtime": 278.8572, |
|
"eval_samples_per_second": 121.091, |
|
"eval_steps_per_second": 3.364, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.00768308631211857, |
|
"grad_norm": 1.2947779893875122, |
|
"learning_rate": 9.999324324324326e-06, |
|
"loss": 10.0625, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.00768308631211857, |
|
"eval_accuracy": 0.0747105089472184, |
|
"eval_loss": 10.0546875, |
|
"eval_runtime": 278.2341, |
|
"eval_samples_per_second": 121.362, |
|
"eval_steps_per_second": 3.371, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.007737576285963383, |
|
"grad_norm": 1.2656594514846802, |
|
"learning_rate": 9.999318875326941e-06, |
|
"loss": 10.0781, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.007737576285963383, |
|
"eval_accuracy": 0.07485380594361782, |
|
"eval_loss": 10.046875, |
|
"eval_runtime": 279.1203, |
|
"eval_samples_per_second": 120.976, |
|
"eval_steps_per_second": 3.361, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.007792066259808195, |
|
"grad_norm": 1.3355940580368042, |
|
"learning_rate": 9.999313426329556e-06, |
|
"loss": 10.0391, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.007792066259808195, |
|
"eval_accuracy": 0.07498022573821908, |
|
"eval_loss": 10.046875, |
|
"eval_runtime": 279.5611, |
|
"eval_samples_per_second": 120.786, |
|
"eval_steps_per_second": 3.355, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.007846556233653008, |
|
"grad_norm": 1.2356702089309692, |
|
"learning_rate": 9.999307977332171e-06, |
|
"loss": 10.0703, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.007846556233653008, |
|
"eval_accuracy": 0.07511967253249303, |
|
"eval_loss": 10.046875, |
|
"eval_runtime": 279.9704, |
|
"eval_samples_per_second": 120.609, |
|
"eval_steps_per_second": 3.35, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.007901046207497821, |
|
"grad_norm": 1.2823630571365356, |
|
"learning_rate": 9.999302528334788e-06, |
|
"loss": 10.0391, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.007901046207497821, |
|
"eval_accuracy": 0.07525402432245056, |
|
"eval_loss": 10.0390625, |
|
"eval_runtime": 279.3818, |
|
"eval_samples_per_second": 120.863, |
|
"eval_steps_per_second": 3.357, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.007955536181342633, |
|
"grad_norm": 1.2719727754592896, |
|
"learning_rate": 9.999297079337401e-06, |
|
"loss": 10.0469, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.007955536181342633, |
|
"eval_accuracy": 0.07538672602578286, |
|
"eval_loss": 10.0390625, |
|
"eval_runtime": 280.0065, |
|
"eval_samples_per_second": 120.594, |
|
"eval_steps_per_second": 3.35, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.008010026155187446, |
|
"grad_norm": 1.2450186014175415, |
|
"learning_rate": 9.999291630340018e-06, |
|
"loss": 10.0547, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.008010026155187446, |
|
"eval_accuracy": 0.07551505644700279, |
|
"eval_loss": 10.03125, |
|
"eval_runtime": 279.889, |
|
"eval_samples_per_second": 120.644, |
|
"eval_steps_per_second": 3.351, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.008064516129032258, |
|
"grad_norm": 1.2184418439865112, |
|
"learning_rate": 9.999286181342633e-06, |
|
"loss": 10.0703, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.008064516129032258, |
|
"eval_accuracy": 0.07562749392862206, |
|
"eval_loss": 10.03125, |
|
"eval_runtime": 278.8225, |
|
"eval_samples_per_second": 121.106, |
|
"eval_steps_per_second": 3.364, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.008119006102877071, |
|
"grad_norm": 1.241905689239502, |
|
"learning_rate": 9.999280732345248e-06, |
|
"loss": 10.0469, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.008119006102877071, |
|
"eval_accuracy": 0.0757480660478147, |
|
"eval_loss": 10.0234375, |
|
"eval_runtime": 279.6408, |
|
"eval_samples_per_second": 120.751, |
|
"eval_steps_per_second": 3.354, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.008173496076721883, |
|
"grad_norm": 1.2825335264205933, |
|
"learning_rate": 9.999275283347865e-06, |
|
"loss": 10.0391, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.008173496076721883, |
|
"eval_accuracy": 0.07586571432930304, |
|
"eval_loss": 10.0234375, |
|
"eval_runtime": 279.96, |
|
"eval_samples_per_second": 120.614, |
|
"eval_steps_per_second": 3.35, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.008227986050566696, |
|
"grad_norm": 1.2704492807388306, |
|
"learning_rate": 9.99926983435048e-06, |
|
"loss": 10.0391, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.008227986050566696, |
|
"eval_accuracy": 0.07599531850160207, |
|
"eval_loss": 10.015625, |
|
"eval_runtime": 278.0769, |
|
"eval_samples_per_second": 121.43, |
|
"eval_steps_per_second": 3.373, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.008282476024411508, |
|
"grad_norm": 1.2123051881790161, |
|
"learning_rate": 9.999264385353095e-06, |
|
"loss": 10.0391, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.008282476024411508, |
|
"eval_accuracy": 0.07611594851857105, |
|
"eval_loss": 10.015625, |
|
"eval_runtime": 279.8596, |
|
"eval_samples_per_second": 120.657, |
|
"eval_steps_per_second": 3.352, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.008336965998256321, |
|
"grad_norm": 1.258999228477478, |
|
"learning_rate": 9.999258936355712e-06, |
|
"loss": 10.0391, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.008336965998256321, |
|
"eval_accuracy": 0.07623133878678279, |
|
"eval_loss": 10.015625, |
|
"eval_runtime": 279.3191, |
|
"eval_samples_per_second": 120.89, |
|
"eval_steps_per_second": 3.358, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.008391455972101133, |
|
"grad_norm": 1.194441795349121, |
|
"learning_rate": 9.999253487358327e-06, |
|
"loss": 10.0469, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.008391455972101133, |
|
"eval_accuracy": 0.07634800280607362, |
|
"eval_loss": 10.0078125, |
|
"eval_runtime": 279.5107, |
|
"eval_samples_per_second": 120.808, |
|
"eval_steps_per_second": 3.356, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.008445945945945946, |
|
"grad_norm": 1.289880633354187, |
|
"learning_rate": 9.999248038360942e-06, |
|
"loss": 10.0312, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.008445945945945946, |
|
"eval_accuracy": 0.07645618480113316, |
|
"eval_loss": 10.0078125, |
|
"eval_runtime": 279.6787, |
|
"eval_samples_per_second": 120.735, |
|
"eval_steps_per_second": 3.354, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.008500435919790758, |
|
"grad_norm": 1.318518042564392, |
|
"learning_rate": 9.999242589363559e-06, |
|
"loss": 9.9844, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.008500435919790758, |
|
"eval_accuracy": 0.07656841964053529, |
|
"eval_loss": 10.0, |
|
"eval_runtime": 279.7302, |
|
"eval_samples_per_second": 120.713, |
|
"eval_steps_per_second": 3.353, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.008554925893635571, |
|
"grad_norm": 1.3092557191848755, |
|
"learning_rate": 9.999237140366174e-06, |
|
"loss": 10.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.008554925893635571, |
|
"eval_accuracy": 0.0766785701599898, |
|
"eval_loss": 10.0, |
|
"eval_runtime": 279.5756, |
|
"eval_samples_per_second": 120.779, |
|
"eval_steps_per_second": 3.355, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.008609415867480383, |
|
"grad_norm": 1.2869155406951904, |
|
"learning_rate": 9.999231691368789e-06, |
|
"loss": 10.0078, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.008609415867480383, |
|
"eval_accuracy": 0.07677942808634446, |
|
"eval_loss": 9.9921875, |
|
"eval_runtime": 281.549, |
|
"eval_samples_per_second": 119.933, |
|
"eval_steps_per_second": 3.332, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.008663905841325197, |
|
"grad_norm": 1.2500320672988892, |
|
"learning_rate": 9.999226242371405e-06, |
|
"loss": 10.0078, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.008663905841325197, |
|
"eval_accuracy": 0.07687990967715302, |
|
"eval_loss": 9.9921875, |
|
"eval_runtime": 280.7605, |
|
"eval_samples_per_second": 120.27, |
|
"eval_steps_per_second": 3.341, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.008718395815170008, |
|
"grad_norm": 1.2038897275924683, |
|
"learning_rate": 9.99922079337402e-06, |
|
"loss": 10.0234, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.008718395815170008, |
|
"eval_accuracy": 0.07697283560815144, |
|
"eval_loss": 9.9921875, |
|
"eval_runtime": 278.5618, |
|
"eval_samples_per_second": 121.219, |
|
"eval_steps_per_second": 3.367, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.008772885789014822, |
|
"grad_norm": 1.2646440267562866, |
|
"learning_rate": 9.999215344376635e-06, |
|
"loss": 9.9922, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.008772885789014822, |
|
"eval_accuracy": 0.07707207239676907, |
|
"eval_loss": 9.984375, |
|
"eval_runtime": 279.0044, |
|
"eval_samples_per_second": 121.027, |
|
"eval_steps_per_second": 3.362, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.008827375762859633, |
|
"grad_norm": 1.2453374862670898, |
|
"learning_rate": 9.999209895379252e-06, |
|
"loss": 9.9922, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.008827375762859633, |
|
"eval_accuracy": 0.07716395616779366, |
|
"eval_loss": 9.984375, |
|
"eval_runtime": 280.4701, |
|
"eval_samples_per_second": 120.394, |
|
"eval_steps_per_second": 3.344, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.008881865736704447, |
|
"grad_norm": 1.3001788854599, |
|
"learning_rate": 9.999204446381865e-06, |
|
"loss": 9.9766, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.008881865736704447, |
|
"eval_accuracy": 0.07725401615886408, |
|
"eval_loss": 9.9765625, |
|
"eval_runtime": 280.3097, |
|
"eval_samples_per_second": 120.463, |
|
"eval_steps_per_second": 3.346, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.008936355710549258, |
|
"grad_norm": 1.2622265815734863, |
|
"learning_rate": 9.999198997384482e-06, |
|
"loss": 9.9922, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.008936355710549258, |
|
"eval_accuracy": 0.07734222342109218, |
|
"eval_loss": 9.9765625, |
|
"eval_runtime": 279.8973, |
|
"eval_samples_per_second": 120.641, |
|
"eval_steps_per_second": 3.351, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.008990845684394072, |
|
"grad_norm": 1.3013081550598145, |
|
"learning_rate": 9.999193548387097e-06, |
|
"loss": 9.9766, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.008990845684394072, |
|
"eval_accuracy": 0.07743468616988, |
|
"eval_loss": 9.96875, |
|
"eval_runtime": 280.2864, |
|
"eval_samples_per_second": 120.473, |
|
"eval_steps_per_second": 3.347, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.009045335658238883, |
|
"grad_norm": 1.258622407913208, |
|
"learning_rate": 9.999188099389712e-06, |
|
"loss": 9.9844, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.009045335658238883, |
|
"eval_accuracy": 0.07752159073214084, |
|
"eval_loss": 9.96875, |
|
"eval_runtime": 280.7557, |
|
"eval_samples_per_second": 120.272, |
|
"eval_steps_per_second": 3.341, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.009099825632083697, |
|
"grad_norm": 1.2523554563522339, |
|
"learning_rate": 9.999182650392329e-06, |
|
"loss": 9.9766, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.009099825632083697, |
|
"eval_accuracy": 0.07759642360803831, |
|
"eval_loss": 9.96875, |
|
"eval_runtime": 281.1831, |
|
"eval_samples_per_second": 120.089, |
|
"eval_steps_per_second": 3.336, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.009154315605928508, |
|
"grad_norm": 1.2290598154067993, |
|
"learning_rate": 9.999177201394944e-06, |
|
"loss": 9.9844, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.009154315605928508, |
|
"eval_accuracy": 0.07767707521045625, |
|
"eval_loss": 9.9609375, |
|
"eval_runtime": 281.1351, |
|
"eval_samples_per_second": 120.11, |
|
"eval_steps_per_second": 3.336, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.009208805579773322, |
|
"grad_norm": 1.2900922298431396, |
|
"learning_rate": 9.999171752397559e-06, |
|
"loss": 9.9609, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.009208805579773322, |
|
"eval_accuracy": 0.07775179229080108, |
|
"eval_loss": 9.9609375, |
|
"eval_runtime": 280.4797, |
|
"eval_samples_per_second": 120.39, |
|
"eval_steps_per_second": 3.344, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.009263295553618133, |
|
"grad_norm": 1.246096134185791, |
|
"learning_rate": 9.999166303400176e-06, |
|
"loss": 9.9766, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.009263295553618133, |
|
"eval_accuracy": 0.07782975164662, |
|
"eval_loss": 9.953125, |
|
"eval_runtime": 278.9976, |
|
"eval_samples_per_second": 121.03, |
|
"eval_steps_per_second": 3.362, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.009317785527462947, |
|
"grad_norm": 1.2970362901687622, |
|
"learning_rate": 9.99916085440279e-06, |
|
"loss": 9.9531, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.009317785527462947, |
|
"eval_accuracy": 0.07790701622912304, |
|
"eval_loss": 9.953125, |
|
"eval_runtime": 281.1285, |
|
"eval_samples_per_second": 120.112, |
|
"eval_steps_per_second": 3.337, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.009372275501307759, |
|
"grad_norm": 1.204124093055725, |
|
"learning_rate": 9.999155405405406e-06, |
|
"loss": 9.9922, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.009372275501307759, |
|
"eval_accuracy": 0.07798121222948096, |
|
"eval_loss": 9.953125, |
|
"eval_runtime": 280.5802, |
|
"eval_samples_per_second": 120.347, |
|
"eval_steps_per_second": 3.343, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.009426765475152572, |
|
"grad_norm": 1.29256010055542, |
|
"learning_rate": 9.999149956408023e-06, |
|
"loss": 9.9531, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.009426765475152572, |
|
"eval_accuracy": 0.07805448186541772, |
|
"eval_loss": 9.9453125, |
|
"eval_runtime": 279.9622, |
|
"eval_samples_per_second": 120.613, |
|
"eval_steps_per_second": 3.35, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.009481255448997385, |
|
"grad_norm": 1.3079378604888916, |
|
"learning_rate": 9.999144507410638e-06, |
|
"loss": 9.9375, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.009481255448997385, |
|
"eval_accuracy": 0.07812670934138066, |
|
"eval_loss": 9.9453125, |
|
"eval_runtime": 281.3535, |
|
"eval_samples_per_second": 120.016, |
|
"eval_steps_per_second": 3.334, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.009535745422842197, |
|
"grad_norm": 1.2296249866485596, |
|
"learning_rate": 9.999139058413253e-06, |
|
"loss": 9.9688, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.009535745422842197, |
|
"eval_accuracy": 0.07820545031717994, |
|
"eval_loss": 9.9375, |
|
"eval_runtime": 281.384, |
|
"eval_samples_per_second": 120.003, |
|
"eval_steps_per_second": 3.334, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.00959023539668701, |
|
"grad_norm": 1.2954541444778442, |
|
"learning_rate": 9.99913360941587e-06, |
|
"loss": 9.9453, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.00959023539668701, |
|
"eval_accuracy": 0.07826963000223398, |
|
"eval_loss": 9.9375, |
|
"eval_runtime": 281.0155, |
|
"eval_samples_per_second": 120.161, |
|
"eval_steps_per_second": 3.338, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.009644725370531822, |
|
"grad_norm": 1.2820461988449097, |
|
"learning_rate": 9.999128160418484e-06, |
|
"loss": 9.9375, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.009644725370531822, |
|
"eval_accuracy": 0.0783392520782624, |
|
"eval_loss": 9.9375, |
|
"eval_runtime": 280.6598, |
|
"eval_samples_per_second": 120.313, |
|
"eval_steps_per_second": 3.342, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.009699215344376635, |
|
"grad_norm": 1.278702974319458, |
|
"learning_rate": 9.9991227114211e-06, |
|
"loss": 9.9375, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.009699215344376635, |
|
"eval_accuracy": 0.07841200063421225, |
|
"eval_loss": 9.9296875, |
|
"eval_runtime": 279.7691, |
|
"eval_samples_per_second": 120.696, |
|
"eval_steps_per_second": 3.353, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.009753705318221447, |
|
"grad_norm": 1.253942847251892, |
|
"learning_rate": 9.999117262423716e-06, |
|
"loss": 9.9453, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.009753705318221447, |
|
"eval_accuracy": 0.07850298698970383, |
|
"eval_loss": 9.9296875, |
|
"eval_runtime": 279.9236, |
|
"eval_samples_per_second": 120.629, |
|
"eval_steps_per_second": 3.351, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.00980819529206626, |
|
"grad_norm": 1.2406857013702393, |
|
"learning_rate": 9.99911181342633e-06, |
|
"loss": 9.9453, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.00980819529206626, |
|
"eval_accuracy": 0.078591744280807, |
|
"eval_loss": 9.921875, |
|
"eval_runtime": 280.2024, |
|
"eval_samples_per_second": 120.509, |
|
"eval_steps_per_second": 3.348, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.009862685265911072, |
|
"grad_norm": 1.2425099611282349, |
|
"learning_rate": 9.999106364428946e-06, |
|
"loss": 9.9297, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.009862685265911072, |
|
"eval_accuracy": 0.07866854568109945, |
|
"eval_loss": 9.921875, |
|
"eval_runtime": 278.8945, |
|
"eval_samples_per_second": 121.074, |
|
"eval_steps_per_second": 3.363, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.009917175239755886, |
|
"grad_norm": 1.2617361545562744, |
|
"learning_rate": 9.999100915431561e-06, |
|
"loss": 9.9375, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.009917175239755886, |
|
"eval_accuracy": 0.07874769194133299, |
|
"eval_loss": 9.9140625, |
|
"eval_runtime": 281.1799, |
|
"eval_samples_per_second": 120.09, |
|
"eval_steps_per_second": 3.336, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.009971665213600697, |
|
"grad_norm": 1.2626757621765137, |
|
"learning_rate": 9.999095466434176e-06, |
|
"loss": 9.9375, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.009971665213600697, |
|
"eval_accuracy": 0.07882507231938868, |
|
"eval_loss": 9.9140625, |
|
"eval_runtime": 280.0578, |
|
"eval_samples_per_second": 120.572, |
|
"eval_steps_per_second": 3.349, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.01002615518744551, |
|
"grad_norm": 1.312477469444275, |
|
"learning_rate": 9.999090017436791e-06, |
|
"loss": 9.8984, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.01002615518744551, |
|
"eval_accuracy": 0.07890256849299702, |
|
"eval_loss": 9.9140625, |
|
"eval_runtime": 279.5807, |
|
"eval_samples_per_second": 120.777, |
|
"eval_steps_per_second": 3.355, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.010080645161290322, |
|
"grad_norm": 1.2198225259780884, |
|
"learning_rate": 9.999084568439408e-06, |
|
"loss": 9.9375, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.010080645161290322, |
|
"eval_accuracy": 0.07897815403998669, |
|
"eval_loss": 9.90625, |
|
"eval_runtime": 281.1433, |
|
"eval_samples_per_second": 120.106, |
|
"eval_steps_per_second": 3.336, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.010135135135135136, |
|
"grad_norm": 1.2455923557281494, |
|
"learning_rate": 9.999079119442023e-06, |
|
"loss": 9.9297, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.010135135135135136, |
|
"eval_accuracy": 0.07905443436029225, |
|
"eval_loss": 9.90625, |
|
"eval_runtime": 280.4858, |
|
"eval_samples_per_second": 120.388, |
|
"eval_steps_per_second": 3.344, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.010189625108979947, |
|
"grad_norm": 1.2409745454788208, |
|
"learning_rate": 9.999073670444638e-06, |
|
"loss": 9.9297, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.010189625108979947, |
|
"eval_accuracy": 0.07912625655182093, |
|
"eval_loss": 9.8984375, |
|
"eval_runtime": 280.6787, |
|
"eval_samples_per_second": 120.305, |
|
"eval_steps_per_second": 3.342, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.01024411508282476, |
|
"grad_norm": 1.2540324926376343, |
|
"learning_rate": 9.999068221447255e-06, |
|
"loss": 9.9141, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.01024411508282476, |
|
"eval_accuracy": 0.07919081257242107, |
|
"eval_loss": 9.8984375, |
|
"eval_runtime": 281.0078, |
|
"eval_samples_per_second": 120.164, |
|
"eval_steps_per_second": 3.338, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.010298605056669572, |
|
"grad_norm": 1.2172249555587769, |
|
"learning_rate": 9.99906277244987e-06, |
|
"loss": 9.9219, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.010298605056669572, |
|
"eval_accuracy": 0.07925064992425089, |
|
"eval_loss": 9.8984375, |
|
"eval_runtime": 280.6094, |
|
"eval_samples_per_second": 120.335, |
|
"eval_steps_per_second": 3.343, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.010353095030514386, |
|
"grad_norm": 1.2848864793777466, |
|
"learning_rate": 9.999057323452485e-06, |
|
"loss": 9.8984, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.010353095030514386, |
|
"eval_accuracy": 0.07933312530662301, |
|
"eval_loss": 9.890625, |
|
"eval_runtime": 280.1069, |
|
"eval_samples_per_second": 120.55, |
|
"eval_steps_per_second": 3.349, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.010407585004359197, |
|
"grad_norm": 1.3001790046691895, |
|
"learning_rate": 9.999051874455102e-06, |
|
"loss": 9.8828, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.010407585004359197, |
|
"eval_accuracy": 0.07942619598206223, |
|
"eval_loss": 9.890625, |
|
"eval_runtime": 279.216, |
|
"eval_samples_per_second": 120.935, |
|
"eval_steps_per_second": 3.359, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.010462074978204011, |
|
"grad_norm": 1.2808548212051392, |
|
"learning_rate": 9.999046425457717e-06, |
|
"loss": 9.8984, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.010462074978204011, |
|
"eval_accuracy": 0.07953382794824668, |
|
"eval_loss": 9.8828125, |
|
"eval_runtime": 279.7965, |
|
"eval_samples_per_second": 120.684, |
|
"eval_steps_per_second": 3.352, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.010516564952048823, |
|
"grad_norm": 1.2758278846740723, |
|
"learning_rate": 9.999040976460332e-06, |
|
"loss": 9.8906, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.010516564952048823, |
|
"eval_accuracy": 0.07961514537509233, |
|
"eval_loss": 9.8828125, |
|
"eval_runtime": 280.4068, |
|
"eval_samples_per_second": 120.421, |
|
"eval_steps_per_second": 3.345, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.010571054925893636, |
|
"grad_norm": 1.2733352184295654, |
|
"learning_rate": 9.999035527462949e-06, |
|
"loss": 9.9062, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.010571054925893636, |
|
"eval_accuracy": 0.07969660754637879, |
|
"eval_loss": 9.8828125, |
|
"eval_runtime": 280.0801, |
|
"eval_samples_per_second": 120.562, |
|
"eval_steps_per_second": 3.349, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.010625544899738448, |
|
"grad_norm": 1.3207552433013916, |
|
"learning_rate": 9.999030078465564e-06, |
|
"loss": 9.875, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.010625544899738448, |
|
"eval_accuracy": 0.07977876449098113, |
|
"eval_loss": 9.875, |
|
"eval_runtime": 279.7352, |
|
"eval_samples_per_second": 120.711, |
|
"eval_steps_per_second": 3.353, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.010680034873583261, |
|
"grad_norm": 1.3311365842819214, |
|
"learning_rate": 9.999024629468179e-06, |
|
"loss": 9.8594, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.010680034873583261, |
|
"eval_accuracy": 0.07984216255605482, |
|
"eval_loss": 9.875, |
|
"eval_runtime": 280.0513, |
|
"eval_samples_per_second": 120.574, |
|
"eval_steps_per_second": 3.349, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.010734524847428073, |
|
"grad_norm": 1.2795319557189941, |
|
"learning_rate": 9.999019180470794e-06, |
|
"loss": 9.8828, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.010734524847428073, |
|
"eval_accuracy": 0.07991207412096483, |
|
"eval_loss": 9.875, |
|
"eval_runtime": 279.4752, |
|
"eval_samples_per_second": 120.823, |
|
"eval_steps_per_second": 3.356, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.010789014821272886, |
|
"grad_norm": 1.2289413213729858, |
|
"learning_rate": 9.999013731473409e-06, |
|
"loss": 9.8984, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.010789014821272886, |
|
"eval_accuracy": 0.07998430159692778, |
|
"eval_loss": 9.8671875, |
|
"eval_runtime": 279.5837, |
|
"eval_samples_per_second": 120.776, |
|
"eval_steps_per_second": 3.355, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.010843504795117698, |
|
"grad_norm": 1.233830451965332, |
|
"learning_rate": 9.999008282476025e-06, |
|
"loss": 9.8906, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.010843504795117698, |
|
"eval_accuracy": 0.0800536920818509, |
|
"eval_loss": 9.8671875, |
|
"eval_runtime": 279.9047, |
|
"eval_samples_per_second": 120.637, |
|
"eval_steps_per_second": 3.351, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.010897994768962511, |
|
"grad_norm": 1.2242302894592285, |
|
"learning_rate": 9.99900283347864e-06, |
|
"loss": 9.9062, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.010897994768962511, |
|
"eval_accuracy": 0.08014829704836268, |
|
"eval_loss": 9.859375, |
|
"eval_runtime": 278.8987, |
|
"eval_samples_per_second": 121.073, |
|
"eval_steps_per_second": 3.363, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.010952484742807323, |
|
"grad_norm": 1.2602059841156006, |
|
"learning_rate": 9.998997384481255e-06, |
|
"loss": 9.8672, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.010952484742807323, |
|
"eval_accuracy": 0.08021710855552257, |
|
"eval_loss": 9.859375, |
|
"eval_runtime": 279.1708, |
|
"eval_samples_per_second": 120.955, |
|
"eval_steps_per_second": 3.36, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.011006974716652136, |
|
"grad_norm": 1.2747395038604736, |
|
"learning_rate": 9.998991935483872e-06, |
|
"loss": 9.8672, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.011006974716652136, |
|
"eval_accuracy": 0.08028279358276101, |
|
"eval_loss": 9.859375, |
|
"eval_runtime": 279.6011, |
|
"eval_samples_per_second": 120.768, |
|
"eval_steps_per_second": 3.355, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.011061464690496948, |
|
"grad_norm": 1.2887182235717773, |
|
"learning_rate": 9.998986486486487e-06, |
|
"loss": 9.8906, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.011061464690496948, |
|
"eval_accuracy": 0.08035612111647408, |
|
"eval_loss": 9.8515625, |
|
"eval_runtime": 278.7511, |
|
"eval_samples_per_second": 121.137, |
|
"eval_steps_per_second": 3.365, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.011115954664341761, |
|
"grad_norm": 1.2620630264282227, |
|
"learning_rate": 9.998981037489102e-06, |
|
"loss": 9.8828, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.011115954664341761, |
|
"eval_accuracy": 0.08042151665483091, |
|
"eval_loss": 9.8515625, |
|
"eval_runtime": 280.8141, |
|
"eval_samples_per_second": 120.247, |
|
"eval_steps_per_second": 3.34, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.011170444638186573, |
|
"grad_norm": 1.204312801361084, |
|
"learning_rate": 9.998975588491719e-06, |
|
"loss": 9.8906, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.011170444638186573, |
|
"eval_accuracy": 0.08048413309992423, |
|
"eval_loss": 9.84375, |
|
"eval_runtime": 278.2837, |
|
"eval_samples_per_second": 121.34, |
|
"eval_steps_per_second": 3.371, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.011224934612031386, |
|
"grad_norm": 1.2498817443847656, |
|
"learning_rate": 9.998970139494334e-06, |
|
"loss": 9.8828, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.011224934612031386, |
|
"eval_accuracy": 0.08053965706741799, |
|
"eval_loss": 9.84375, |
|
"eval_runtime": 280.6201, |
|
"eval_samples_per_second": 120.33, |
|
"eval_steps_per_second": 3.343, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.011279424585876198, |
|
"grad_norm": 1.2628120183944702, |
|
"learning_rate": 9.998964690496949e-06, |
|
"loss": 9.8594, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.011279424585876198, |
|
"eval_accuracy": 0.08057526419985664, |
|
"eval_loss": 9.84375, |
|
"eval_runtime": 281.1793, |
|
"eval_samples_per_second": 120.091, |
|
"eval_steps_per_second": 3.336, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.011333914559721011, |
|
"grad_norm": 1.286372184753418, |
|
"learning_rate": 9.998959241499566e-06, |
|
"loss": 9.875, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.011333914559721011, |
|
"eval_accuracy": 0.0806213797786979, |
|
"eval_loss": 9.8359375, |
|
"eval_runtime": 279.1646, |
|
"eval_samples_per_second": 120.957, |
|
"eval_steps_per_second": 3.36, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.011388404533565825, |
|
"grad_norm": 1.2716000080108643, |
|
"learning_rate": 9.99895379250218e-06, |
|
"loss": 9.8594, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.011388404533565825, |
|
"eval_accuracy": 0.0807168821607427, |
|
"eval_loss": 9.8359375, |
|
"eval_runtime": 279.4887, |
|
"eval_samples_per_second": 120.817, |
|
"eval_steps_per_second": 3.356, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.011442894507410636, |
|
"grad_norm": 1.2634248733520508, |
|
"learning_rate": 9.998948343504796e-06, |
|
"loss": 9.8516, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.011442894507410636, |
|
"eval_accuracy": 0.08081287667388623, |
|
"eval_loss": 9.828125, |
|
"eval_runtime": 280.2441, |
|
"eval_samples_per_second": 120.491, |
|
"eval_steps_per_second": 3.347, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.01149738448125545, |
|
"grad_norm": 1.2706140279769897, |
|
"learning_rate": 9.998942894507413e-06, |
|
"loss": 9.8359, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.01149738448125545, |
|
"eval_accuracy": 0.08091327141803031, |
|
"eval_loss": 9.828125, |
|
"eval_runtime": 279.7835, |
|
"eval_samples_per_second": 120.69, |
|
"eval_steps_per_second": 3.353, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.011551874455100261, |
|
"grad_norm": 1.3309648036956787, |
|
"learning_rate": 9.998937445510028e-06, |
|
"loss": 9.8281, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.011551874455100261, |
|
"eval_accuracy": 0.08096497413228675, |
|
"eval_loss": 9.828125, |
|
"eval_runtime": 279.8407, |
|
"eval_samples_per_second": 120.665, |
|
"eval_steps_per_second": 3.352, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.011606364428945075, |
|
"grad_norm": 1.228298544883728, |
|
"learning_rate": 9.998931996512643e-06, |
|
"loss": 9.8516, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.011606364428945075, |
|
"eval_accuracy": 0.08100848431119348, |
|
"eval_loss": 9.8203125, |
|
"eval_runtime": 278.724, |
|
"eval_samples_per_second": 121.149, |
|
"eval_steps_per_second": 3.365, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.011660854402789887, |
|
"grad_norm": 1.2184377908706665, |
|
"learning_rate": 9.998926547515258e-06, |
|
"loss": 9.8516, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.011660854402789887, |
|
"eval_accuracy": 0.0810596080476867, |
|
"eval_loss": 9.8203125, |
|
"eval_runtime": 279.3251, |
|
"eval_samples_per_second": 120.888, |
|
"eval_steps_per_second": 3.358, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.0117153443766347, |
|
"grad_norm": 1.311761498451233, |
|
"learning_rate": 9.998921098517873e-06, |
|
"loss": 9.8281, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.0117153443766347, |
|
"eval_accuracy": 0.08114680209882913, |
|
"eval_loss": 9.8203125, |
|
"eval_runtime": 279.8598, |
|
"eval_samples_per_second": 120.657, |
|
"eval_steps_per_second": 3.352, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.011769834350479512, |
|
"grad_norm": 1.2210040092468262, |
|
"learning_rate": 9.99891564952049e-06, |
|
"loss": 9.8438, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.011769834350479512, |
|
"eval_accuracy": 0.08124644417188101, |
|
"eval_loss": 9.8125, |
|
"eval_runtime": 280.305, |
|
"eval_samples_per_second": 120.465, |
|
"eval_steps_per_second": 3.346, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.011824324324324325, |
|
"grad_norm": 1.2239922285079956, |
|
"learning_rate": 9.998910200523105e-06, |
|
"loss": 9.8359, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.011824324324324325, |
|
"eval_accuracy": 0.08132301398106818, |
|
"eval_loss": 9.8125, |
|
"eval_runtime": 280.4951, |
|
"eval_samples_per_second": 120.384, |
|
"eval_steps_per_second": 3.344, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.011878814298169137, |
|
"grad_norm": 1.26374089717865, |
|
"learning_rate": 9.99890475152572e-06, |
|
"loss": 9.8281, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.011878814298169137, |
|
"eval_accuracy": 0.08139370716595856, |
|
"eval_loss": 9.8046875, |
|
"eval_runtime": 280.3905, |
|
"eval_samples_per_second": 120.428, |
|
"eval_steps_per_second": 3.345, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.01193330427201395, |
|
"grad_norm": 1.255954384803772, |
|
"learning_rate": 9.998899302528335e-06, |
|
"loss": 9.8281, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.01193330427201395, |
|
"eval_accuracy": 0.08145580253106498, |
|
"eval_loss": 9.8046875, |
|
"eval_runtime": 278.85, |
|
"eval_samples_per_second": 121.094, |
|
"eval_steps_per_second": 3.364, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.011987794245858762, |
|
"grad_norm": 1.2596890926361084, |
|
"learning_rate": 9.998893853530951e-06, |
|
"loss": 9.8281, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.011987794245858762, |
|
"eval_accuracy": 0.08151057382746654, |
|
"eval_loss": 9.8046875, |
|
"eval_runtime": 280.1905, |
|
"eval_samples_per_second": 120.514, |
|
"eval_steps_per_second": 3.348, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.012042284219703575, |
|
"grad_norm": 1.3286200761795044, |
|
"learning_rate": 9.998888404533566e-06, |
|
"loss": 9.7969, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.012042284219703575, |
|
"eval_accuracy": 0.08155735523073553, |
|
"eval_loss": 9.796875, |
|
"eval_runtime": 279.7838, |
|
"eval_samples_per_second": 120.69, |
|
"eval_steps_per_second": 3.353, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.012096774193548387, |
|
"grad_norm": 1.2406117916107178, |
|
"learning_rate": 9.998882955536181e-06, |
|
"loss": 9.8281, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.012096774193548387, |
|
"eval_accuracy": 0.08160451296955061, |
|
"eval_loss": 9.796875, |
|
"eval_runtime": 279.2237, |
|
"eval_samples_per_second": 120.932, |
|
"eval_steps_per_second": 3.359, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.0121512641673932, |
|
"grad_norm": 1.2828328609466553, |
|
"learning_rate": 9.998877506538798e-06, |
|
"loss": 9.8047, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.0121512641673932, |
|
"eval_accuracy": 0.08167708783217148, |
|
"eval_loss": 9.7890625, |
|
"eval_runtime": 280.1502, |
|
"eval_samples_per_second": 120.532, |
|
"eval_steps_per_second": 3.348, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.012205754141238012, |
|
"grad_norm": 1.3072834014892578, |
|
"learning_rate": 9.998872057541413e-06, |
|
"loss": 9.8047, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.012205754141238012, |
|
"eval_accuracy": 0.08175794207680655, |
|
"eval_loss": 9.7890625, |
|
"eval_runtime": 279.8278, |
|
"eval_samples_per_second": 120.671, |
|
"eval_steps_per_second": 3.352, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.012260244115082825, |
|
"grad_norm": 1.2602524757385254, |
|
"learning_rate": 9.998866608544028e-06, |
|
"loss": 9.8047, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.012260244115082825, |
|
"eval_accuracy": 0.0818193426685971, |
|
"eval_loss": 9.7890625, |
|
"eval_runtime": 279.4782, |
|
"eval_samples_per_second": 120.822, |
|
"eval_steps_per_second": 3.356, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.012314734088927637, |
|
"grad_norm": 1.2501016855239868, |
|
"learning_rate": 9.998861159546645e-06, |
|
"loss": 9.8047, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.012314734088927637, |
|
"eval_accuracy": 0.08185301022552892, |
|
"eval_loss": 9.78125, |
|
"eval_runtime": 279.6131, |
|
"eval_samples_per_second": 120.763, |
|
"eval_steps_per_second": 3.355, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.01236922406277245, |
|
"grad_norm": 1.2058792114257812, |
|
"learning_rate": 9.99885571054926e-06, |
|
"loss": 9.8281, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.01236922406277245, |
|
"eval_accuracy": 0.08187451924943291, |
|
"eval_loss": 9.78125, |
|
"eval_runtime": 279.6288, |
|
"eval_samples_per_second": 120.757, |
|
"eval_steps_per_second": 3.354, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.012423714036617262, |
|
"grad_norm": 1.2502729892730713, |
|
"learning_rate": 9.998850261551875e-06, |
|
"loss": 9.7812, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.012423714036617262, |
|
"eval_accuracy": 0.08191142908183882, |
|
"eval_loss": 9.78125, |
|
"eval_runtime": 279.6067, |
|
"eval_samples_per_second": 120.766, |
|
"eval_steps_per_second": 3.355, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.012478204010462075, |
|
"grad_norm": 1.2606781721115112, |
|
"learning_rate": 9.998844812554492e-06, |
|
"loss": 9.7891, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.012478204010462075, |
|
"eval_accuracy": 0.08197398762915582, |
|
"eval_loss": 9.7734375, |
|
"eval_runtime": 279.4306, |
|
"eval_samples_per_second": 120.842, |
|
"eval_steps_per_second": 3.357, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.012532693984306887, |
|
"grad_norm": 1.248340368270874, |
|
"learning_rate": 9.998839363557107e-06, |
|
"loss": 9.7969, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.012532693984306887, |
|
"eval_accuracy": 0.08206019741810078, |
|
"eval_loss": 9.7734375, |
|
"eval_runtime": 278.547, |
|
"eval_samples_per_second": 121.226, |
|
"eval_steps_per_second": 3.367, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.0125871839581517, |
|
"grad_norm": 1.3996437788009644, |
|
"learning_rate": 9.998833914559722e-06, |
|
"loss": 9.7578, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.0125871839581517, |
|
"eval_accuracy": 0.08214620456482859, |
|
"eval_loss": 9.765625, |
|
"eval_runtime": 279.48, |
|
"eval_samples_per_second": 120.821, |
|
"eval_steps_per_second": 3.356, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.012641673931996512, |
|
"grad_norm": 1.2275718450546265, |
|
"learning_rate": 9.998828465562337e-06, |
|
"loss": 9.8125, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.012641673931996512, |
|
"eval_accuracy": 0.08221081848320505, |
|
"eval_loss": 9.765625, |
|
"eval_runtime": 280.1352, |
|
"eval_samples_per_second": 120.538, |
|
"eval_steps_per_second": 3.348, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.012696163905841325, |
|
"grad_norm": 1.294257640838623, |
|
"learning_rate": 9.998823016564952e-06, |
|
"loss": 9.7734, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.012696163905841325, |
|
"eval_accuracy": 0.0822595105130927, |
|
"eval_loss": 9.765625, |
|
"eval_runtime": 278.3308, |
|
"eval_samples_per_second": 121.32, |
|
"eval_steps_per_second": 3.37, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.012750653879686137, |
|
"grad_norm": 1.302538275718689, |
|
"learning_rate": 9.998817567567569e-06, |
|
"loss": 9.7656, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.012750653879686137, |
|
"eval_accuracy": 0.0823088394185199, |
|
"eval_loss": 9.7578125, |
|
"eval_runtime": 279.0657, |
|
"eval_samples_per_second": 121.0, |
|
"eval_steps_per_second": 3.361, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.01280514385353095, |
|
"grad_norm": 1.2675853967666626, |
|
"learning_rate": 9.998812118570184e-06, |
|
"loss": 9.7578, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.01280514385353095, |
|
"eval_accuracy": 0.0823553023840191, |
|
"eval_loss": 9.7578125, |
|
"eval_runtime": 279.2742, |
|
"eval_samples_per_second": 120.91, |
|
"eval_steps_per_second": 3.359, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.012859633827375762, |
|
"grad_norm": 1.2685391902923584, |
|
"learning_rate": 9.998806669572799e-06, |
|
"loss": 9.7891, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.012859633827375762, |
|
"eval_accuracy": 0.08240431285167651, |
|
"eval_loss": 9.7578125, |
|
"eval_runtime": 279.3255, |
|
"eval_samples_per_second": 120.888, |
|
"eval_steps_per_second": 3.358, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.012914123801220576, |
|
"grad_norm": 1.279767632484436, |
|
"learning_rate": 9.998801220575415e-06, |
|
"loss": 9.7812, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.012914123801220576, |
|
"eval_accuracy": 0.08243551975311462, |
|
"eval_loss": 9.75, |
|
"eval_runtime": 278.6385, |
|
"eval_samples_per_second": 121.186, |
|
"eval_steps_per_second": 3.366, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.012968613775065387, |
|
"grad_norm": 1.2205240726470947, |
|
"learning_rate": 9.99879577157803e-06, |
|
"loss": 9.7656, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.012968613775065387, |
|
"eval_accuracy": 0.08245219431269564, |
|
"eval_loss": 9.75, |
|
"eval_runtime": 278.8375, |
|
"eval_samples_per_second": 121.099, |
|
"eval_steps_per_second": 3.364, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.0130231037489102, |
|
"grad_norm": 1.1911712884902954, |
|
"learning_rate": 9.998790322580645e-06, |
|
"loss": 9.7969, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.0130231037489102, |
|
"eval_accuracy": 0.08246684245010535, |
|
"eval_loss": 9.75, |
|
"eval_runtime": 279.7565, |
|
"eval_samples_per_second": 120.701, |
|
"eval_steps_per_second": 3.353, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.013077593722755012, |
|
"grad_norm": 1.250552773475647, |
|
"learning_rate": 9.998784873583262e-06, |
|
"loss": 9.75, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.013077593722755012, |
|
"eval_accuracy": 0.0824862092562854, |
|
"eval_loss": 9.7421875, |
|
"eval_runtime": 279.3878, |
|
"eval_samples_per_second": 120.861, |
|
"eval_steps_per_second": 3.357, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.013132083696599826, |
|
"grad_norm": 1.1946778297424316, |
|
"learning_rate": 9.998779424585877e-06, |
|
"loss": 9.7734, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.013132083696599826, |
|
"eval_accuracy": 0.08250629978466949, |
|
"eval_loss": 9.7421875, |
|
"eval_runtime": 278.1923, |
|
"eval_samples_per_second": 121.38, |
|
"eval_steps_per_second": 3.372, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.013186573670444637, |
|
"grad_norm": 1.2602076530456543, |
|
"learning_rate": 9.998773975588492e-06, |
|
"loss": 9.7578, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.013186573670444637, |
|
"eval_accuracy": 0.08251411598447309, |
|
"eval_loss": 9.734375, |
|
"eval_runtime": 278.443, |
|
"eval_samples_per_second": 121.271, |
|
"eval_steps_per_second": 3.369, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.01324106364428945, |
|
"grad_norm": 1.2365894317626953, |
|
"learning_rate": 9.998768526591109e-06, |
|
"loss": 9.7656, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.01324106364428945, |
|
"eval_accuracy": 0.08253744878833126, |
|
"eval_loss": 9.734375, |
|
"eval_runtime": 278.0592, |
|
"eval_samples_per_second": 121.438, |
|
"eval_steps_per_second": 3.373, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.013295553618134264, |
|
"grad_norm": 1.2928210496902466, |
|
"learning_rate": 9.998763077593724e-06, |
|
"loss": 9.7266, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.013295553618134264, |
|
"eval_accuracy": 0.0825691478208681, |
|
"eval_loss": 9.734375, |
|
"eval_runtime": 277.8043, |
|
"eval_samples_per_second": 121.55, |
|
"eval_steps_per_second": 3.376, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.013350043591979076, |
|
"grad_norm": 1.2674510478973389, |
|
"learning_rate": 9.998757628596339e-06, |
|
"loss": 9.75, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.013350043591979076, |
|
"eval_accuracy": 0.08259832830013489, |
|
"eval_loss": 9.7265625, |
|
"eval_runtime": 278.1561, |
|
"eval_samples_per_second": 121.396, |
|
"eval_steps_per_second": 3.372, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.01340453356582389, |
|
"grad_norm": 1.2813974618911743, |
|
"learning_rate": 9.998752179598956e-06, |
|
"loss": 9.7422, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.01340453356582389, |
|
"eval_accuracy": 0.08266126318299799, |
|
"eval_loss": 9.7265625, |
|
"eval_runtime": 280.0345, |
|
"eval_samples_per_second": 120.582, |
|
"eval_steps_per_second": 3.35, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.0134590235396687, |
|
"grad_norm": 1.2468206882476807, |
|
"learning_rate": 9.99874673060157e-06, |
|
"loss": 9.75, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.0134590235396687, |
|
"eval_accuracy": 0.08272399542364396, |
|
"eval_loss": 9.7265625, |
|
"eval_runtime": 277.7706, |
|
"eval_samples_per_second": 121.564, |
|
"eval_steps_per_second": 3.377, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.013513513513513514, |
|
"grad_norm": 1.1757469177246094, |
|
"learning_rate": 9.998741281604186e-06, |
|
"loss": 9.7656, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.013513513513513514, |
|
"eval_accuracy": 0.08277827458894678, |
|
"eval_loss": 9.71875, |
|
"eval_runtime": 278.8949, |
|
"eval_samples_per_second": 121.074, |
|
"eval_steps_per_second": 3.363, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.013568003487358326, |
|
"grad_norm": 1.2581287622451782, |
|
"learning_rate": 9.998735832606801e-06, |
|
"loss": 9.7266, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.013568003487358326, |
|
"eval_accuracy": 0.08281637132576732, |
|
"eval_loss": 9.71875, |
|
"eval_runtime": 277.5935, |
|
"eval_samples_per_second": 121.642, |
|
"eval_steps_per_second": 3.379, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.01362249346120314, |
|
"grad_norm": 1.2253607511520386, |
|
"learning_rate": 9.998730383609416e-06, |
|
"loss": 9.75, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.01362249346120314, |
|
"eval_accuracy": 0.08284589919169204, |
|
"eval_loss": 9.7109375, |
|
"eval_runtime": 277.5385, |
|
"eval_samples_per_second": 121.666, |
|
"eval_steps_per_second": 3.38, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.013676983435047951, |
|
"grad_norm": 1.2590970993041992, |
|
"learning_rate": 9.998724934612033e-06, |
|
"loss": 9.7266, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.013676983435047951, |
|
"eval_accuracy": 0.08286613446451693, |
|
"eval_loss": 9.7109375, |
|
"eval_runtime": 278.6492, |
|
"eval_samples_per_second": 121.181, |
|
"eval_steps_per_second": 3.366, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.013731473408892764, |
|
"grad_norm": 1.2588908672332764, |
|
"learning_rate": 9.998719485614648e-06, |
|
"loss": 9.7266, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.013731473408892764, |
|
"eval_accuracy": 0.08288451700849948, |
|
"eval_loss": 9.7109375, |
|
"eval_runtime": 278.1724, |
|
"eval_samples_per_second": 121.389, |
|
"eval_steps_per_second": 3.372, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.013785963382737576, |
|
"grad_norm": 1.2583962678909302, |
|
"learning_rate": 9.998714036617263e-06, |
|
"loss": 9.7266, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.013785963382737576, |
|
"eval_accuracy": 0.0829049549235415, |
|
"eval_loss": 9.703125, |
|
"eval_runtime": 277.9162, |
|
"eval_samples_per_second": 121.501, |
|
"eval_steps_per_second": 3.375, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.01384045335658239, |
|
"grad_norm": 1.2444450855255127, |
|
"learning_rate": 9.998708587619878e-06, |
|
"loss": 9.7266, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.01384045335658239, |
|
"eval_accuracy": 0.08292394539417544, |
|
"eval_loss": 9.703125, |
|
"eval_runtime": 279.5035, |
|
"eval_samples_per_second": 120.811, |
|
"eval_steps_per_second": 3.356, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.013894943330427201, |
|
"grad_norm": 1.201521396636963, |
|
"learning_rate": 9.998703138622494e-06, |
|
"loss": 9.7344, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.013894943330427201, |
|
"eval_accuracy": 0.08293364327170955, |
|
"eval_loss": 9.703125, |
|
"eval_runtime": 278.5449, |
|
"eval_samples_per_second": 121.226, |
|
"eval_steps_per_second": 3.368, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.013949433304272014, |
|
"grad_norm": 1.2879900932312012, |
|
"learning_rate": 9.99869768962511e-06, |
|
"loss": 9.7109, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.013949433304272014, |
|
"eval_accuracy": 0.08292825877851151, |
|
"eval_loss": 9.6953125, |
|
"eval_runtime": 278.9219, |
|
"eval_samples_per_second": 121.063, |
|
"eval_steps_per_second": 3.363, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.014003923278116826, |
|
"grad_norm": 1.2371270656585693, |
|
"learning_rate": 9.998692240627725e-06, |
|
"loss": 9.7109, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.014003923278116826, |
|
"eval_accuracy": 0.08293071943400523, |
|
"eval_loss": 9.6953125, |
|
"eval_runtime": 280.4149, |
|
"eval_samples_per_second": 120.418, |
|
"eval_steps_per_second": 3.345, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.01405841325196164, |
|
"grad_norm": 1.2438043355941772, |
|
"learning_rate": 9.998686791630341e-06, |
|
"loss": 9.7109, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.01405841325196164, |
|
"eval_accuracy": 0.08297243478184596, |
|
"eval_loss": 9.6953125, |
|
"eval_runtime": 279.9479, |
|
"eval_samples_per_second": 120.619, |
|
"eval_steps_per_second": 3.351, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.014112903225806451, |
|
"grad_norm": 1.2312678098678589, |
|
"learning_rate": 9.998681342632956e-06, |
|
"loss": 9.7031, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.014112903225806451, |
|
"eval_accuracy": 0.08300974989868613, |
|
"eval_loss": 9.6875, |
|
"eval_runtime": 278.9806, |
|
"eval_samples_per_second": 121.037, |
|
"eval_steps_per_second": 3.362, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.014167393199651265, |
|
"grad_norm": 1.289109706878662, |
|
"learning_rate": 9.998675893635571e-06, |
|
"loss": 9.7109, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.014167393199651265, |
|
"eval_accuracy": 0.0830896198811237, |
|
"eval_loss": 9.6875, |
|
"eval_runtime": 280.8942, |
|
"eval_samples_per_second": 120.213, |
|
"eval_steps_per_second": 3.339, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.014221883173496076, |
|
"grad_norm": 1.2733935117721558, |
|
"learning_rate": 9.998670444638188e-06, |
|
"loss": 9.6953, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.014221883173496076, |
|
"eval_accuracy": 0.08316147102154055, |
|
"eval_loss": 9.6796875, |
|
"eval_runtime": 279.4498, |
|
"eval_samples_per_second": 120.834, |
|
"eval_steps_per_second": 3.357, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.01427637314734089, |
|
"grad_norm": 1.2618088722229004, |
|
"learning_rate": 9.998664995640803e-06, |
|
"loss": 9.7031, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.01427637314734089, |
|
"eval_accuracy": 0.08321233421804031, |
|
"eval_loss": 9.6796875, |
|
"eval_runtime": 278.1291, |
|
"eval_samples_per_second": 121.408, |
|
"eval_steps_per_second": 3.373, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.014330863121185701, |
|
"grad_norm": 1.2773399353027344, |
|
"learning_rate": 9.998659546643418e-06, |
|
"loss": 9.6953, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.014330863121185701, |
|
"eval_accuracy": 0.08324872297045931, |
|
"eval_loss": 9.6796875, |
|
"eval_runtime": 278.6034, |
|
"eval_samples_per_second": 121.201, |
|
"eval_steps_per_second": 3.367, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.014385353095030515, |
|
"grad_norm": 1.246199131011963, |
|
"learning_rate": 9.998654097646035e-06, |
|
"loss": 9.6875, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.014385353095030515, |
|
"eval_accuracy": 0.08326855295884994, |
|
"eval_loss": 9.671875, |
|
"eval_runtime": 278.4105, |
|
"eval_samples_per_second": 121.285, |
|
"eval_steps_per_second": 3.369, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.014439843068875326, |
|
"grad_norm": 1.2552728652954102, |
|
"learning_rate": 9.998648648648648e-06, |
|
"loss": 9.6719, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.014439843068875326, |
|
"eval_accuracy": 0.08325928931463826, |
|
"eval_loss": 9.671875, |
|
"eval_runtime": 279.4834, |
|
"eval_samples_per_second": 120.819, |
|
"eval_steps_per_second": 3.356, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.01449433304272014, |
|
"grad_norm": 1.2612943649291992, |
|
"learning_rate": 9.998643199651265e-06, |
|
"loss": 9.6797, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.01449433304272014, |
|
"eval_accuracy": 0.08324947564155151, |
|
"eval_loss": 9.671875, |
|
"eval_runtime": 277.8081, |
|
"eval_samples_per_second": 121.548, |
|
"eval_steps_per_second": 3.376, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.014548823016564951, |
|
"grad_norm": 1.1741077899932861, |
|
"learning_rate": 9.99863775065388e-06, |
|
"loss": 9.7188, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.014548823016564951, |
|
"eval_accuracy": 0.08325022831264371, |
|
"eval_loss": 9.6640625, |
|
"eval_runtime": 278.7089, |
|
"eval_samples_per_second": 121.155, |
|
"eval_steps_per_second": 3.366, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.014603312990409765, |
|
"grad_norm": 1.2685978412628174, |
|
"learning_rate": 9.998632301656495e-06, |
|
"loss": 9.6953, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.014603312990409765, |
|
"eval_accuracy": 0.08326976881215271, |
|
"eval_loss": 9.6640625, |
|
"eval_runtime": 278.6178, |
|
"eval_samples_per_second": 121.195, |
|
"eval_steps_per_second": 3.367, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.014657802964254577, |
|
"grad_norm": 1.329729437828064, |
|
"learning_rate": 9.998626852659112e-06, |
|
"loss": 9.6797, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.014657802964254577, |
|
"eval_accuracy": 0.08330664969567048, |
|
"eval_loss": 9.6640625, |
|
"eval_runtime": 276.7799, |
|
"eval_samples_per_second": 121.999, |
|
"eval_steps_per_second": 3.389, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.01471229293809939, |
|
"grad_norm": 1.2404329776763916, |
|
"learning_rate": 9.998621403661727e-06, |
|
"loss": 9.6719, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.01471229293809939, |
|
"eval_accuracy": 0.08336324477202621, |
|
"eval_loss": 9.65625, |
|
"eval_runtime": 277.6213, |
|
"eval_samples_per_second": 121.63, |
|
"eval_steps_per_second": 3.379, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.014766782911944202, |
|
"grad_norm": 1.2414337396621704, |
|
"learning_rate": 9.998615954664342e-06, |
|
"loss": 9.6875, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.014766782911944202, |
|
"eval_accuracy": 0.0834355011968773, |
|
"eval_loss": 9.65625, |
|
"eval_runtime": 278.0197, |
|
"eval_samples_per_second": 121.455, |
|
"eval_steps_per_second": 3.374, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.014821272885789015, |
|
"grad_norm": 1.241061806678772, |
|
"learning_rate": 9.998610505666959e-06, |
|
"loss": 9.6641, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.014821272885789015, |
|
"eval_accuracy": 0.083517542345927, |
|
"eval_loss": 9.6484375, |
|
"eval_runtime": 278.7783, |
|
"eval_samples_per_second": 121.125, |
|
"eval_steps_per_second": 3.365, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.014875762859633827, |
|
"grad_norm": 1.2449415922164917, |
|
"learning_rate": 9.998605056669574e-06, |
|
"loss": 9.6719, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.014875762859633827, |
|
"eval_accuracy": 0.08358869871302796, |
|
"eval_loss": 9.6484375, |
|
"eval_runtime": 278.069, |
|
"eval_samples_per_second": 121.434, |
|
"eval_steps_per_second": 3.373, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.01493025283347864, |
|
"grad_norm": 1.2785589694976807, |
|
"learning_rate": 9.998599607672189e-06, |
|
"loss": 9.6719, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.01493025283347864, |
|
"eval_accuracy": 0.08364344106054136, |
|
"eval_loss": 9.6484375, |
|
"eval_runtime": 278.0063, |
|
"eval_samples_per_second": 121.461, |
|
"eval_steps_per_second": 3.374, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.014984742807323452, |
|
"grad_norm": 1.2922983169555664, |
|
"learning_rate": 9.998594158674805e-06, |
|
"loss": 9.6406, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.014984742807323452, |
|
"eval_accuracy": 0.08369166990821841, |
|
"eval_loss": 9.640625, |
|
"eval_runtime": 278.9804, |
|
"eval_samples_per_second": 121.037, |
|
"eval_steps_per_second": 3.362, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.015039232781168265, |
|
"grad_norm": 1.2422248125076294, |
|
"learning_rate": 9.99858870967742e-06, |
|
"loss": 9.6641, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.015039232781168265, |
|
"eval_accuracy": 0.08373764074261889, |
|
"eval_loss": 9.640625, |
|
"eval_runtime": 279.0599, |
|
"eval_samples_per_second": 121.003, |
|
"eval_steps_per_second": 3.361, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.015093722755013077, |
|
"grad_norm": 1.3546416759490967, |
|
"learning_rate": 9.998583260680035e-06, |
|
"loss": 9.6328, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.015093722755013077, |
|
"eval_accuracy": 0.08375952610206898, |
|
"eval_loss": 9.640625, |
|
"eval_runtime": 279.3312, |
|
"eval_samples_per_second": 120.885, |
|
"eval_steps_per_second": 3.358, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.01514821272885789, |
|
"grad_norm": 1.3069231510162354, |
|
"learning_rate": 9.998577811682652e-06, |
|
"loss": 9.6328, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.01514821272885789, |
|
"eval_accuracy": 0.08376647383522773, |
|
"eval_loss": 9.6328125, |
|
"eval_runtime": 277.9079, |
|
"eval_samples_per_second": 121.504, |
|
"eval_steps_per_second": 3.375, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.015202702702702704, |
|
"grad_norm": 1.2460886240005493, |
|
"learning_rate": 9.998572362685267e-06, |
|
"loss": 9.6484, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.015202702702702704, |
|
"eval_accuracy": 0.08376496849304334, |
|
"eval_loss": 9.6328125, |
|
"eval_runtime": 277.6965, |
|
"eval_samples_per_second": 121.597, |
|
"eval_steps_per_second": 3.378, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.015257192676547515, |
|
"grad_norm": 1.2432016134262085, |
|
"learning_rate": 9.998566913687882e-06, |
|
"loss": 9.6484, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.015257192676547515, |
|
"eval_accuracy": 0.08377012139513608, |
|
"eval_loss": 9.6328125, |
|
"eval_runtime": 278.8575, |
|
"eval_samples_per_second": 121.091, |
|
"eval_steps_per_second": 3.364, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.015311682650392329, |
|
"grad_norm": 1.1862634420394897, |
|
"learning_rate": 9.998561464690499e-06, |
|
"loss": 9.6875, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.015311682650392329, |
|
"eval_accuracy": 0.08376899238849779, |
|
"eval_loss": 9.625, |
|
"eval_runtime": 277.3994, |
|
"eval_samples_per_second": 121.727, |
|
"eval_steps_per_second": 3.381, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.01536617262423714, |
|
"grad_norm": 1.2821177244186401, |
|
"learning_rate": 9.998556015693112e-06, |
|
"loss": 9.6328, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.01536617262423714, |
|
"eval_accuracy": 0.08377649015053161, |
|
"eval_loss": 9.625, |
|
"eval_runtime": 277.0927, |
|
"eval_samples_per_second": 121.862, |
|
"eval_steps_per_second": 3.385, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.015420662598081954, |
|
"grad_norm": 1.2484102249145508, |
|
"learning_rate": 9.998550566695729e-06, |
|
"loss": 9.6562, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.015420662598081954, |
|
"eval_accuracy": 0.08380494690759437, |
|
"eval_loss": 9.6171875, |
|
"eval_runtime": 277.7775, |
|
"eval_samples_per_second": 121.561, |
|
"eval_steps_per_second": 3.377, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.015475152571926765, |
|
"grad_norm": 1.188653588294983, |
|
"learning_rate": 9.998545117698344e-06, |
|
"loss": 9.6719, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.015475152571926765, |
|
"eval_accuracy": 0.08383511164905864, |
|
"eval_loss": 9.6171875, |
|
"eval_runtime": 278.6388, |
|
"eval_samples_per_second": 121.186, |
|
"eval_steps_per_second": 3.366, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.015529642545771579, |
|
"grad_norm": 1.2269041538238525, |
|
"learning_rate": 9.998539668700959e-06, |
|
"loss": 9.6641, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.015529642545771579, |
|
"eval_accuracy": 0.08384709648875752, |
|
"eval_loss": 9.6171875, |
|
"eval_runtime": 278.3677, |
|
"eval_samples_per_second": 121.304, |
|
"eval_steps_per_second": 3.37, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.01558413251961639, |
|
"grad_norm": 1.266385555267334, |
|
"learning_rate": 9.998534219703576e-06, |
|
"loss": 9.6328, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.01558413251961639, |
|
"eval_accuracy": 0.08384223307554638, |
|
"eval_loss": 9.609375, |
|
"eval_runtime": 278.3739, |
|
"eval_samples_per_second": 121.301, |
|
"eval_steps_per_second": 3.37, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.015638622493461204, |
|
"grad_norm": 1.28980553150177, |
|
"learning_rate": 9.998528770706191e-06, |
|
"loss": 9.6328, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.015638622493461204, |
|
"eval_accuracy": 0.08388331154784755, |
|
"eval_loss": 9.609375, |
|
"eval_runtime": 279.4156, |
|
"eval_samples_per_second": 120.849, |
|
"eval_steps_per_second": 3.357, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.015693112467306015, |
|
"grad_norm": 1.2614293098449707, |
|
"learning_rate": 9.998523321708806e-06, |
|
"loss": 9.625, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.015693112467306015, |
|
"eval_accuracy": 0.08391469214261461, |
|
"eval_loss": 9.609375, |
|
"eval_runtime": 278.4116, |
|
"eval_samples_per_second": 121.284, |
|
"eval_steps_per_second": 3.369, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.015747602441150827, |
|
"grad_norm": 1.226367473602295, |
|
"learning_rate": 9.998517872711423e-06, |
|
"loss": 9.6328, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.015747602441150827, |
|
"eval_accuracy": 0.08397444264777995, |
|
"eval_loss": 9.6015625, |
|
"eval_runtime": 277.9542, |
|
"eval_samples_per_second": 121.484, |
|
"eval_steps_per_second": 3.375, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.015802092414995642, |
|
"grad_norm": 1.2424851655960083, |
|
"learning_rate": 9.998512423714038e-06, |
|
"loss": 9.6172, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.015802092414995642, |
|
"eval_accuracy": 0.08402672433979962, |
|
"eval_loss": 9.6015625, |
|
"eval_runtime": 277.2733, |
|
"eval_samples_per_second": 121.782, |
|
"eval_steps_per_second": 3.383, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.015856582388840454, |
|
"grad_norm": 1.2392754554748535, |
|
"learning_rate": 9.998506974716653e-06, |
|
"loss": 9.6172, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.015856582388840454, |
|
"eval_accuracy": 0.08409255411147887, |
|
"eval_loss": 9.6015625, |
|
"eval_runtime": 276.0303, |
|
"eval_samples_per_second": 122.331, |
|
"eval_steps_per_second": 3.398, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.015911072362685266, |
|
"grad_norm": 1.2615457773208618, |
|
"learning_rate": 9.998501525719268e-06, |
|
"loss": 9.6094, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.015911072362685266, |
|
"eval_accuracy": 0.08412827703947016, |
|
"eval_loss": 9.59375, |
|
"eval_runtime": 278.5317, |
|
"eval_samples_per_second": 121.232, |
|
"eval_steps_per_second": 3.368, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.015965562336530077, |
|
"grad_norm": 1.2395169734954834, |
|
"learning_rate": 9.998496076721884e-06, |
|
"loss": 9.6172, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.015965562336530077, |
|
"eval_accuracy": 0.0841603524075531, |
|
"eval_loss": 9.59375, |
|
"eval_runtime": 278.5884, |
|
"eval_samples_per_second": 121.207, |
|
"eval_steps_per_second": 3.367, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.016020052310374892, |
|
"grad_norm": 1.2416415214538574, |
|
"learning_rate": 9.9984906277245e-06, |
|
"loss": 9.6094, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.016020052310374892, |
|
"eval_accuracy": 0.08417905338930542, |
|
"eval_loss": 9.59375, |
|
"eval_runtime": 279.278, |
|
"eval_samples_per_second": 120.908, |
|
"eval_steps_per_second": 3.359, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.016074542284219704, |
|
"grad_norm": 1.2306995391845703, |
|
"learning_rate": 9.998485178727115e-06, |
|
"loss": 9.6328, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.016074542284219704, |
|
"eval_accuracy": 0.08419016976235945, |
|
"eval_loss": 9.5859375, |
|
"eval_runtime": 279.0203, |
|
"eval_samples_per_second": 121.02, |
|
"eval_steps_per_second": 3.362, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.016129032258064516, |
|
"grad_norm": 1.2327263355255127, |
|
"learning_rate": 9.998479729729731e-06, |
|
"loss": 9.5938, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.016129032258064516, |
|
"eval_accuracy": 0.08418545109358912, |
|
"eval_loss": 9.5859375, |
|
"eval_runtime": 279.0896, |
|
"eval_samples_per_second": 120.99, |
|
"eval_steps_per_second": 3.361, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.016183522231909327, |
|
"grad_norm": 1.322943091392517, |
|
"learning_rate": 9.998474280732346e-06, |
|
"loss": 9.5938, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.016183522231909327, |
|
"eval_accuracy": 0.08417552162494972, |
|
"eval_loss": 9.578125, |
|
"eval_runtime": 278.5612, |
|
"eval_samples_per_second": 121.219, |
|
"eval_steps_per_second": 3.367, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.016238012205754142, |
|
"grad_norm": 1.264687180519104, |
|
"learning_rate": 9.998468831734961e-06, |
|
"loss": 9.6016, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.016238012205754142, |
|
"eval_accuracy": 0.08418681169133271, |
|
"eval_loss": 9.578125, |
|
"eval_runtime": 276.7181, |
|
"eval_samples_per_second": 122.027, |
|
"eval_steps_per_second": 3.39, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.016292502179598954, |
|
"grad_norm": 1.2931830883026123, |
|
"learning_rate": 9.998463382737576e-06, |
|
"loss": 9.5781, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.016292502179598954, |
|
"eval_accuracy": 0.08422736908364697, |
|
"eval_loss": 9.578125, |
|
"eval_runtime": 278.6618, |
|
"eval_samples_per_second": 121.176, |
|
"eval_steps_per_second": 3.366, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.016346992153443766, |
|
"grad_norm": 1.247689962387085, |
|
"learning_rate": 9.998457933740191e-06, |
|
"loss": 9.5938, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.016346992153443766, |
|
"eval_accuracy": 0.08427264514473155, |
|
"eval_loss": 9.5703125, |
|
"eval_runtime": 278.3198, |
|
"eval_samples_per_second": 121.324, |
|
"eval_steps_per_second": 3.37, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.016401482127288577, |
|
"grad_norm": 1.269108772277832, |
|
"learning_rate": 9.998452484742808e-06, |
|
"loss": 9.5938, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.016401482127288577, |
|
"eval_accuracy": 0.08434733327618822, |
|
"eval_loss": 9.5703125, |
|
"eval_runtime": 277.9862, |
|
"eval_samples_per_second": 121.47, |
|
"eval_steps_per_second": 3.374, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.016455972101133393, |
|
"grad_norm": 1.2147337198257446, |
|
"learning_rate": 9.998447035745423e-06, |
|
"loss": 9.6016, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.016455972101133393, |
|
"eval_accuracy": 0.08440334937478074, |
|
"eval_loss": 9.5703125, |
|
"eval_runtime": 278.4928, |
|
"eval_samples_per_second": 121.249, |
|
"eval_steps_per_second": 3.368, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.016510462074978204, |
|
"grad_norm": 1.2472692728042603, |
|
"learning_rate": 9.998441586748038e-06, |
|
"loss": 9.5781, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.016510462074978204, |
|
"eval_accuracy": 0.08445693376676766, |
|
"eval_loss": 9.5625, |
|
"eval_runtime": 277.2562, |
|
"eval_samples_per_second": 121.79, |
|
"eval_steps_per_second": 3.383, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.016564952048823016, |
|
"grad_norm": 1.2579458951950073, |
|
"learning_rate": 9.998436137750655e-06, |
|
"loss": 9.6016, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.016564952048823016, |
|
"eval_accuracy": 0.08448973285705465, |
|
"eval_loss": 9.5625, |
|
"eval_runtime": 277.2268, |
|
"eval_samples_per_second": 121.803, |
|
"eval_steps_per_second": 3.384, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.016619442022667828, |
|
"grad_norm": 1.2708834409713745, |
|
"learning_rate": 9.99843068875327e-06, |
|
"loss": 9.5703, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.016619442022667828, |
|
"eval_accuracy": 0.084512920916472, |
|
"eval_loss": 9.5625, |
|
"eval_runtime": 277.9135, |
|
"eval_samples_per_second": 121.502, |
|
"eval_steps_per_second": 3.375, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.016673931996512643, |
|
"grad_norm": 1.2249526977539062, |
|
"learning_rate": 9.998425239755885e-06, |
|
"loss": 9.5781, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.016673931996512643, |
|
"eval_accuracy": 0.08454019076912014, |
|
"eval_loss": 9.5546875, |
|
"eval_runtime": 277.7743, |
|
"eval_samples_per_second": 121.563, |
|
"eval_steps_per_second": 3.377, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.016728421970357454, |
|
"grad_norm": 1.214102029800415, |
|
"learning_rate": 9.998419790758502e-06, |
|
"loss": 9.5938, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.016728421970357454, |
|
"eval_accuracy": 0.08456963178838038, |
|
"eval_loss": 9.5546875, |
|
"eval_runtime": 277.7599, |
|
"eval_samples_per_second": 121.569, |
|
"eval_steps_per_second": 3.377, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.016782911944202266, |
|
"grad_norm": 1.2691110372543335, |
|
"learning_rate": 9.998414341761117e-06, |
|
"loss": 9.5391, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.016782911944202266, |
|
"eval_accuracy": 0.08458882490123146, |
|
"eval_loss": 9.5546875, |
|
"eval_runtime": 278.1571, |
|
"eval_samples_per_second": 121.395, |
|
"eval_steps_per_second": 3.372, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.016837401918047078, |
|
"grad_norm": 1.2669826745986938, |
|
"learning_rate": 9.998408892763732e-06, |
|
"loss": 9.5625, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.016837401918047078, |
|
"eval_accuracy": 0.08460961020293142, |
|
"eval_loss": 9.546875, |
|
"eval_runtime": 277.952, |
|
"eval_samples_per_second": 121.485, |
|
"eval_steps_per_second": 3.375, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.016891891891891893, |
|
"grad_norm": 1.2808586359024048, |
|
"learning_rate": 9.998403443766349e-06, |
|
"loss": 9.5547, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.016891891891891893, |
|
"eval_accuracy": 0.08461615265165591, |
|
"eval_loss": 9.546875, |
|
"eval_runtime": 277.1252, |
|
"eval_samples_per_second": 121.847, |
|
"eval_steps_per_second": 3.385, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.016946381865736704, |
|
"grad_norm": 1.252959132194519, |
|
"learning_rate": 9.998397994768964e-06, |
|
"loss": 9.5703, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.016946381865736704, |
|
"eval_accuracy": 0.08458865120790249, |
|
"eval_loss": 9.546875, |
|
"eval_runtime": 278.3797, |
|
"eval_samples_per_second": 121.298, |
|
"eval_steps_per_second": 3.369, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.017000871839581516, |
|
"grad_norm": 1.2392700910568237, |
|
"learning_rate": 9.998392545771579e-06, |
|
"loss": 9.5625, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.017000871839581516, |
|
"eval_accuracy": 0.08457846119926964, |
|
"eval_loss": 9.5390625, |
|
"eval_runtime": 278.5979, |
|
"eval_samples_per_second": 121.203, |
|
"eval_steps_per_second": 3.367, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.01705536181342633, |
|
"grad_norm": 1.363663673400879, |
|
"learning_rate": 9.998387096774195e-06, |
|
"loss": 9.5469, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.01705536181342633, |
|
"eval_accuracy": 0.0846050362786019, |
|
"eval_loss": 9.5390625, |
|
"eval_runtime": 279.48, |
|
"eval_samples_per_second": 120.821, |
|
"eval_steps_per_second": 3.356, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.017109851787271143, |
|
"grad_norm": 1.258362889289856, |
|
"learning_rate": 9.99838164777681e-06, |
|
"loss": 9.5469, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.017109851787271143, |
|
"eval_accuracy": 0.08462159504263028, |
|
"eval_loss": 9.5390625, |
|
"eval_runtime": 280.9528, |
|
"eval_samples_per_second": 120.187, |
|
"eval_steps_per_second": 3.339, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.017164341761115955, |
|
"grad_norm": 1.2624891996383667, |
|
"learning_rate": 9.998376198779425e-06, |
|
"loss": 9.5391, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.017164341761115955, |
|
"eval_accuracy": 0.08465442308180542, |
|
"eval_loss": 9.53125, |
|
"eval_runtime": 280.6538, |
|
"eval_samples_per_second": 120.316, |
|
"eval_steps_per_second": 3.342, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.017218831734960766, |
|
"grad_norm": 1.2132024765014648, |
|
"learning_rate": 9.998370749782042e-06, |
|
"loss": 9.5781, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.017218831734960766, |
|
"eval_accuracy": 0.08470062550731117, |
|
"eval_loss": 9.53125, |
|
"eval_runtime": 279.7295, |
|
"eval_samples_per_second": 120.713, |
|
"eval_steps_per_second": 3.353, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.01727332170880558, |
|
"grad_norm": 1.2480347156524658, |
|
"learning_rate": 9.998365300784656e-06, |
|
"loss": 9.5469, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.01727332170880558, |
|
"eval_accuracy": 0.08472815589995276, |
|
"eval_loss": 9.53125, |
|
"eval_runtime": 278.8002, |
|
"eval_samples_per_second": 121.115, |
|
"eval_steps_per_second": 3.364, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.017327811682650393, |
|
"grad_norm": 1.304702639579773, |
|
"learning_rate": 9.998359851787272e-06, |
|
"loss": 9.5312, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.017327811682650393, |
|
"eval_accuracy": 0.0847501570549555, |
|
"eval_loss": 9.5234375, |
|
"eval_runtime": 279.8383, |
|
"eval_samples_per_second": 120.666, |
|
"eval_steps_per_second": 3.352, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.017382301656495205, |
|
"grad_norm": 1.221549153327942, |
|
"learning_rate": 9.998354402789887e-06, |
|
"loss": 9.5703, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.017382301656495205, |
|
"eval_accuracy": 0.08476648422787858, |
|
"eval_loss": 9.5234375, |
|
"eval_runtime": 278.8177, |
|
"eval_samples_per_second": 121.108, |
|
"eval_steps_per_second": 3.364, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.017436791630340016, |
|
"grad_norm": 1.262052297592163, |
|
"learning_rate": 9.998348953792502e-06, |
|
"loss": 9.5312, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.017436791630340016, |
|
"eval_accuracy": 0.08480269928696862, |
|
"eval_loss": 9.5234375, |
|
"eval_runtime": 278.2907, |
|
"eval_samples_per_second": 121.337, |
|
"eval_steps_per_second": 3.371, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.01749128160418483, |
|
"grad_norm": 1.20883047580719, |
|
"learning_rate": 9.998343504795119e-06, |
|
"loss": 9.5703, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.01749128160418483, |
|
"eval_accuracy": 0.08483929068160476, |
|
"eval_loss": 9.515625, |
|
"eval_runtime": 278.7692, |
|
"eval_samples_per_second": 121.129, |
|
"eval_steps_per_second": 3.365, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.017545771578029643, |
|
"grad_norm": 1.2607730627059937, |
|
"learning_rate": 9.998338055797734e-06, |
|
"loss": 9.5312, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.017545771578029643, |
|
"eval_accuracy": 0.08487046863415469, |
|
"eval_loss": 9.515625, |
|
"eval_runtime": 278.4626, |
|
"eval_samples_per_second": 121.262, |
|
"eval_steps_per_second": 3.368, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.017600261551874455, |
|
"grad_norm": 1.2543152570724487, |
|
"learning_rate": 9.998332606800349e-06, |
|
"loss": 9.5391, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.017600261551874455, |
|
"eval_accuracy": 0.08489484359798667, |
|
"eval_loss": 9.5078125, |
|
"eval_runtime": 278.034, |
|
"eval_samples_per_second": 121.449, |
|
"eval_steps_per_second": 3.374, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.017654751525719267, |
|
"grad_norm": 1.261744737625122, |
|
"learning_rate": 9.998327157802966e-06, |
|
"loss": 9.5156, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.017654751525719267, |
|
"eval_accuracy": 0.08491759742408161, |
|
"eval_loss": 9.5078125, |
|
"eval_runtime": 278.3626, |
|
"eval_samples_per_second": 121.306, |
|
"eval_steps_per_second": 3.37, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.01770924149956408, |
|
"grad_norm": 1.261576771736145, |
|
"learning_rate": 9.998321708805581e-06, |
|
"loss": 9.5234, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.01770924149956408, |
|
"eval_accuracy": 0.08492923487712253, |
|
"eval_loss": 9.5078125, |
|
"eval_runtime": 278.7706, |
|
"eval_samples_per_second": 121.128, |
|
"eval_steps_per_second": 3.365, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.017763731473408893, |
|
"grad_norm": 1.2752763032913208, |
|
"learning_rate": 9.998316259808196e-06, |
|
"loss": 9.5391, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.017763731473408893, |
|
"eval_accuracy": 0.08490972332650168, |
|
"eval_loss": 9.5, |
|
"eval_runtime": 277.4247, |
|
"eval_samples_per_second": 121.716, |
|
"eval_steps_per_second": 3.381, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.017818221447253705, |
|
"grad_norm": 1.2762752771377563, |
|
"learning_rate": 9.998310810810811e-06, |
|
"loss": 9.5078, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.017818221447253705, |
|
"eval_accuracy": 0.08489041441809797, |
|
"eval_loss": 9.5, |
|
"eval_runtime": 278.791, |
|
"eval_samples_per_second": 121.119, |
|
"eval_steps_per_second": 3.365, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.017872711421098517, |
|
"grad_norm": 1.2357895374298096, |
|
"learning_rate": 9.998305361813428e-06, |
|
"loss": 9.5312, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.017872711421098517, |
|
"eval_accuracy": 0.08484099866600628, |
|
"eval_loss": 9.5, |
|
"eval_runtime": 278.1228, |
|
"eval_samples_per_second": 121.41, |
|
"eval_steps_per_second": 3.373, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.017927201394943332, |
|
"grad_norm": 1.2696473598480225, |
|
"learning_rate": 9.998299912816043e-06, |
|
"loss": 9.5078, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.017927201394943332, |
|
"eval_accuracy": 0.08476593419900351, |
|
"eval_loss": 9.4921875, |
|
"eval_runtime": 279.1969, |
|
"eval_samples_per_second": 120.943, |
|
"eval_steps_per_second": 3.36, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.017981691368788143, |
|
"grad_norm": 1.2310556173324585, |
|
"learning_rate": 9.998294463818658e-06, |
|
"loss": 9.5234, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.017981691368788143, |
|
"eval_accuracy": 0.08473466939978909, |
|
"eval_loss": 9.4921875, |
|
"eval_runtime": 278.5891, |
|
"eval_samples_per_second": 121.207, |
|
"eval_steps_per_second": 3.367, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.018036181342632955, |
|
"grad_norm": 1.2741543054580688, |
|
"learning_rate": 9.998289014821274e-06, |
|
"loss": 9.5078, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.018036181342632955, |
|
"eval_accuracy": 0.08475229927267945, |
|
"eval_loss": 9.4921875, |
|
"eval_runtime": 278.1693, |
|
"eval_samples_per_second": 121.39, |
|
"eval_steps_per_second": 3.372, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.018090671316477767, |
|
"grad_norm": 1.2879000902175903, |
|
"learning_rate": 9.99828356582389e-06, |
|
"loss": 9.4922, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.018090671316477767, |
|
"eval_accuracy": 0.08481100761787097, |
|
"eval_loss": 9.484375, |
|
"eval_runtime": 279.2612, |
|
"eval_samples_per_second": 120.915, |
|
"eval_steps_per_second": 3.359, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.018145161290322582, |
|
"grad_norm": 1.2958968877792358, |
|
"learning_rate": 9.998278116826505e-06, |
|
"loss": 9.5, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.018145161290322582, |
|
"eval_accuracy": 0.08491096812869263, |
|
"eval_loss": 9.484375, |
|
"eval_runtime": 278.8921, |
|
"eval_samples_per_second": 121.076, |
|
"eval_steps_per_second": 3.363, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.018199651264167394, |
|
"grad_norm": 1.2337111234664917, |
|
"learning_rate": 9.99827266782912e-06, |
|
"loss": 9.5078, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.018199651264167394, |
|
"eval_accuracy": 0.08498186395580014, |
|
"eval_loss": 9.484375, |
|
"eval_runtime": 279.3346, |
|
"eval_samples_per_second": 120.884, |
|
"eval_steps_per_second": 3.358, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.018254141238012205, |
|
"grad_norm": 1.3057183027267456, |
|
"learning_rate": 9.998267218831735e-06, |
|
"loss": 9.4766, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.018254141238012205, |
|
"eval_accuracy": 0.0850502412296376, |
|
"eval_loss": 9.4765625, |
|
"eval_runtime": 278.5471, |
|
"eval_samples_per_second": 121.225, |
|
"eval_steps_per_second": 3.367, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.018308631211857017, |
|
"grad_norm": 1.2413270473480225, |
|
"learning_rate": 9.998261769834351e-06, |
|
"loss": 9.5, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.018308631211857017, |
|
"eval_accuracy": 0.08509852797509099, |
|
"eval_loss": 9.4765625, |
|
"eval_runtime": 277.2384, |
|
"eval_samples_per_second": 121.798, |
|
"eval_steps_per_second": 3.383, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.018308631211857017, |
|
"step": 336, |
|
"total_flos": 352197518819328.0, |
|
"train_loss": 10.035667782738095, |
|
"train_runtime": 94125.7847, |
|
"train_samples_per_second": 701.873, |
|
"train_steps_per_second": 19.497 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1835200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 1000000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 352197518819328.0, |
|
"train_batch_size": 36, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|