|
{ |
|
"best_metric": 3.5649592876434326, |
|
"best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained/fr_clm/wikipedia_30/checkpoint-36000", |
|
"epoch": 68.70638754696726, |
|
"eval_steps": 2000, |
|
"global_step": 64000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 2.1470746108427265, |
|
"eval_loss": 7.138043403625488, |
|
"eval_runtime": 0.8415, |
|
"eval_samples_per_second": 1273.93, |
|
"eval_steps_per_second": 79.621, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.294149221685453, |
|
"grad_norm": 1.455039381980896, |
|
"learning_rate": 1e-05, |
|
"loss": 7.213, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.294149221685453, |
|
"eval_loss": 5.8541717529296875, |
|
"eval_runtime": 0.7753, |
|
"eval_samples_per_second": 1382.676, |
|
"eval_steps_per_second": 86.417, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 6.4412238325281805, |
|
"eval_loss": 5.403579235076904, |
|
"eval_runtime": 0.7542, |
|
"eval_samples_per_second": 1421.437, |
|
"eval_steps_per_second": 88.84, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 8.588298443370906, |
|
"grad_norm": 3.117489814758301, |
|
"learning_rate": 1.9997500000000003e-05, |
|
"loss": 5.4304, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 8.588298443370906, |
|
"eval_loss": 5.049880504608154, |
|
"eval_runtime": 0.7597, |
|
"eval_samples_per_second": 1411.127, |
|
"eval_steps_per_second": 88.195, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 10.735373054213634, |
|
"eval_loss": 4.760603427886963, |
|
"eval_runtime": 0.7624, |
|
"eval_samples_per_second": 1406.053, |
|
"eval_steps_per_second": 87.878, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 12.882447665056361, |
|
"grad_norm": 2.9511141777038574, |
|
"learning_rate": 2.9995e-05, |
|
"loss": 4.771, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 12.882447665056361, |
|
"eval_loss": 4.517208576202393, |
|
"eval_runtime": 0.764, |
|
"eval_samples_per_second": 1403.097, |
|
"eval_steps_per_second": 87.694, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 15.029522275899087, |
|
"eval_loss": 4.3206024169921875, |
|
"eval_runtime": 0.7595, |
|
"eval_samples_per_second": 1411.435, |
|
"eval_steps_per_second": 88.215, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 17.176596886741816, |
|
"grad_norm": 2.955515146255493, |
|
"learning_rate": 3.999e-05, |
|
"loss": 4.2888, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 17.176596886741816, |
|
"eval_loss": 4.152973175048828, |
|
"eval_runtime": 0.8247, |
|
"eval_samples_per_second": 1299.91, |
|
"eval_steps_per_second": 81.244, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 19.32367149758454, |
|
"eval_loss": 4.015476703643799, |
|
"eval_runtime": 0.7501, |
|
"eval_samples_per_second": 1429.237, |
|
"eval_steps_per_second": 89.327, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 21.470746108427267, |
|
"grad_norm": 3.0531320571899414, |
|
"learning_rate": 4.99875e-05, |
|
"loss": 3.9141, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 21.470746108427267, |
|
"eval_loss": 3.896575927734375, |
|
"eval_runtime": 0.768, |
|
"eval_samples_per_second": 1395.843, |
|
"eval_steps_per_second": 87.24, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 23.617820719269993, |
|
"eval_loss": 3.8046905994415283, |
|
"eval_runtime": 0.7601, |
|
"eval_samples_per_second": 1410.272, |
|
"eval_steps_per_second": 88.142, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 25.764895330112722, |
|
"grad_norm": 3.089989185333252, |
|
"learning_rate": 5.998250000000001e-05, |
|
"loss": 3.6154, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 25.764895330112722, |
|
"eval_loss": 3.7359092235565186, |
|
"eval_runtime": 0.7778, |
|
"eval_samples_per_second": 1378.238, |
|
"eval_steps_per_second": 86.14, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 27.911969940955448, |
|
"eval_loss": 3.6784231662750244, |
|
"eval_runtime": 0.7541, |
|
"eval_samples_per_second": 1421.491, |
|
"eval_steps_per_second": 88.843, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 30.059044551798173, |
|
"grad_norm": 3.3495922088623047, |
|
"learning_rate": 6.997500000000001e-05, |
|
"loss": 3.3661, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 30.059044551798173, |
|
"eval_loss": 3.6360318660736084, |
|
"eval_runtime": 0.7603, |
|
"eval_samples_per_second": 1409.913, |
|
"eval_steps_per_second": 88.12, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 32.2061191626409, |
|
"eval_loss": 3.6019041538238525, |
|
"eval_runtime": 0.7638, |
|
"eval_samples_per_second": 1403.547, |
|
"eval_steps_per_second": 87.722, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 34.35319377348363, |
|
"grad_norm": 3.1256051063537598, |
|
"learning_rate": 7.997250000000001e-05, |
|
"loss": 3.1473, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 34.35319377348363, |
|
"eval_loss": 3.5816054344177246, |
|
"eval_runtime": 0.7628, |
|
"eval_samples_per_second": 1405.391, |
|
"eval_steps_per_second": 87.837, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 36.500268384326354, |
|
"eval_loss": 3.5698702335357666, |
|
"eval_runtime": 0.7617, |
|
"eval_samples_per_second": 1407.381, |
|
"eval_steps_per_second": 87.961, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 38.64734299516908, |
|
"grad_norm": 2.921318769454956, |
|
"learning_rate": 8.997000000000001e-05, |
|
"loss": 2.9533, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 38.64734299516908, |
|
"eval_loss": 3.5649592876434326, |
|
"eval_runtime": 0.7607, |
|
"eval_samples_per_second": 1409.149, |
|
"eval_steps_per_second": 88.072, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 40.794417606011805, |
|
"eval_loss": 3.5666539669036865, |
|
"eval_runtime": 0.7558, |
|
"eval_samples_per_second": 1418.297, |
|
"eval_steps_per_second": 88.644, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 42.941492216854535, |
|
"grad_norm": 3.202237129211426, |
|
"learning_rate": 9.996500000000001e-05, |
|
"loss": 2.777, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 42.941492216854535, |
|
"eval_loss": 3.5746841430664062, |
|
"eval_runtime": 0.7722, |
|
"eval_samples_per_second": 1388.315, |
|
"eval_steps_per_second": 86.77, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 45.088566827697264, |
|
"eval_loss": 3.5878281593322754, |
|
"eval_runtime": 0.7639, |
|
"eval_samples_per_second": 1403.298, |
|
"eval_steps_per_second": 87.706, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 47.235641438539986, |
|
"grad_norm": 2.9902379512786865, |
|
"learning_rate": 9.336e-05, |
|
"loss": 2.6015, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 47.235641438539986, |
|
"eval_loss": 3.6107263565063477, |
|
"eval_runtime": 0.7601, |
|
"eval_samples_per_second": 1410.262, |
|
"eval_steps_per_second": 88.141, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 49.382716049382715, |
|
"eval_loss": 3.6260793209075928, |
|
"eval_runtime": 0.7605, |
|
"eval_samples_per_second": 1409.681, |
|
"eval_steps_per_second": 88.105, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 51.529790660225444, |
|
"grad_norm": 3.1626901626586914, |
|
"learning_rate": 8.669833333333334e-05, |
|
"loss": 2.4429, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 51.529790660225444, |
|
"eval_loss": 3.6414248943328857, |
|
"eval_runtime": 0.7596, |
|
"eval_samples_per_second": 1411.254, |
|
"eval_steps_per_second": 88.203, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 53.676865271068166, |
|
"eval_loss": 3.6636786460876465, |
|
"eval_runtime": 0.7637, |
|
"eval_samples_per_second": 1403.705, |
|
"eval_steps_per_second": 87.732, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 55.823939881910896, |
|
"grad_norm": 3.162830114364624, |
|
"learning_rate": 8.003333333333333e-05, |
|
"loss": 2.3125, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 55.823939881910896, |
|
"eval_loss": 3.6777544021606445, |
|
"eval_runtime": 0.7597, |
|
"eval_samples_per_second": 1411.137, |
|
"eval_steps_per_second": 88.196, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 57.971014492753625, |
|
"eval_loss": 3.703284502029419, |
|
"eval_runtime": 0.757, |
|
"eval_samples_per_second": 1416.089, |
|
"eval_steps_per_second": 88.506, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 60.11808910359635, |
|
"grad_norm": 3.4666566848754883, |
|
"learning_rate": 7.337000000000001e-05, |
|
"loss": 2.1989, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 60.11808910359635, |
|
"eval_loss": 3.7409818172454834, |
|
"eval_runtime": 0.7476, |
|
"eval_samples_per_second": 1433.993, |
|
"eval_steps_per_second": 89.625, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 62.265163714439076, |
|
"eval_loss": 3.7754786014556885, |
|
"eval_runtime": 0.7506, |
|
"eval_samples_per_second": 1428.237, |
|
"eval_steps_per_second": 89.265, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 64.4122383252818, |
|
"grad_norm": 3.3885388374328613, |
|
"learning_rate": 6.670666666666668e-05, |
|
"loss": 2.1044, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 64.4122383252818, |
|
"eval_loss": 3.7875912189483643, |
|
"eval_runtime": 0.7543, |
|
"eval_samples_per_second": 1421.139, |
|
"eval_steps_per_second": 88.821, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 66.55931293612453, |
|
"eval_loss": 3.8081138134002686, |
|
"eval_runtime": 0.7633, |
|
"eval_samples_per_second": 1404.466, |
|
"eval_steps_per_second": 87.779, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 68.70638754696726, |
|
"grad_norm": 3.763136148452759, |
|
"learning_rate": 6.004333333333334e-05, |
|
"loss": 2.0257, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 68.70638754696726, |
|
"eval_loss": 3.822213649749756, |
|
"eval_runtime": 0.7598, |
|
"eval_samples_per_second": 1410.912, |
|
"eval_steps_per_second": 88.182, |
|
"step": 64000 |
|
} |
|
], |
|
"logging_steps": 4000, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 108, |
|
"save_steps": 4000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6543844929634304e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|