{ "best_metric": 3.5649592876434326, "best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained/fr_clm/wikipedia_30/checkpoint-36000", "epoch": 68.70638754696726, "eval_steps": 2000, "global_step": 64000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.1470746108427265, "eval_loss": 7.138043403625488, "eval_runtime": 0.8415, "eval_samples_per_second": 1273.93, "eval_steps_per_second": 79.621, "step": 2000 }, { "epoch": 4.294149221685453, "grad_norm": 1.455039381980896, "learning_rate": 1e-05, "loss": 7.213, "step": 4000 }, { "epoch": 4.294149221685453, "eval_loss": 5.8541717529296875, "eval_runtime": 0.7753, "eval_samples_per_second": 1382.676, "eval_steps_per_second": 86.417, "step": 4000 }, { "epoch": 6.4412238325281805, "eval_loss": 5.403579235076904, "eval_runtime": 0.7542, "eval_samples_per_second": 1421.437, "eval_steps_per_second": 88.84, "step": 6000 }, { "epoch": 8.588298443370906, "grad_norm": 3.117489814758301, "learning_rate": 1.9997500000000003e-05, "loss": 5.4304, "step": 8000 }, { "epoch": 8.588298443370906, "eval_loss": 5.049880504608154, "eval_runtime": 0.7597, "eval_samples_per_second": 1411.127, "eval_steps_per_second": 88.195, "step": 8000 }, { "epoch": 10.735373054213634, "eval_loss": 4.760603427886963, "eval_runtime": 0.7624, "eval_samples_per_second": 1406.053, "eval_steps_per_second": 87.878, "step": 10000 }, { "epoch": 12.882447665056361, "grad_norm": 2.9511141777038574, "learning_rate": 2.9995e-05, "loss": 4.771, "step": 12000 }, { "epoch": 12.882447665056361, "eval_loss": 4.517208576202393, "eval_runtime": 0.764, "eval_samples_per_second": 1403.097, "eval_steps_per_second": 87.694, "step": 12000 }, { "epoch": 15.029522275899087, "eval_loss": 4.3206024169921875, "eval_runtime": 0.7595, "eval_samples_per_second": 1411.435, "eval_steps_per_second": 88.215, "step": 14000 }, { "epoch": 17.176596886741816, "grad_norm": 2.955515146255493, "learning_rate": 3.999e-05, "loss": 4.2888, "step": 16000 }, { "epoch": 17.176596886741816, "eval_loss": 4.152973175048828, "eval_runtime": 0.8247, "eval_samples_per_second": 1299.91, "eval_steps_per_second": 81.244, "step": 16000 }, { "epoch": 19.32367149758454, "eval_loss": 4.015476703643799, "eval_runtime": 0.7501, "eval_samples_per_second": 1429.237, "eval_steps_per_second": 89.327, "step": 18000 }, { "epoch": 21.470746108427267, "grad_norm": 3.0531320571899414, "learning_rate": 4.99875e-05, "loss": 3.9141, "step": 20000 }, { "epoch": 21.470746108427267, "eval_loss": 3.896575927734375, "eval_runtime": 0.768, "eval_samples_per_second": 1395.843, "eval_steps_per_second": 87.24, "step": 20000 }, { "epoch": 23.617820719269993, "eval_loss": 3.8046905994415283, "eval_runtime": 0.7601, "eval_samples_per_second": 1410.272, "eval_steps_per_second": 88.142, "step": 22000 }, { "epoch": 25.764895330112722, "grad_norm": 3.089989185333252, "learning_rate": 5.998250000000001e-05, "loss": 3.6154, "step": 24000 }, { "epoch": 25.764895330112722, "eval_loss": 3.7359092235565186, "eval_runtime": 0.7778, "eval_samples_per_second": 1378.238, "eval_steps_per_second": 86.14, "step": 24000 }, { "epoch": 27.911969940955448, "eval_loss": 3.6784231662750244, "eval_runtime": 0.7541, "eval_samples_per_second": 1421.491, "eval_steps_per_second": 88.843, "step": 26000 }, { "epoch": 30.059044551798173, "grad_norm": 3.3495922088623047, "learning_rate": 6.997500000000001e-05, "loss": 3.3661, "step": 28000 }, { "epoch": 30.059044551798173, "eval_loss": 3.6360318660736084, "eval_runtime": 0.7603, "eval_samples_per_second": 1409.913, "eval_steps_per_second": 88.12, "step": 28000 }, { "epoch": 32.2061191626409, "eval_loss": 3.6019041538238525, "eval_runtime": 0.7638, "eval_samples_per_second": 1403.547, "eval_steps_per_second": 87.722, "step": 30000 }, { "epoch": 34.35319377348363, "grad_norm": 3.1256051063537598, "learning_rate": 7.997250000000001e-05, "loss": 3.1473, "step": 32000 }, { "epoch": 34.35319377348363, "eval_loss": 3.5816054344177246, "eval_runtime": 0.7628, "eval_samples_per_second": 1405.391, "eval_steps_per_second": 87.837, "step": 32000 }, { "epoch": 36.500268384326354, "eval_loss": 3.5698702335357666, "eval_runtime": 0.7617, "eval_samples_per_second": 1407.381, "eval_steps_per_second": 87.961, "step": 34000 }, { "epoch": 38.64734299516908, "grad_norm": 2.921318769454956, "learning_rate": 8.997000000000001e-05, "loss": 2.9533, "step": 36000 }, { "epoch": 38.64734299516908, "eval_loss": 3.5649592876434326, "eval_runtime": 0.7607, "eval_samples_per_second": 1409.149, "eval_steps_per_second": 88.072, "step": 36000 }, { "epoch": 40.794417606011805, "eval_loss": 3.5666539669036865, "eval_runtime": 0.7558, "eval_samples_per_second": 1418.297, "eval_steps_per_second": 88.644, "step": 38000 }, { "epoch": 42.941492216854535, "grad_norm": 3.202237129211426, "learning_rate": 9.996500000000001e-05, "loss": 2.777, "step": 40000 }, { "epoch": 42.941492216854535, "eval_loss": 3.5746841430664062, "eval_runtime": 0.7722, "eval_samples_per_second": 1388.315, "eval_steps_per_second": 86.77, "step": 40000 }, { "epoch": 45.088566827697264, "eval_loss": 3.5878281593322754, "eval_runtime": 0.7639, "eval_samples_per_second": 1403.298, "eval_steps_per_second": 87.706, "step": 42000 }, { "epoch": 47.235641438539986, "grad_norm": 2.9902379512786865, "learning_rate": 9.336e-05, "loss": 2.6015, "step": 44000 }, { "epoch": 47.235641438539986, "eval_loss": 3.6107263565063477, "eval_runtime": 0.7601, "eval_samples_per_second": 1410.262, "eval_steps_per_second": 88.141, "step": 44000 }, { "epoch": 49.382716049382715, "eval_loss": 3.6260793209075928, "eval_runtime": 0.7605, "eval_samples_per_second": 1409.681, "eval_steps_per_second": 88.105, "step": 46000 }, { "epoch": 51.529790660225444, "grad_norm": 3.1626901626586914, "learning_rate": 8.669833333333334e-05, "loss": 2.4429, "step": 48000 }, { "epoch": 51.529790660225444, "eval_loss": 3.6414248943328857, "eval_runtime": 0.7596, "eval_samples_per_second": 1411.254, "eval_steps_per_second": 88.203, "step": 48000 }, { "epoch": 53.676865271068166, "eval_loss": 3.6636786460876465, "eval_runtime": 0.7637, "eval_samples_per_second": 1403.705, "eval_steps_per_second": 87.732, "step": 50000 }, { "epoch": 55.823939881910896, "grad_norm": 3.162830114364624, "learning_rate": 8.003333333333333e-05, "loss": 2.3125, "step": 52000 }, { "epoch": 55.823939881910896, "eval_loss": 3.6777544021606445, "eval_runtime": 0.7597, "eval_samples_per_second": 1411.137, "eval_steps_per_second": 88.196, "step": 52000 }, { "epoch": 57.971014492753625, "eval_loss": 3.703284502029419, "eval_runtime": 0.757, "eval_samples_per_second": 1416.089, "eval_steps_per_second": 88.506, "step": 54000 }, { "epoch": 60.11808910359635, "grad_norm": 3.4666566848754883, "learning_rate": 7.337000000000001e-05, "loss": 2.1989, "step": 56000 }, { "epoch": 60.11808910359635, "eval_loss": 3.7409818172454834, "eval_runtime": 0.7476, "eval_samples_per_second": 1433.993, "eval_steps_per_second": 89.625, "step": 56000 }, { "epoch": 62.265163714439076, "eval_loss": 3.7754786014556885, "eval_runtime": 0.7506, "eval_samples_per_second": 1428.237, "eval_steps_per_second": 89.265, "step": 58000 }, { "epoch": 64.4122383252818, "grad_norm": 3.3885388374328613, "learning_rate": 6.670666666666668e-05, "loss": 2.1044, "step": 60000 }, { "epoch": 64.4122383252818, "eval_loss": 3.7875912189483643, "eval_runtime": 0.7543, "eval_samples_per_second": 1421.139, "eval_steps_per_second": 88.821, "step": 60000 }, { "epoch": 66.55931293612453, "eval_loss": 3.8081138134002686, "eval_runtime": 0.7633, "eval_samples_per_second": 1404.466, "eval_steps_per_second": 87.779, "step": 62000 }, { "epoch": 68.70638754696726, "grad_norm": 3.763136148452759, "learning_rate": 6.004333333333334e-05, "loss": 2.0257, "step": 64000 }, { "epoch": 68.70638754696726, "eval_loss": 3.822213649749756, "eval_runtime": 0.7598, "eval_samples_per_second": 1410.912, "eval_steps_per_second": 88.182, "step": 64000 } ], "logging_steps": 4000, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 108, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6543844929634304e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }