{ "best_metric": 4.225712299346924, "best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained/en_clm/wikipedia_30/checkpoint-68000", "epoch": 35.7612411254273, "eval_steps": 2000, "global_step": 68000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0518012095713911, "eval_loss": 7.518920421600342, "eval_runtime": 2.2399, "eval_samples_per_second": 1422.808, "eval_steps_per_second": 89.288, "step": 2000 }, { "epoch": 2.1036024191427822, "grad_norm": 1.2947496175765991, "learning_rate": 1e-05, "loss": 7.6141, "step": 4000 }, { "epoch": 2.1036024191427822, "eval_loss": 6.568221092224121, "eval_runtime": 2.1236, "eval_samples_per_second": 1500.783, "eval_steps_per_second": 94.182, "step": 4000 }, { "epoch": 3.155403628714173, "eval_loss": 6.16925573348999, "eval_runtime": 2.1624, "eval_samples_per_second": 1473.825, "eval_steps_per_second": 92.49, "step": 6000 }, { "epoch": 4.2072048382855645, "grad_norm": 2.330080270767212, "learning_rate": 2e-05, "loss": 6.2396, "step": 8000 }, { "epoch": 4.2072048382855645, "eval_loss": 5.901575088500977, "eval_runtime": 2.1326, "eval_samples_per_second": 1494.388, "eval_steps_per_second": 93.78, "step": 8000 }, { "epoch": 5.259006047856955, "eval_loss": 5.67369270324707, "eval_runtime": 2.1364, "eval_samples_per_second": 1491.788, "eval_steps_per_second": 93.617, "step": 10000 }, { "epoch": 6.310807257428346, "grad_norm": 2.565220594406128, "learning_rate": 2.99925e-05, "loss": 5.7217, "step": 12000 }, { "epoch": 6.310807257428346, "eval_loss": 5.460005760192871, "eval_runtime": 2.1835, "eval_samples_per_second": 1459.563, "eval_steps_per_second": 91.595, "step": 12000 }, { "epoch": 7.362608466999737, "eval_loss": 5.272489547729492, "eval_runtime": 2.1833, "eval_samples_per_second": 1459.691, "eval_steps_per_second": 91.603, "step": 14000 }, { "epoch": 8.414409676571129, "grad_norm": 2.7334115505218506, "learning_rate": 3.99875e-05, "loss": 5.3064, "step": 16000 }, { "epoch": 8.414409676571129, "eval_loss": 5.1074652671813965, "eval_runtime": 2.2402, "eval_samples_per_second": 1422.652, "eval_steps_per_second": 89.278, "step": 16000 }, { "epoch": 9.46621088614252, "eval_loss": 4.9716596603393555, "eval_runtime": 2.1736, "eval_samples_per_second": 1466.258, "eval_steps_per_second": 92.015, "step": 18000 }, { "epoch": 10.51801209571391, "grad_norm": 2.6722376346588135, "learning_rate": 4.9985e-05, "loss": 4.9744, "step": 20000 }, { "epoch": 10.51801209571391, "eval_loss": 4.864916801452637, "eval_runtime": 2.2031, "eval_samples_per_second": 1446.585, "eval_steps_per_second": 90.78, "step": 20000 }, { "epoch": 11.569813305285301, "eval_loss": 4.763542652130127, "eval_runtime": 2.4382, "eval_samples_per_second": 1307.131, "eval_steps_per_second": 82.029, "step": 22000 }, { "epoch": 12.621614514856692, "grad_norm": 2.4539408683776855, "learning_rate": 5.9985e-05, "loss": 4.7273, "step": 24000 }, { "epoch": 12.621614514856692, "eval_loss": 4.683297634124756, "eval_runtime": 2.452, "eval_samples_per_second": 1299.733, "eval_steps_per_second": 81.565, "step": 24000 }, { "epoch": 13.673415724428082, "eval_loss": 4.621574401855469, "eval_runtime": 2.1825, "eval_samples_per_second": 1460.226, "eval_steps_per_second": 91.636, "step": 26000 }, { "epoch": 14.725216933999475, "grad_norm": 2.4913346767425537, "learning_rate": 6.998e-05, "loss": 4.5397, "step": 28000 }, { "epoch": 14.725216933999475, "eval_loss": 4.562623977661133, "eval_runtime": 2.201, "eval_samples_per_second": 1447.965, "eval_steps_per_second": 90.867, "step": 28000 }, { "epoch": 15.777018143570865, "eval_loss": 4.506168842315674, "eval_runtime": 2.2491, "eval_samples_per_second": 1416.988, "eval_steps_per_second": 88.923, "step": 30000 }, { "epoch": 16.828819353142258, "grad_norm": 2.2661523818969727, "learning_rate": 7.99775e-05, "loss": 4.3839, "step": 32000 }, { "epoch": 16.828819353142258, "eval_loss": 4.464449882507324, "eval_runtime": 2.253, "eval_samples_per_second": 1414.55, "eval_steps_per_second": 88.77, "step": 32000 }, { "epoch": 17.88062056271365, "eval_loss": 4.426318168640137, "eval_runtime": 2.2688, "eval_samples_per_second": 1404.709, "eval_steps_per_second": 88.152, "step": 34000 }, { "epoch": 18.93242177228504, "grad_norm": 2.226025104522705, "learning_rate": 8.9975e-05, "loss": 4.2529, "step": 36000 }, { "epoch": 18.93242177228504, "eval_loss": 4.3944478034973145, "eval_runtime": 2.294, "eval_samples_per_second": 1389.273, "eval_steps_per_second": 87.184, "step": 36000 }, { "epoch": 19.98422298185643, "eval_loss": 4.365135669708252, "eval_runtime": 2.2176, "eval_samples_per_second": 1437.165, "eval_steps_per_second": 90.189, "step": 38000 }, { "epoch": 21.03602419142782, "grad_norm": 2.1669206619262695, "learning_rate": 9.997e-05, "loss": 4.1409, "step": 40000 }, { "epoch": 21.03602419142782, "eval_loss": 4.351284503936768, "eval_runtime": 2.1934, "eval_samples_per_second": 1453.028, "eval_steps_per_second": 91.185, "step": 40000 }, { "epoch": 22.08782540099921, "eval_loss": 4.321634292602539, "eval_runtime": 2.2273, "eval_samples_per_second": 1430.895, "eval_steps_per_second": 89.796, "step": 42000 }, { "epoch": 23.139626610570602, "grad_norm": 2.0360896587371826, "learning_rate": 9.335333333333333e-05, "loss": 4.0334, "step": 44000 }, { "epoch": 23.139626610570602, "eval_loss": 4.300807476043701, "eval_runtime": 2.3226, "eval_samples_per_second": 1372.142, "eval_steps_per_second": 86.109, "step": 44000 }, { "epoch": 24.191427820141993, "eval_loss": 4.279116153717041, "eval_runtime": 2.2177, "eval_samples_per_second": 1437.082, "eval_steps_per_second": 90.184, "step": 46000 }, { "epoch": 25.243229029713383, "grad_norm": 2.137535572052002, "learning_rate": 8.668833333333334e-05, "loss": 3.9378, "step": 48000 }, { "epoch": 25.243229029713383, "eval_loss": 4.2718939781188965, "eval_runtime": 2.2019, "eval_samples_per_second": 1447.39, "eval_steps_per_second": 90.831, "step": 48000 }, { "epoch": 26.295030239284774, "eval_loss": 4.26237678527832, "eval_runtime": 2.2417, "eval_samples_per_second": 1421.698, "eval_steps_per_second": 89.219, "step": 50000 }, { "epoch": 27.346831448856165, "grad_norm": 2.1131036281585693, "learning_rate": 8.002333333333334e-05, "loss": 3.8595, "step": 52000 }, { "epoch": 27.346831448856165, "eval_loss": 4.253220558166504, "eval_runtime": 2.2045, "eval_samples_per_second": 1445.692, "eval_steps_per_second": 90.724, "step": 52000 }, { "epoch": 28.39863265842756, "eval_loss": 4.243188381195068, "eval_runtime": 2.1654, "eval_samples_per_second": 1471.751, "eval_steps_per_second": 92.36, "step": 54000 }, { "epoch": 29.45043386799895, "grad_norm": 2.1626858711242676, "learning_rate": 7.335833333333335e-05, "loss": 3.7927, "step": 56000 }, { "epoch": 29.45043386799895, "eval_loss": 4.237761974334717, "eval_runtime": 2.3384, "eval_samples_per_second": 1362.919, "eval_steps_per_second": 85.53, "step": 56000 }, { "epoch": 30.50223507757034, "eval_loss": 4.237727165222168, "eval_runtime": 2.2527, "eval_samples_per_second": 1414.746, "eval_steps_per_second": 88.782, "step": 58000 }, { "epoch": 31.55403628714173, "grad_norm": 2.1217854022979736, "learning_rate": 6.669333333333334e-05, "loss": 3.7367, "step": 60000 }, { "epoch": 31.55403628714173, "eval_loss": 4.233393669128418, "eval_runtime": 2.2874, "eval_samples_per_second": 1393.302, "eval_steps_per_second": 87.437, "step": 60000 }, { "epoch": 32.60583749671312, "eval_loss": 4.229984283447266, "eval_runtime": 2.3158, "eval_samples_per_second": 1376.171, "eval_steps_per_second": 86.362, "step": 62000 }, { "epoch": 33.657638706284516, "grad_norm": 2.3173110485076904, "learning_rate": 6.003e-05, "loss": 3.6869, "step": 64000 }, { "epoch": 33.657638706284516, "eval_loss": 4.226880073547363, "eval_runtime": 2.3034, "eval_samples_per_second": 1383.631, "eval_steps_per_second": 86.83, "step": 64000 }, { "epoch": 34.7094399158559, "eval_loss": 4.224526405334473, "eval_runtime": 2.2795, "eval_samples_per_second": 1398.138, "eval_steps_per_second": 87.74, "step": 66000 }, { "epoch": 35.7612411254273, "grad_norm": 2.3296120166778564, "learning_rate": 5.336333333333333e-05, "loss": 3.6416, "step": 68000 }, { "epoch": 35.7612411254273, "eval_loss": 4.225712299346924, "eval_runtime": 2.2719, "eval_samples_per_second": 1402.762, "eval_steps_per_second": 88.03, "step": 68000 } ], "logging_steps": 4000, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 53, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7578256733563904e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }