| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9995119570522206, | |
| "eval_steps": 500, | |
| "global_step": 512, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.009760858955588092, | |
| "grad_norm": 23.375, | |
| "learning_rate": 0.0002, | |
| "loss": 2.8524, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.019521717911176184, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.7471, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.029282576866764276, | |
| "grad_norm": 5.34375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4572, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.03904343582235237, | |
| "grad_norm": 16.625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3997, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.04880429477794046, | |
| "grad_norm": 6.875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.5013, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.05856515373352855, | |
| "grad_norm": 5.25, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4325, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06832601268911664, | |
| "grad_norm": 4.5, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3938, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.07808687164470474, | |
| "grad_norm": 4.53125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4496, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08784773060029283, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4919, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.09760858955588092, | |
| "grad_norm": 3.96875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4757, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10736944851146901, | |
| "grad_norm": 5.8125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.5236, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.1171303074670571, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.464, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1268911664226452, | |
| "grad_norm": 5.0, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4982, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.1366520253782333, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3583, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.14641288433382138, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3943, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.15617374328940947, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4915, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.16593460224499756, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.5084, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.17569546120058566, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4788, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.18545632015617375, | |
| "grad_norm": 7.28125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4738, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.19521717911176184, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4498, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.20497803806734993, | |
| "grad_norm": 7.15625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.5028, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.21473889702293802, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4605, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.22449975597852612, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4047, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.2342606149341142, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4216, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2440214738897023, | |
| "grad_norm": 3.5, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4401, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2537823328452904, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4301, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2635431918008785, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4325, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.2733040507564666, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3988, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.28306490971205467, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3768, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.29282576866764276, | |
| "grad_norm": 5.0, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4328, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.30258662762323085, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4189, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.31234748657881894, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3605, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.32210834553440704, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4092, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.33186920448999513, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3854, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3416300634455832, | |
| "grad_norm": 3.375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3923, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.3513909224011713, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3395, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3611517813567594, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3532, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.3709126403123475, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3764, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3806734992679356, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3179, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.3904343582235237, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.309, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4001952171791118, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4215, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.40995607613469986, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4362, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.41971693509028796, | |
| "grad_norm": 2.875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3835, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.42947779404587605, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3099, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.43923865300146414, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3739, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.44899951195705223, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3374, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.4587603709126403, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3693, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.4685212298682284, | |
| "grad_norm": 2.625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3478, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4782820888238165, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3685, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.4880429477794046, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2963, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4978038067349927, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3452, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.5075646656905808, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3321, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5173255246461689, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3702, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.527086383601757, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4063, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5368472425573451, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2899, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.5466081015129332, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.309, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5563689604685212, | |
| "grad_norm": 2.5, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3354, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.5661298194241093, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3682, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5758906783796974, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3351, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.5856515373352855, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3483, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5954123962908736, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.292, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.6051732552464617, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3021, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6149341142020498, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3805, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.6246949731576379, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3212, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.634455832113226, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2793, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.6442166910688141, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3733, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6539775500244022, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2849, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.6637384089799903, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3962, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6734992679355783, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3527, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.6832601268911664, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3286, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6930209858467545, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3914, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.7027818448023426, | |
| "grad_norm": 5.125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3359, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7125427037579307, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3115, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.7223035627135188, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3279, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.7320644216691069, | |
| "grad_norm": 7.15625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2885, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.741825280624695, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.322, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.7515861395802831, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3047, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.7613469985358712, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2534, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7711078574914593, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2539, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.7808687164470474, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.277, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7906295754026355, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2932, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.8003904343582235, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2782, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8101512933138116, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.294, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.8199121522693997, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2311, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.8296730112249878, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3014, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.8394338701805759, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.325, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.849194729136164, | |
| "grad_norm": 2.125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.187, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.8589555880917521, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2626, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.8687164470473402, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1967, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.8784773060029283, | |
| "grad_norm": 2.125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3065, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8882381649585164, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2892, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.8979990239141045, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2817, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9077598828696926, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2344, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.9175207418252807, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.349, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.9272816007808687, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.255, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.9370424597364568, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2741, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.9468033186920449, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3024, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.956564177647633, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 0.0002, | |
| "loss": 1.297, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.9663250366032211, | |
| "grad_norm": 2.125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2095, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.9760858955588092, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3445, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9858467545143973, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 0.0002, | |
| "loss": 1.349, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.9956076134699854, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2542, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.9995119570522206, | |
| "step": 512, | |
| "total_flos": 3518281600204800.0, | |
| "train_loss": 1.3758189086802304, | |
| "train_runtime": 549.4956, | |
| "train_samples_per_second": 14.915, | |
| "train_steps_per_second": 0.932 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 512, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3518281600204800.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |