{ "best_metric": 5.091330528259277, "best_model_checkpoint": "aristotle_new_layer_plain/checkpoint-2436", "epoch": 12.0, "eval_steps": 500, "global_step": 2436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.24721878862793573, "grad_norm": 4.709590435028076, "learning_rate": 5e-06, "loss": 10.2189, "step": 50 }, { "epoch": 0.49443757725587145, "grad_norm": 3.6981208324432373, "learning_rate": 1e-05, "loss": 9.1727, "step": 100 }, { "epoch": 0.7416563658838071, "grad_norm": 3.9341259002685547, "learning_rate": 1.5e-05, "loss": 8.6877, "step": 150 }, { "epoch": 0.9888751545117429, "grad_norm": 3.341215133666992, "learning_rate": 2e-05, "loss": 8.1544, "step": 200 }, { "epoch": 1.0, "eval_loss": 7.7336225509643555, "eval_runtime": 14.2669, "eval_samples_per_second": 50.396, "eval_steps_per_second": 6.308, "step": 203 }, { "epoch": 1.2323856613102595, "grad_norm": 4.8560404777526855, "learning_rate": 2.5e-05, "loss": 7.5653, "step": 250 }, { "epoch": 1.4796044499381953, "grad_norm": 3.1774024963378906, "learning_rate": 3e-05, "loss": 7.1093, "step": 300 }, { "epoch": 1.726823238566131, "grad_norm": 4.33836030960083, "learning_rate": 3.5e-05, "loss": 6.7529, "step": 350 }, { "epoch": 1.9740420271940669, "grad_norm": 2.5972180366516113, "learning_rate": 4e-05, "loss": 6.536, "step": 400 }, { "epoch": 2.0, "eval_loss": 6.426527500152588, "eval_runtime": 14.2681, "eval_samples_per_second": 50.392, "eval_steps_per_second": 6.308, "step": 406 }, { "epoch": 2.2175525339925835, "grad_norm": 3.43729567527771, "learning_rate": 4.5e-05, "loss": 6.3559, "step": 450 }, { "epoch": 2.464771322620519, "grad_norm": 3.318251848220825, "learning_rate": 5e-05, "loss": 6.3251, "step": 500 }, { "epoch": 2.711990111248455, "grad_norm": 3.502115488052368, "learning_rate": 4.9550359712230215e-05, "loss": 6.1368, "step": 550 }, { "epoch": 2.9592088998763906, "grad_norm": 3.497938394546509, "learning_rate": 4.9100719424460435e-05, "loss": 6.0775, "step": 600 }, { "epoch": 3.0, "eval_loss": 6.036961555480957, "eval_runtime": 14.2475, "eval_samples_per_second": 50.465, "eval_steps_per_second": 6.317, "step": 609 }, { "epoch": 3.202719406674907, "grad_norm": 3.4355390071868896, "learning_rate": 4.865107913669065e-05, "loss": 5.8684, "step": 650 }, { "epoch": 3.449938195302843, "grad_norm": 3.9220526218414307, "learning_rate": 4.820143884892087e-05, "loss": 5.8101, "step": 700 }, { "epoch": 3.6971569839307787, "grad_norm": 3.782421827316284, "learning_rate": 4.775179856115108e-05, "loss": 5.7784, "step": 750 }, { "epoch": 3.9443757725587143, "grad_norm": 3.5181567668914795, "learning_rate": 4.7302158273381294e-05, "loss": 5.7181, "step": 800 }, { "epoch": 4.0, "eval_loss": 5.772126197814941, "eval_runtime": 14.2949, "eval_samples_per_second": 50.298, "eval_steps_per_second": 6.296, "step": 812 }, { "epoch": 4.187886279357231, "grad_norm": 3.6087594032287598, "learning_rate": 4.685251798561151e-05, "loss": 5.5154, "step": 850 }, { "epoch": 4.435105067985167, "grad_norm": 3.8448667526245117, "learning_rate": 4.640287769784173e-05, "loss": 5.4664, "step": 900 }, { "epoch": 4.6823238566131025, "grad_norm": 3.594693660736084, "learning_rate": 4.595323741007194e-05, "loss": 5.4121, "step": 950 }, { "epoch": 4.929542645241038, "grad_norm": 3.6225693225860596, "learning_rate": 4.550359712230216e-05, "loss": 5.3158, "step": 1000 }, { "epoch": 5.0, "eval_loss": 5.521895885467529, "eval_runtime": 14.2777, "eval_samples_per_second": 50.358, "eval_steps_per_second": 6.304, "step": 1015 }, { "epoch": 5.173053152039555, "grad_norm": 4.245815277099609, "learning_rate": 4.505395683453237e-05, "loss": 5.1744, "step": 1050 }, { "epoch": 5.420271940667491, "grad_norm": 4.306251525878906, "learning_rate": 4.460431654676259e-05, "loss": 5.1245, "step": 1100 }, { "epoch": 5.667490729295427, "grad_norm": 3.7834959030151367, "learning_rate": 4.4154676258992806e-05, "loss": 5.0729, "step": 1150 }, { "epoch": 5.914709517923362, "grad_norm": 4.298359394073486, "learning_rate": 4.3705035971223026e-05, "loss": 5.0558, "step": 1200 }, { "epoch": 6.0, "eval_loss": 5.389599323272705, "eval_runtime": 14.2956, "eval_samples_per_second": 50.295, "eval_steps_per_second": 6.296, "step": 1218 }, { "epoch": 6.158220024721879, "grad_norm": 4.416134357452393, "learning_rate": 4.325539568345324e-05, "loss": 4.839, "step": 1250 }, { "epoch": 6.405438813349814, "grad_norm": 4.565963268280029, "learning_rate": 4.280575539568346e-05, "loss": 4.8297, "step": 1300 }, { "epoch": 6.652657601977751, "grad_norm": 4.854921817779541, "learning_rate": 4.235611510791367e-05, "loss": 4.8175, "step": 1350 }, { "epoch": 6.899876390605686, "grad_norm": 4.982056617736816, "learning_rate": 4.1906474820143885e-05, "loss": 4.8081, "step": 1400 }, { "epoch": 7.0, "eval_loss": 5.254246711730957, "eval_runtime": 14.2665, "eval_samples_per_second": 50.398, "eval_steps_per_second": 6.308, "step": 1421 }, { "epoch": 7.143386897404203, "grad_norm": 4.195478439331055, "learning_rate": 4.14568345323741e-05, "loss": 4.6322, "step": 1450 }, { "epoch": 7.3906056860321385, "grad_norm": 4.963181972503662, "learning_rate": 4.100719424460432e-05, "loss": 4.547, "step": 1500 }, { "epoch": 7.637824474660074, "grad_norm": 5.290962219238281, "learning_rate": 4.055755395683453e-05, "loss": 4.5553, "step": 1550 }, { "epoch": 7.88504326328801, "grad_norm": 5.0038838386535645, "learning_rate": 4.010791366906475e-05, "loss": 4.5651, "step": 1600 }, { "epoch": 8.0, "eval_loss": 5.183382987976074, "eval_runtime": 14.2977, "eval_samples_per_second": 50.288, "eval_steps_per_second": 6.295, "step": 1624 }, { "epoch": 8.128553770086526, "grad_norm": 5.3380446434021, "learning_rate": 3.965827338129496e-05, "loss": 4.3966, "step": 1650 }, { "epoch": 8.375772558714463, "grad_norm": 5.339470863342285, "learning_rate": 3.920863309352518e-05, "loss": 4.3068, "step": 1700 }, { "epoch": 8.622991347342397, "grad_norm": 4.9476189613342285, "learning_rate": 3.8758992805755396e-05, "loss": 4.3249, "step": 1750 }, { "epoch": 8.870210135970334, "grad_norm": 5.430028915405273, "learning_rate": 3.8309352517985616e-05, "loss": 4.3407, "step": 1800 }, { "epoch": 9.0, "eval_loss": 5.13620138168335, "eval_runtime": 14.2616, "eval_samples_per_second": 50.415, "eval_steps_per_second": 6.311, "step": 1827 }, { "epoch": 9.11372064276885, "grad_norm": 5.2561259269714355, "learning_rate": 3.785971223021583e-05, "loss": 4.1746, "step": 1850 }, { "epoch": 9.360939431396787, "grad_norm": 5.811314105987549, "learning_rate": 3.741007194244605e-05, "loss": 4.1324, "step": 1900 }, { "epoch": 9.608158220024722, "grad_norm": 5.552155017852783, "learning_rate": 3.696043165467626e-05, "loss": 4.1058, "step": 1950 }, { "epoch": 9.855377008652658, "grad_norm": 6.073920726776123, "learning_rate": 3.6510791366906475e-05, "loss": 4.0436, "step": 2000 }, { "epoch": 10.0, "eval_loss": 5.104895114898682, "eval_runtime": 14.2556, "eval_samples_per_second": 50.436, "eval_steps_per_second": 6.313, "step": 2030 }, { "epoch": 10.098887515451175, "grad_norm": 5.994938373565674, "learning_rate": 3.606115107913669e-05, "loss": 3.99, "step": 2050 }, { "epoch": 10.34610630407911, "grad_norm": 6.414961814880371, "learning_rate": 3.561151079136691e-05, "loss": 3.9013, "step": 2100 }, { "epoch": 10.593325092707046, "grad_norm": 6.1248459815979, "learning_rate": 3.516187050359712e-05, "loss": 3.8884, "step": 2150 }, { "epoch": 10.840543881334982, "grad_norm": 5.360867500305176, "learning_rate": 3.471223021582734e-05, "loss": 3.877, "step": 2200 }, { "epoch": 11.0, "eval_loss": 5.103781700134277, "eval_runtime": 14.2705, "eval_samples_per_second": 50.384, "eval_steps_per_second": 6.307, "step": 2233 }, { "epoch": 11.084054388133499, "grad_norm": 5.840531349182129, "learning_rate": 3.4262589928057554e-05, "loss": 3.8216, "step": 2250 }, { "epoch": 11.331273176761433, "grad_norm": 7.407821178436279, "learning_rate": 3.3812949640287773e-05, "loss": 3.6379, "step": 2300 }, { "epoch": 11.57849196538937, "grad_norm": 7.770689487457275, "learning_rate": 3.3363309352517986e-05, "loss": 3.7063, "step": 2350 }, { "epoch": 11.825710754017305, "grad_norm": 6.17808198928833, "learning_rate": 3.2913669064748206e-05, "loss": 3.7008, "step": 2400 }, { "epoch": 12.0, "eval_loss": 5.091330528259277, "eval_runtime": 14.3396, "eval_samples_per_second": 50.141, "eval_steps_per_second": 6.276, "step": 2436 } ], "logging_steps": 50, "max_steps": 6060, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0280442355712e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }