|
{ |
|
"best_metric": 5.091330528259277, |
|
"best_model_checkpoint": "aristotle_new_layer_plain/checkpoint-2436", |
|
"epoch": 14.0, |
|
"eval_steps": 500, |
|
"global_step": 2842, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.24721878862793573, |
|
"grad_norm": 4.709590435028076, |
|
"learning_rate": 5e-06, |
|
"loss": 10.2189, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.49443757725587145, |
|
"grad_norm": 3.6981208324432373, |
|
"learning_rate": 1e-05, |
|
"loss": 9.1727, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7416563658838071, |
|
"grad_norm": 3.9341259002685547, |
|
"learning_rate": 1.5e-05, |
|
"loss": 8.6877, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9888751545117429, |
|
"grad_norm": 3.341215133666992, |
|
"learning_rate": 2e-05, |
|
"loss": 8.1544, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 7.7336225509643555, |
|
"eval_runtime": 14.2669, |
|
"eval_samples_per_second": 50.396, |
|
"eval_steps_per_second": 6.308, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.2323856613102595, |
|
"grad_norm": 4.8560404777526855, |
|
"learning_rate": 2.5e-05, |
|
"loss": 7.5653, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.4796044499381953, |
|
"grad_norm": 3.1774024963378906, |
|
"learning_rate": 3e-05, |
|
"loss": 7.1093, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.726823238566131, |
|
"grad_norm": 4.33836030960083, |
|
"learning_rate": 3.5e-05, |
|
"loss": 6.7529, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.9740420271940669, |
|
"grad_norm": 2.5972180366516113, |
|
"learning_rate": 4e-05, |
|
"loss": 6.536, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 6.426527500152588, |
|
"eval_runtime": 14.2681, |
|
"eval_samples_per_second": 50.392, |
|
"eval_steps_per_second": 6.308, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.2175525339925835, |
|
"grad_norm": 3.43729567527771, |
|
"learning_rate": 4.5e-05, |
|
"loss": 6.3559, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.464771322620519, |
|
"grad_norm": 3.318251848220825, |
|
"learning_rate": 5e-05, |
|
"loss": 6.3251, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.711990111248455, |
|
"grad_norm": 3.502115488052368, |
|
"learning_rate": 4.9550359712230215e-05, |
|
"loss": 6.1368, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.9592088998763906, |
|
"grad_norm": 3.497938394546509, |
|
"learning_rate": 4.9100719424460435e-05, |
|
"loss": 6.0775, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 6.036961555480957, |
|
"eval_runtime": 14.2475, |
|
"eval_samples_per_second": 50.465, |
|
"eval_steps_per_second": 6.317, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 3.202719406674907, |
|
"grad_norm": 3.4355390071868896, |
|
"learning_rate": 4.865107913669065e-05, |
|
"loss": 5.8684, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.449938195302843, |
|
"grad_norm": 3.9220526218414307, |
|
"learning_rate": 4.820143884892087e-05, |
|
"loss": 5.8101, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.6971569839307787, |
|
"grad_norm": 3.782421827316284, |
|
"learning_rate": 4.775179856115108e-05, |
|
"loss": 5.7784, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.9443757725587143, |
|
"grad_norm": 3.5181567668914795, |
|
"learning_rate": 4.7302158273381294e-05, |
|
"loss": 5.7181, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 5.772126197814941, |
|
"eval_runtime": 14.2949, |
|
"eval_samples_per_second": 50.298, |
|
"eval_steps_per_second": 6.296, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 4.187886279357231, |
|
"grad_norm": 3.6087594032287598, |
|
"learning_rate": 4.685251798561151e-05, |
|
"loss": 5.5154, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 4.435105067985167, |
|
"grad_norm": 3.8448667526245117, |
|
"learning_rate": 4.640287769784173e-05, |
|
"loss": 5.4664, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.6823238566131025, |
|
"grad_norm": 3.594693660736084, |
|
"learning_rate": 4.595323741007194e-05, |
|
"loss": 5.4121, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.929542645241038, |
|
"grad_norm": 3.6225693225860596, |
|
"learning_rate": 4.550359712230216e-05, |
|
"loss": 5.3158, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 5.521895885467529, |
|
"eval_runtime": 14.2777, |
|
"eval_samples_per_second": 50.358, |
|
"eval_steps_per_second": 6.304, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 5.173053152039555, |
|
"grad_norm": 4.245815277099609, |
|
"learning_rate": 4.505395683453237e-05, |
|
"loss": 5.1744, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 5.420271940667491, |
|
"grad_norm": 4.306251525878906, |
|
"learning_rate": 4.460431654676259e-05, |
|
"loss": 5.1245, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 5.667490729295427, |
|
"grad_norm": 3.7834959030151367, |
|
"learning_rate": 4.4154676258992806e-05, |
|
"loss": 5.0729, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 5.914709517923362, |
|
"grad_norm": 4.298359394073486, |
|
"learning_rate": 4.3705035971223026e-05, |
|
"loss": 5.0558, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 5.389599323272705, |
|
"eval_runtime": 14.2956, |
|
"eval_samples_per_second": 50.295, |
|
"eval_steps_per_second": 6.296, |
|
"step": 1218 |
|
}, |
|
{ |
|
"epoch": 6.158220024721879, |
|
"grad_norm": 4.416134357452393, |
|
"learning_rate": 4.325539568345324e-05, |
|
"loss": 4.839, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 6.405438813349814, |
|
"grad_norm": 4.565963268280029, |
|
"learning_rate": 4.280575539568346e-05, |
|
"loss": 4.8297, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 6.652657601977751, |
|
"grad_norm": 4.854921817779541, |
|
"learning_rate": 4.235611510791367e-05, |
|
"loss": 4.8175, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 6.899876390605686, |
|
"grad_norm": 4.982056617736816, |
|
"learning_rate": 4.1906474820143885e-05, |
|
"loss": 4.8081, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 5.254246711730957, |
|
"eval_runtime": 14.2665, |
|
"eval_samples_per_second": 50.398, |
|
"eval_steps_per_second": 6.308, |
|
"step": 1421 |
|
}, |
|
{ |
|
"epoch": 7.143386897404203, |
|
"grad_norm": 4.195478439331055, |
|
"learning_rate": 4.14568345323741e-05, |
|
"loss": 4.6322, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 7.3906056860321385, |
|
"grad_norm": 4.963181972503662, |
|
"learning_rate": 4.100719424460432e-05, |
|
"loss": 4.547, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 7.637824474660074, |
|
"grad_norm": 5.290962219238281, |
|
"learning_rate": 4.055755395683453e-05, |
|
"loss": 4.5553, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 7.88504326328801, |
|
"grad_norm": 5.0038838386535645, |
|
"learning_rate": 4.010791366906475e-05, |
|
"loss": 4.5651, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 5.183382987976074, |
|
"eval_runtime": 14.2977, |
|
"eval_samples_per_second": 50.288, |
|
"eval_steps_per_second": 6.295, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 8.128553770086526, |
|
"grad_norm": 5.3380446434021, |
|
"learning_rate": 3.965827338129496e-05, |
|
"loss": 4.3966, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 8.375772558714463, |
|
"grad_norm": 5.339470863342285, |
|
"learning_rate": 3.920863309352518e-05, |
|
"loss": 4.3068, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 8.622991347342397, |
|
"grad_norm": 4.9476189613342285, |
|
"learning_rate": 3.8758992805755396e-05, |
|
"loss": 4.3249, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 8.870210135970334, |
|
"grad_norm": 5.430028915405273, |
|
"learning_rate": 3.8309352517985616e-05, |
|
"loss": 4.3407, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 5.13620138168335, |
|
"eval_runtime": 14.2616, |
|
"eval_samples_per_second": 50.415, |
|
"eval_steps_per_second": 6.311, |
|
"step": 1827 |
|
}, |
|
{ |
|
"epoch": 9.11372064276885, |
|
"grad_norm": 5.2561259269714355, |
|
"learning_rate": 3.785971223021583e-05, |
|
"loss": 4.1746, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 9.360939431396787, |
|
"grad_norm": 5.811314105987549, |
|
"learning_rate": 3.741007194244605e-05, |
|
"loss": 4.1324, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 9.608158220024722, |
|
"grad_norm": 5.552155017852783, |
|
"learning_rate": 3.696043165467626e-05, |
|
"loss": 4.1058, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 9.855377008652658, |
|
"grad_norm": 6.073920726776123, |
|
"learning_rate": 3.6510791366906475e-05, |
|
"loss": 4.0436, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 5.104895114898682, |
|
"eval_runtime": 14.2556, |
|
"eval_samples_per_second": 50.436, |
|
"eval_steps_per_second": 6.313, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 10.098887515451175, |
|
"grad_norm": 5.994938373565674, |
|
"learning_rate": 3.606115107913669e-05, |
|
"loss": 3.99, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 10.34610630407911, |
|
"grad_norm": 6.414961814880371, |
|
"learning_rate": 3.561151079136691e-05, |
|
"loss": 3.9013, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 10.593325092707046, |
|
"grad_norm": 6.1248459815979, |
|
"learning_rate": 3.516187050359712e-05, |
|
"loss": 3.8884, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 10.840543881334982, |
|
"grad_norm": 5.360867500305176, |
|
"learning_rate": 3.471223021582734e-05, |
|
"loss": 3.877, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 5.103781700134277, |
|
"eval_runtime": 14.2705, |
|
"eval_samples_per_second": 50.384, |
|
"eval_steps_per_second": 6.307, |
|
"step": 2233 |
|
}, |
|
{ |
|
"epoch": 11.084054388133499, |
|
"grad_norm": 5.840531349182129, |
|
"learning_rate": 3.4262589928057554e-05, |
|
"loss": 3.8216, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 11.331273176761433, |
|
"grad_norm": 7.407821178436279, |
|
"learning_rate": 3.3812949640287773e-05, |
|
"loss": 3.6379, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 11.57849196538937, |
|
"grad_norm": 7.770689487457275, |
|
"learning_rate": 3.3363309352517986e-05, |
|
"loss": 3.7063, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 11.825710754017305, |
|
"grad_norm": 6.17808198928833, |
|
"learning_rate": 3.2913669064748206e-05, |
|
"loss": 3.7008, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 5.091330528259277, |
|
"eval_runtime": 14.3396, |
|
"eval_samples_per_second": 50.141, |
|
"eval_steps_per_second": 6.276, |
|
"step": 2436 |
|
}, |
|
{ |
|
"epoch": 12.069221260815821, |
|
"grad_norm": 6.636974334716797, |
|
"learning_rate": 3.246402877697842e-05, |
|
"loss": 3.6062, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 12.316440049443758, |
|
"grad_norm": 6.129552364349365, |
|
"learning_rate": 3.201438848920863e-05, |
|
"loss": 3.4693, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 12.563658838071694, |
|
"grad_norm": 7.458967208862305, |
|
"learning_rate": 3.1564748201438845e-05, |
|
"loss": 3.507, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 12.810877626699629, |
|
"grad_norm": 6.7472243309021, |
|
"learning_rate": 3.1115107913669065e-05, |
|
"loss": 3.5042, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 5.1072187423706055, |
|
"eval_runtime": 14.225, |
|
"eval_samples_per_second": 50.545, |
|
"eval_steps_per_second": 6.327, |
|
"step": 2639 |
|
}, |
|
{ |
|
"epoch": 13.054388133498145, |
|
"grad_norm": 6.99562406539917, |
|
"learning_rate": 3.066546762589928e-05, |
|
"loss": 3.4307, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 13.301606922126082, |
|
"grad_norm": 7.893692493438721, |
|
"learning_rate": 3.0215827338129498e-05, |
|
"loss": 3.2852, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 13.548825710754016, |
|
"grad_norm": 7.662129878997803, |
|
"learning_rate": 2.976618705035971e-05, |
|
"loss": 3.3116, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 13.796044499381953, |
|
"grad_norm": 7.612554550170898, |
|
"learning_rate": 2.931654676258993e-05, |
|
"loss": 3.3243, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 5.115802764892578, |
|
"eval_runtime": 14.2589, |
|
"eval_samples_per_second": 50.425, |
|
"eval_steps_per_second": 6.312, |
|
"step": 2842 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 6060, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 2 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.3660516081664e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|