{
"best_metric": 5.091330528259277,
"best_model_checkpoint": "aristotle_new_layer_plain/checkpoint-2436",
"epoch": 14.0,
"eval_steps": 500,
"global_step": 2842,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.24721878862793573,
"grad_norm": 4.709590435028076,
"learning_rate": 5e-06,
"loss": 10.2189,
"step": 50
},
{
"epoch": 0.49443757725587145,
"grad_norm": 3.6981208324432373,
"learning_rate": 1e-05,
"loss": 9.1727,
"step": 100
},
{
"epoch": 0.7416563658838071,
"grad_norm": 3.9341259002685547,
"learning_rate": 1.5e-05,
"loss": 8.6877,
"step": 150
},
{
"epoch": 0.9888751545117429,
"grad_norm": 3.341215133666992,
"learning_rate": 2e-05,
"loss": 8.1544,
"step": 200
},
{
"epoch": 1.0,
"eval_loss": 7.7336225509643555,
"eval_runtime": 14.2669,
"eval_samples_per_second": 50.396,
"eval_steps_per_second": 6.308,
"step": 203
},
{
"epoch": 1.2323856613102595,
"grad_norm": 4.8560404777526855,
"learning_rate": 2.5e-05,
"loss": 7.5653,
"step": 250
},
{
"epoch": 1.4796044499381953,
"grad_norm": 3.1774024963378906,
"learning_rate": 3e-05,
"loss": 7.1093,
"step": 300
},
{
"epoch": 1.726823238566131,
"grad_norm": 4.33836030960083,
"learning_rate": 3.5e-05,
"loss": 6.7529,
"step": 350
},
{
"epoch": 1.9740420271940669,
"grad_norm": 2.5972180366516113,
"learning_rate": 4e-05,
"loss": 6.536,
"step": 400
},
{
"epoch": 2.0,
"eval_loss": 6.426527500152588,
"eval_runtime": 14.2681,
"eval_samples_per_second": 50.392,
"eval_steps_per_second": 6.308,
"step": 406
},
{
"epoch": 2.2175525339925835,
"grad_norm": 3.43729567527771,
"learning_rate": 4.5e-05,
"loss": 6.3559,
"step": 450
},
{
"epoch": 2.464771322620519,
"grad_norm": 3.318251848220825,
"learning_rate": 5e-05,
"loss": 6.3251,
"step": 500
},
{
"epoch": 2.711990111248455,
"grad_norm": 3.502115488052368,
"learning_rate": 4.9550359712230215e-05,
"loss": 6.1368,
"step": 550
},
{
"epoch": 2.9592088998763906,
"grad_norm": 3.497938394546509,
"learning_rate": 4.9100719424460435e-05,
"loss": 6.0775,
"step": 600
},
{
"epoch": 3.0,
"eval_loss": 6.036961555480957,
"eval_runtime": 14.2475,
"eval_samples_per_second": 50.465,
"eval_steps_per_second": 6.317,
"step": 609
},
{
"epoch": 3.202719406674907,
"grad_norm": 3.4355390071868896,
"learning_rate": 4.865107913669065e-05,
"loss": 5.8684,
"step": 650
},
{
"epoch": 3.449938195302843,
"grad_norm": 3.9220526218414307,
"learning_rate": 4.820143884892087e-05,
"loss": 5.8101,
"step": 700
},
{
"epoch": 3.6971569839307787,
"grad_norm": 3.782421827316284,
"learning_rate": 4.775179856115108e-05,
"loss": 5.7784,
"step": 750
},
{
"epoch": 3.9443757725587143,
"grad_norm": 3.5181567668914795,
"learning_rate": 4.7302158273381294e-05,
"loss": 5.7181,
"step": 800
},
{
"epoch": 4.0,
"eval_loss": 5.772126197814941,
"eval_runtime": 14.2949,
"eval_samples_per_second": 50.298,
"eval_steps_per_second": 6.296,
"step": 812
},
{
"epoch": 4.187886279357231,
"grad_norm": 3.6087594032287598,
"learning_rate": 4.685251798561151e-05,
"loss": 5.5154,
"step": 850
},
{
"epoch": 4.435105067985167,
"grad_norm": 3.8448667526245117,
"learning_rate": 4.640287769784173e-05,
"loss": 5.4664,
"step": 900
},
{
"epoch": 4.6823238566131025,
"grad_norm": 3.594693660736084,
"learning_rate": 4.595323741007194e-05,
"loss": 5.4121,
"step": 950
},
{
"epoch": 4.929542645241038,
"grad_norm": 3.6225693225860596,
"learning_rate": 4.550359712230216e-05,
"loss": 5.3158,
"step": 1000
},
{
"epoch": 5.0,
"eval_loss": 5.521895885467529,
"eval_runtime": 14.2777,
"eval_samples_per_second": 50.358,
"eval_steps_per_second": 6.304,
"step": 1015
},
{
"epoch": 5.173053152039555,
"grad_norm": 4.245815277099609,
"learning_rate": 4.505395683453237e-05,
"loss": 5.1744,
"step": 1050
},
{
"epoch": 5.420271940667491,
"grad_norm": 4.306251525878906,
"learning_rate": 4.460431654676259e-05,
"loss": 5.1245,
"step": 1100
},
{
"epoch": 5.667490729295427,
"grad_norm": 3.7834959030151367,
"learning_rate": 4.4154676258992806e-05,
"loss": 5.0729,
"step": 1150
},
{
"epoch": 5.914709517923362,
"grad_norm": 4.298359394073486,
"learning_rate": 4.3705035971223026e-05,
"loss": 5.0558,
"step": 1200
},
{
"epoch": 6.0,
"eval_loss": 5.389599323272705,
"eval_runtime": 14.2956,
"eval_samples_per_second": 50.295,
"eval_steps_per_second": 6.296,
"step": 1218
},
{
"epoch": 6.158220024721879,
"grad_norm": 4.416134357452393,
"learning_rate": 4.325539568345324e-05,
"loss": 4.839,
"step": 1250
},
{
"epoch": 6.405438813349814,
"grad_norm": 4.565963268280029,
"learning_rate": 4.280575539568346e-05,
"loss": 4.8297,
"step": 1300
},
{
"epoch": 6.652657601977751,
"grad_norm": 4.854921817779541,
"learning_rate": 4.235611510791367e-05,
"loss": 4.8175,
"step": 1350
},
{
"epoch": 6.899876390605686,
"grad_norm": 4.982056617736816,
"learning_rate": 4.1906474820143885e-05,
"loss": 4.8081,
"step": 1400
},
{
"epoch": 7.0,
"eval_loss": 5.254246711730957,
"eval_runtime": 14.2665,
"eval_samples_per_second": 50.398,
"eval_steps_per_second": 6.308,
"step": 1421
},
{
"epoch": 7.143386897404203,
"grad_norm": 4.195478439331055,
"learning_rate": 4.14568345323741e-05,
"loss": 4.6322,
"step": 1450
},
{
"epoch": 7.3906056860321385,
"grad_norm": 4.963181972503662,
"learning_rate": 4.100719424460432e-05,
"loss": 4.547,
"step": 1500
},
{
"epoch": 7.637824474660074,
"grad_norm": 5.290962219238281,
"learning_rate": 4.055755395683453e-05,
"loss": 4.5553,
"step": 1550
},
{
"epoch": 7.88504326328801,
"grad_norm": 5.0038838386535645,
"learning_rate": 4.010791366906475e-05,
"loss": 4.5651,
"step": 1600
},
{
"epoch": 8.0,
"eval_loss": 5.183382987976074,
"eval_runtime": 14.2977,
"eval_samples_per_second": 50.288,
"eval_steps_per_second": 6.295,
"step": 1624
},
{
"epoch": 8.128553770086526,
"grad_norm": 5.3380446434021,
"learning_rate": 3.965827338129496e-05,
"loss": 4.3966,
"step": 1650
},
{
"epoch": 8.375772558714463,
"grad_norm": 5.339470863342285,
"learning_rate": 3.920863309352518e-05,
"loss": 4.3068,
"step": 1700
},
{
"epoch": 8.622991347342397,
"grad_norm": 4.9476189613342285,
"learning_rate": 3.8758992805755396e-05,
"loss": 4.3249,
"step": 1750
},
{
"epoch": 8.870210135970334,
"grad_norm": 5.430028915405273,
"learning_rate": 3.8309352517985616e-05,
"loss": 4.3407,
"step": 1800
},
{
"epoch": 9.0,
"eval_loss": 5.13620138168335,
"eval_runtime": 14.2616,
"eval_samples_per_second": 50.415,
"eval_steps_per_second": 6.311,
"step": 1827
},
{
"epoch": 9.11372064276885,
"grad_norm": 5.2561259269714355,
"learning_rate": 3.785971223021583e-05,
"loss": 4.1746,
"step": 1850
},
{
"epoch": 9.360939431396787,
"grad_norm": 5.811314105987549,
"learning_rate": 3.741007194244605e-05,
"loss": 4.1324,
"step": 1900
},
{
"epoch": 9.608158220024722,
"grad_norm": 5.552155017852783,
"learning_rate": 3.696043165467626e-05,
"loss": 4.1058,
"step": 1950
},
{
"epoch": 9.855377008652658,
"grad_norm": 6.073920726776123,
"learning_rate": 3.6510791366906475e-05,
"loss": 4.0436,
"step": 2000
},
{
"epoch": 10.0,
"eval_loss": 5.104895114898682,
"eval_runtime": 14.2556,
"eval_samples_per_second": 50.436,
"eval_steps_per_second": 6.313,
"step": 2030
},
{
"epoch": 10.098887515451175,
"grad_norm": 5.994938373565674,
"learning_rate": 3.606115107913669e-05,
"loss": 3.99,
"step": 2050
},
{
"epoch": 10.34610630407911,
"grad_norm": 6.414961814880371,
"learning_rate": 3.561151079136691e-05,
"loss": 3.9013,
"step": 2100
},
{
"epoch": 10.593325092707046,
"grad_norm": 6.1248459815979,
"learning_rate": 3.516187050359712e-05,
"loss": 3.8884,
"step": 2150
},
{
"epoch": 10.840543881334982,
"grad_norm": 5.360867500305176,
"learning_rate": 3.471223021582734e-05,
"loss": 3.877,
"step": 2200
},
{
"epoch": 11.0,
"eval_loss": 5.103781700134277,
"eval_runtime": 14.2705,
"eval_samples_per_second": 50.384,
"eval_steps_per_second": 6.307,
"step": 2233
},
{
"epoch": 11.084054388133499,
"grad_norm": 5.840531349182129,
"learning_rate": 3.4262589928057554e-05,
"loss": 3.8216,
"step": 2250
},
{
"epoch": 11.331273176761433,
"grad_norm": 7.407821178436279,
"learning_rate": 3.3812949640287773e-05,
"loss": 3.6379,
"step": 2300
},
{
"epoch": 11.57849196538937,
"grad_norm": 7.770689487457275,
"learning_rate": 3.3363309352517986e-05,
"loss": 3.7063,
"step": 2350
},
{
"epoch": 11.825710754017305,
"grad_norm": 6.17808198928833,
"learning_rate": 3.2913669064748206e-05,
"loss": 3.7008,
"step": 2400
},
{
"epoch": 12.0,
"eval_loss": 5.091330528259277,
"eval_runtime": 14.3396,
"eval_samples_per_second": 50.141,
"eval_steps_per_second": 6.276,
"step": 2436
},
{
"epoch": 12.069221260815821,
"grad_norm": 6.636974334716797,
"learning_rate": 3.246402877697842e-05,
"loss": 3.6062,
"step": 2450
},
{
"epoch": 12.316440049443758,
"grad_norm": 6.129552364349365,
"learning_rate": 3.201438848920863e-05,
"loss": 3.4693,
"step": 2500
},
{
"epoch": 12.563658838071694,
"grad_norm": 7.458967208862305,
"learning_rate": 3.1564748201438845e-05,
"loss": 3.507,
"step": 2550
},
{
"epoch": 12.810877626699629,
"grad_norm": 6.7472243309021,
"learning_rate": 3.1115107913669065e-05,
"loss": 3.5042,
"step": 2600
},
{
"epoch": 13.0,
"eval_loss": 5.1072187423706055,
"eval_runtime": 14.225,
"eval_samples_per_second": 50.545,
"eval_steps_per_second": 6.327,
"step": 2639
},
{
"epoch": 13.054388133498145,
"grad_norm": 6.99562406539917,
"learning_rate": 3.066546762589928e-05,
"loss": 3.4307,
"step": 2650
},
{
"epoch": 13.301606922126082,
"grad_norm": 7.893692493438721,
"learning_rate": 3.0215827338129498e-05,
"loss": 3.2852,
"step": 2700
},
{
"epoch": 13.548825710754016,
"grad_norm": 7.662129878997803,
"learning_rate": 2.976618705035971e-05,
"loss": 3.3116,
"step": 2750
},
{
"epoch": 13.796044499381953,
"grad_norm": 7.612554550170898,
"learning_rate": 2.931654676258993e-05,
"loss": 3.3243,
"step": 2800
},
{
"epoch": 14.0,
"eval_loss": 5.115802764892578,
"eval_runtime": 14.2589,
"eval_samples_per_second": 50.425,
"eval_steps_per_second": 6.312,
"step": 2842
}
],
"logging_steps": 50,
"max_steps": 6060,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 2
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.3660516081664e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}