{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995119570522206,
"eval_steps": 500,
"global_step": 512,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009760858955588092,
"grad_norm": 23.375,
"learning_rate": 0.0002,
"loss": 2.8524,
"step": 5
},
{
"epoch": 0.019521717911176184,
"grad_norm": 12.9375,
"learning_rate": 0.0002,
"loss": 1.7471,
"step": 10
},
{
"epoch": 0.029282576866764276,
"grad_norm": 5.34375,
"learning_rate": 0.0002,
"loss": 1.4572,
"step": 15
},
{
"epoch": 0.03904343582235237,
"grad_norm": 16.625,
"learning_rate": 0.0002,
"loss": 1.3997,
"step": 20
},
{
"epoch": 0.04880429477794046,
"grad_norm": 6.875,
"learning_rate": 0.0002,
"loss": 1.5013,
"step": 25
},
{
"epoch": 0.05856515373352855,
"grad_norm": 5.25,
"learning_rate": 0.0002,
"loss": 1.4325,
"step": 30
},
{
"epoch": 0.06832601268911664,
"grad_norm": 4.5,
"learning_rate": 0.0002,
"loss": 1.3938,
"step": 35
},
{
"epoch": 0.07808687164470474,
"grad_norm": 4.53125,
"learning_rate": 0.0002,
"loss": 1.4496,
"step": 40
},
{
"epoch": 0.08784773060029283,
"grad_norm": 4.90625,
"learning_rate": 0.0002,
"loss": 1.4919,
"step": 45
},
{
"epoch": 0.09760858955588092,
"grad_norm": 3.96875,
"learning_rate": 0.0002,
"loss": 1.4757,
"step": 50
},
{
"epoch": 0.10736944851146901,
"grad_norm": 5.8125,
"learning_rate": 0.0002,
"loss": 1.5236,
"step": 55
},
{
"epoch": 0.1171303074670571,
"grad_norm": 3.71875,
"learning_rate": 0.0002,
"loss": 1.464,
"step": 60
},
{
"epoch": 0.1268911664226452,
"grad_norm": 5.0,
"learning_rate": 0.0002,
"loss": 1.4982,
"step": 65
},
{
"epoch": 0.1366520253782333,
"grad_norm": 4.15625,
"learning_rate": 0.0002,
"loss": 1.3583,
"step": 70
},
{
"epoch": 0.14641288433382138,
"grad_norm": 3.203125,
"learning_rate": 0.0002,
"loss": 1.3943,
"step": 75
},
{
"epoch": 0.15617374328940947,
"grad_norm": 3.796875,
"learning_rate": 0.0002,
"loss": 1.4915,
"step": 80
},
{
"epoch": 0.16593460224499756,
"grad_norm": 4.15625,
"learning_rate": 0.0002,
"loss": 1.5084,
"step": 85
},
{
"epoch": 0.17569546120058566,
"grad_norm": 3.1875,
"learning_rate": 0.0002,
"loss": 1.4788,
"step": 90
},
{
"epoch": 0.18545632015617375,
"grad_norm": 7.28125,
"learning_rate": 0.0002,
"loss": 1.4738,
"step": 95
},
{
"epoch": 0.19521717911176184,
"grad_norm": 3.734375,
"learning_rate": 0.0002,
"loss": 1.4498,
"step": 100
},
{
"epoch": 0.20497803806734993,
"grad_norm": 7.15625,
"learning_rate": 0.0002,
"loss": 1.5028,
"step": 105
},
{
"epoch": 0.21473889702293802,
"grad_norm": 7.3125,
"learning_rate": 0.0002,
"loss": 1.4605,
"step": 110
},
{
"epoch": 0.22449975597852612,
"grad_norm": 3.53125,
"learning_rate": 0.0002,
"loss": 1.4047,
"step": 115
},
{
"epoch": 0.2342606149341142,
"grad_norm": 3.640625,
"learning_rate": 0.0002,
"loss": 1.4216,
"step": 120
},
{
"epoch": 0.2440214738897023,
"grad_norm": 3.5,
"learning_rate": 0.0002,
"loss": 1.4401,
"step": 125
},
{
"epoch": 0.2537823328452904,
"grad_norm": 2.78125,
"learning_rate": 0.0002,
"loss": 1.4301,
"step": 130
},
{
"epoch": 0.2635431918008785,
"grad_norm": 2.96875,
"learning_rate": 0.0002,
"loss": 1.4325,
"step": 135
},
{
"epoch": 0.2733040507564666,
"grad_norm": 2.8125,
"learning_rate": 0.0002,
"loss": 1.3988,
"step": 140
},
{
"epoch": 0.28306490971205467,
"grad_norm": 3.234375,
"learning_rate": 0.0002,
"loss": 1.3768,
"step": 145
},
{
"epoch": 0.29282576866764276,
"grad_norm": 5.0,
"learning_rate": 0.0002,
"loss": 1.4328,
"step": 150
},
{
"epoch": 0.30258662762323085,
"grad_norm": 4.0625,
"learning_rate": 0.0002,
"loss": 1.4189,
"step": 155
},
{
"epoch": 0.31234748657881894,
"grad_norm": 2.78125,
"learning_rate": 0.0002,
"loss": 1.3605,
"step": 160
},
{
"epoch": 0.32210834553440704,
"grad_norm": 3.40625,
"learning_rate": 0.0002,
"loss": 1.4092,
"step": 165
},
{
"epoch": 0.33186920448999513,
"grad_norm": 3.421875,
"learning_rate": 0.0002,
"loss": 1.3854,
"step": 170
},
{
"epoch": 0.3416300634455832,
"grad_norm": 3.375,
"learning_rate": 0.0002,
"loss": 1.3923,
"step": 175
},
{
"epoch": 0.3513909224011713,
"grad_norm": 2.609375,
"learning_rate": 0.0002,
"loss": 1.3395,
"step": 180
},
{
"epoch": 0.3611517813567594,
"grad_norm": 2.671875,
"learning_rate": 0.0002,
"loss": 1.3532,
"step": 185
},
{
"epoch": 0.3709126403123475,
"grad_norm": 2.765625,
"learning_rate": 0.0002,
"loss": 1.3764,
"step": 190
},
{
"epoch": 0.3806734992679356,
"grad_norm": 3.109375,
"learning_rate": 0.0002,
"loss": 1.3179,
"step": 195
},
{
"epoch": 0.3904343582235237,
"grad_norm": 2.71875,
"learning_rate": 0.0002,
"loss": 1.309,
"step": 200
},
{
"epoch": 0.4001952171791118,
"grad_norm": 2.9375,
"learning_rate": 0.0002,
"loss": 1.4215,
"step": 205
},
{
"epoch": 0.40995607613469986,
"grad_norm": 2.71875,
"learning_rate": 0.0002,
"loss": 1.4362,
"step": 210
},
{
"epoch": 0.41971693509028796,
"grad_norm": 2.875,
"learning_rate": 0.0002,
"loss": 1.3835,
"step": 215
},
{
"epoch": 0.42947779404587605,
"grad_norm": 2.46875,
"learning_rate": 0.0002,
"loss": 1.3099,
"step": 220
},
{
"epoch": 0.43923865300146414,
"grad_norm": 2.609375,
"learning_rate": 0.0002,
"loss": 1.3739,
"step": 225
},
{
"epoch": 0.44899951195705223,
"grad_norm": 2.890625,
"learning_rate": 0.0002,
"loss": 1.3374,
"step": 230
},
{
"epoch": 0.4587603709126403,
"grad_norm": 2.453125,
"learning_rate": 0.0002,
"loss": 1.3693,
"step": 235
},
{
"epoch": 0.4685212298682284,
"grad_norm": 2.625,
"learning_rate": 0.0002,
"loss": 1.3478,
"step": 240
},
{
"epoch": 0.4782820888238165,
"grad_norm": 2.78125,
"learning_rate": 0.0002,
"loss": 1.3685,
"step": 245
},
{
"epoch": 0.4880429477794046,
"grad_norm": 2.53125,
"learning_rate": 0.0002,
"loss": 1.2963,
"step": 250
},
{
"epoch": 0.4978038067349927,
"grad_norm": 2.359375,
"learning_rate": 0.0002,
"loss": 1.3452,
"step": 255
},
{
"epoch": 0.5075646656905808,
"grad_norm": 2.515625,
"learning_rate": 0.0002,
"loss": 1.3321,
"step": 260
},
{
"epoch": 0.5173255246461689,
"grad_norm": 2.65625,
"learning_rate": 0.0002,
"loss": 1.3702,
"step": 265
},
{
"epoch": 0.527086383601757,
"grad_norm": 3.296875,
"learning_rate": 0.0002,
"loss": 1.4063,
"step": 270
},
{
"epoch": 0.5368472425573451,
"grad_norm": 2.59375,
"learning_rate": 0.0002,
"loss": 1.2899,
"step": 275
},
{
"epoch": 0.5466081015129332,
"grad_norm": 2.453125,
"learning_rate": 0.0002,
"loss": 1.309,
"step": 280
},
{
"epoch": 0.5563689604685212,
"grad_norm": 2.5,
"learning_rate": 0.0002,
"loss": 1.3354,
"step": 285
},
{
"epoch": 0.5661298194241093,
"grad_norm": 2.578125,
"learning_rate": 0.0002,
"loss": 1.3682,
"step": 290
},
{
"epoch": 0.5758906783796974,
"grad_norm": 2.40625,
"learning_rate": 0.0002,
"loss": 1.3351,
"step": 295
},
{
"epoch": 0.5856515373352855,
"grad_norm": 2.65625,
"learning_rate": 0.0002,
"loss": 1.3483,
"step": 300
},
{
"epoch": 0.5954123962908736,
"grad_norm": 2.421875,
"learning_rate": 0.0002,
"loss": 1.292,
"step": 305
},
{
"epoch": 0.6051732552464617,
"grad_norm": 2.53125,
"learning_rate": 0.0002,
"loss": 1.3021,
"step": 310
},
{
"epoch": 0.6149341142020498,
"grad_norm": 2.453125,
"learning_rate": 0.0002,
"loss": 1.3805,
"step": 315
},
{
"epoch": 0.6246949731576379,
"grad_norm": 2.71875,
"learning_rate": 0.0002,
"loss": 1.3212,
"step": 320
},
{
"epoch": 0.634455832113226,
"grad_norm": 2.40625,
"learning_rate": 0.0002,
"loss": 1.2793,
"step": 325
},
{
"epoch": 0.6442166910688141,
"grad_norm": 2.53125,
"learning_rate": 0.0002,
"loss": 1.3733,
"step": 330
},
{
"epoch": 0.6539775500244022,
"grad_norm": 2.53125,
"learning_rate": 0.0002,
"loss": 1.2849,
"step": 335
},
{
"epoch": 0.6637384089799903,
"grad_norm": 2.59375,
"learning_rate": 0.0002,
"loss": 1.3962,
"step": 340
},
{
"epoch": 0.6734992679355783,
"grad_norm": 2.796875,
"learning_rate": 0.0002,
"loss": 1.3527,
"step": 345
},
{
"epoch": 0.6832601268911664,
"grad_norm": 2.828125,
"learning_rate": 0.0002,
"loss": 1.3286,
"step": 350
},
{
"epoch": 0.6930209858467545,
"grad_norm": 2.671875,
"learning_rate": 0.0002,
"loss": 1.3914,
"step": 355
},
{
"epoch": 0.7027818448023426,
"grad_norm": 5.125,
"learning_rate": 0.0002,
"loss": 1.3359,
"step": 360
},
{
"epoch": 0.7125427037579307,
"grad_norm": 2.609375,
"learning_rate": 0.0002,
"loss": 1.3115,
"step": 365
},
{
"epoch": 0.7223035627135188,
"grad_norm": 2.109375,
"learning_rate": 0.0002,
"loss": 1.3279,
"step": 370
},
{
"epoch": 0.7320644216691069,
"grad_norm": 7.15625,
"learning_rate": 0.0002,
"loss": 1.2885,
"step": 375
},
{
"epoch": 0.741825280624695,
"grad_norm": 3.390625,
"learning_rate": 0.0002,
"loss": 1.322,
"step": 380
},
{
"epoch": 0.7515861395802831,
"grad_norm": 3.109375,
"learning_rate": 0.0002,
"loss": 1.3047,
"step": 385
},
{
"epoch": 0.7613469985358712,
"grad_norm": 2.65625,
"learning_rate": 0.0002,
"loss": 1.2534,
"step": 390
},
{
"epoch": 0.7711078574914593,
"grad_norm": 2.734375,
"learning_rate": 0.0002,
"loss": 1.2539,
"step": 395
},
{
"epoch": 0.7808687164470474,
"grad_norm": 2.71875,
"learning_rate": 0.0002,
"loss": 1.277,
"step": 400
},
{
"epoch": 0.7906295754026355,
"grad_norm": 2.8125,
"learning_rate": 0.0002,
"loss": 1.2932,
"step": 405
},
{
"epoch": 0.8003904343582235,
"grad_norm": 2.296875,
"learning_rate": 0.0002,
"loss": 1.2782,
"step": 410
},
{
"epoch": 0.8101512933138116,
"grad_norm": 2.359375,
"learning_rate": 0.0002,
"loss": 1.294,
"step": 415
},
{
"epoch": 0.8199121522693997,
"grad_norm": 2.296875,
"learning_rate": 0.0002,
"loss": 1.2311,
"step": 420
},
{
"epoch": 0.8296730112249878,
"grad_norm": 2.28125,
"learning_rate": 0.0002,
"loss": 1.3014,
"step": 425
},
{
"epoch": 0.8394338701805759,
"grad_norm": 2.859375,
"learning_rate": 0.0002,
"loss": 1.325,
"step": 430
},
{
"epoch": 0.849194729136164,
"grad_norm": 2.125,
"learning_rate": 0.0002,
"loss": 1.187,
"step": 435
},
{
"epoch": 0.8589555880917521,
"grad_norm": 2.359375,
"learning_rate": 0.0002,
"loss": 1.2626,
"step": 440
},
{
"epoch": 0.8687164470473402,
"grad_norm": 2.3125,
"learning_rate": 0.0002,
"loss": 1.1967,
"step": 445
},
{
"epoch": 0.8784773060029283,
"grad_norm": 2.125,
"learning_rate": 0.0002,
"loss": 1.3065,
"step": 450
},
{
"epoch": 0.8882381649585164,
"grad_norm": 2.421875,
"learning_rate": 0.0002,
"loss": 1.2892,
"step": 455
},
{
"epoch": 0.8979990239141045,
"grad_norm": 2.3125,
"learning_rate": 0.0002,
"loss": 1.2817,
"step": 460
},
{
"epoch": 0.9077598828696926,
"grad_norm": 2.109375,
"learning_rate": 0.0002,
"loss": 1.2344,
"step": 465
},
{
"epoch": 0.9175207418252807,
"grad_norm": 2.359375,
"learning_rate": 0.0002,
"loss": 1.349,
"step": 470
},
{
"epoch": 0.9272816007808687,
"grad_norm": 2.21875,
"learning_rate": 0.0002,
"loss": 1.255,
"step": 475
},
{
"epoch": 0.9370424597364568,
"grad_norm": 2.03125,
"learning_rate": 0.0002,
"loss": 1.2741,
"step": 480
},
{
"epoch": 0.9468033186920449,
"grad_norm": 2.328125,
"learning_rate": 0.0002,
"loss": 1.3024,
"step": 485
},
{
"epoch": 0.956564177647633,
"grad_norm": 2.34375,
"learning_rate": 0.0002,
"loss": 1.297,
"step": 490
},
{
"epoch": 0.9663250366032211,
"grad_norm": 2.125,
"learning_rate": 0.0002,
"loss": 1.2095,
"step": 495
},
{
"epoch": 0.9760858955588092,
"grad_norm": 2.1875,
"learning_rate": 0.0002,
"loss": 1.3445,
"step": 500
},
{
"epoch": 0.9858467545143973,
"grad_norm": 2.78125,
"learning_rate": 0.0002,
"loss": 1.349,
"step": 505
},
{
"epoch": 0.9956076134699854,
"grad_norm": 2.296875,
"learning_rate": 0.0002,
"loss": 1.2542,
"step": 510
},
{
"epoch": 0.9995119570522206,
"step": 512,
"total_flos": 3518281600204800.0,
"train_loss": 1.3758189086802304,
"train_runtime": 549.4956,
"train_samples_per_second": 14.915,
"train_steps_per_second": 0.932
}
],
"logging_steps": 5,
"max_steps": 512,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3518281600204800.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}