|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.30140908748398765,
  "eval_steps": 400,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006028181749679753,
      "grad_norm": 0.38845974438826136,
      "learning_rate": 1.0040160642570282e-06,
      "loss": 1.6618,
      "step": 10
    },
    {
      "epoch": 0.012056363499359506,
      "grad_norm": 0.255603405930248,
      "learning_rate": 2.0080321285140564e-06,
      "loss": 1.6698,
      "step": 20
    },
    {
      "epoch": 0.01808454524903926,
      "grad_norm": 0.20116505722657768,
      "learning_rate": 3.0120481927710846e-06,
      "loss": 1.6264,
      "step": 30
    },
    {
      "epoch": 0.02411272699871901,
      "grad_norm": 0.1795881732266397,
      "learning_rate": 4.016064257028113e-06,
      "loss": 1.6025,
      "step": 40
    },
    {
      "epoch": 0.030140908748398764,
      "grad_norm": 0.14600421495766222,
      "learning_rate": 5.020080321285141e-06,
      "loss": 1.5584,
      "step": 50
    },
    {
      "epoch": 0.03616909049807852,
      "grad_norm": 0.1170942466306718,
      "learning_rate": 6.024096385542169e-06,
      "loss": 1.5604,
      "step": 60
    },
    {
      "epoch": 0.04219727224775827,
      "grad_norm": 0.2445974669666656,
      "learning_rate": 7.028112449799197e-06,
      "loss": 1.5057,
      "step": 70
    },
    {
      "epoch": 0.04822545399743802,
      "grad_norm": 0.11238025351136724,
      "learning_rate": 8.032128514056226e-06,
      "loss": 1.4724,
      "step": 80
    },
    {
      "epoch": 0.05425363574711778,
      "grad_norm": 0.13639577534070754,
      "learning_rate": 9.036144578313253e-06,
      "loss": 1.4962,
      "step": 90
    },
    {
      "epoch": 0.06028181749679753,
      "grad_norm": 0.17113178431310286,
      "learning_rate": 1.0040160642570281e-05,
      "loss": 1.4765,
      "step": 100
    },
    {
      "epoch": 0.06630999924647728,
      "grad_norm": 0.4795401637258333,
      "learning_rate": 1.104417670682731e-05,
      "loss": 1.4468,
      "step": 110
    },
    {
      "epoch": 0.07233818099615703,
      "grad_norm": 0.13385762516900662,
      "learning_rate": 1.2048192771084338e-05,
      "loss": 1.4134,
      "step": 120
    },
    {
      "epoch": 0.07836636274583679,
      "grad_norm": 0.12303707813666019,
      "learning_rate": 1.3052208835341367e-05,
      "loss": 1.4191,
      "step": 130
    },
    {
      "epoch": 0.08439454449551655,
      "grad_norm": 0.10822073133399364,
      "learning_rate": 1.4056224899598394e-05,
      "loss": 1.397,
      "step": 140
    },
    {
      "epoch": 0.09042272624519629,
      "grad_norm": 0.1109270134990499,
      "learning_rate": 1.5060240963855424e-05,
      "loss": 1.3818,
      "step": 150
    },
    {
      "epoch": 0.09645090799487605,
      "grad_norm": 0.14622173134033867,
      "learning_rate": 1.606425702811245e-05,
      "loss": 1.4002,
      "step": 160
    },
    {
      "epoch": 0.1024790897445558,
      "grad_norm": 0.10587626114414271,
      "learning_rate": 1.706827309236948e-05,
      "loss": 1.4123,
      "step": 170
    },
    {
      "epoch": 0.10850727149423556,
      "grad_norm": 0.10302196814593138,
      "learning_rate": 1.8072289156626505e-05,
      "loss": 1.4016,
      "step": 180
    },
    {
      "epoch": 0.1145354532439153,
      "grad_norm": 0.1359849724314843,
      "learning_rate": 1.9076305220883535e-05,
      "loss": 1.404,
      "step": 190
    },
    {
      "epoch": 0.12056363499359506,
      "grad_norm": 0.10587622358885339,
      "learning_rate": 2.0080321285140562e-05,
      "loss": 1.4019,
      "step": 200
    },
    {
      "epoch": 0.1265918167432748,
      "grad_norm": 0.15017595066321648,
      "learning_rate": 2.1084337349397593e-05,
      "loss": 1.393,
      "step": 210
    },
    {
      "epoch": 0.13261999849295456,
      "grad_norm": 0.19475575142022897,
      "learning_rate": 2.208835341365462e-05,
      "loss": 1.3876,
      "step": 220
    },
    {
      "epoch": 0.1386481802426343,
      "grad_norm": 0.12084095277263424,
      "learning_rate": 2.309236947791165e-05,
      "loss": 1.3916,
      "step": 230
    },
    {
      "epoch": 0.14467636199231407,
      "grad_norm": 0.11857482977859173,
      "learning_rate": 2.4096385542168677e-05,
      "loss": 1.4056,
      "step": 240
    },
    {
      "epoch": 0.15070454374199382,
      "grad_norm": 0.1403959719635503,
      "learning_rate": 2.5100401606425704e-05,
      "loss": 1.3935,
      "step": 250
    },
    {
      "epoch": 0.15673272549167358,
      "grad_norm": 0.10800155257965392,
      "learning_rate": 2.6104417670682734e-05,
      "loss": 1.3826,
      "step": 260
    },
    {
      "epoch": 0.16276090724135334,
      "grad_norm": 0.10598439909830581,
      "learning_rate": 2.7108433734939758e-05,
      "loss": 1.3999,
      "step": 270
    },
    {
      "epoch": 0.1687890889910331,
      "grad_norm": 0.10753449693494475,
      "learning_rate": 2.8112449799196788e-05,
      "loss": 1.4047,
      "step": 280
    },
    {
      "epoch": 0.17481727074071282,
      "grad_norm": 0.36718328659037996,
      "learning_rate": 2.911646586345382e-05,
      "loss": 1.3935,
      "step": 290
    },
    {
      "epoch": 0.18084545249039258,
      "grad_norm": 0.10611900000479042,
      "learning_rate": 3.012048192771085e-05,
      "loss": 1.3736,
      "step": 300
    },
    {
      "epoch": 0.18687363424007233,
      "grad_norm": 0.11901555220652378,
      "learning_rate": 3.112449799196787e-05,
      "loss": 1.3927,
      "step": 310
    },
    {
      "epoch": 0.1929018159897521,
      "grad_norm": 0.118935148513695,
      "learning_rate": 3.21285140562249e-05,
      "loss": 1.3636,
      "step": 320
    },
    {
      "epoch": 0.19892999773943185,
      "grad_norm": 0.1974545721831922,
      "learning_rate": 3.313253012048193e-05,
      "loss": 1.3892,
      "step": 330
    },
    {
      "epoch": 0.2049581794891116,
      "grad_norm": 0.13145409772199562,
      "learning_rate": 3.413654618473896e-05,
      "loss": 1.3756,
      "step": 340
    },
    {
      "epoch": 0.21098636123879136,
      "grad_norm": 0.11064380941915805,
      "learning_rate": 3.5140562248995983e-05,
      "loss": 1.3935,
      "step": 350
    },
    {
      "epoch": 0.21701454298847112,
      "grad_norm": 0.12160423827639648,
      "learning_rate": 3.614457831325301e-05,
      "loss": 1.3698,
      "step": 360
    },
    {
      "epoch": 0.22304272473815084,
      "grad_norm": 0.10349641889173723,
      "learning_rate": 3.7148594377510044e-05,
      "loss": 1.3771,
      "step": 370
    },
    {
      "epoch": 0.2290709064878306,
      "grad_norm": 0.10682144059511894,
      "learning_rate": 3.815261044176707e-05,
      "loss": 1.3768,
      "step": 380
    },
    {
      "epoch": 0.23509908823751036,
      "grad_norm": 0.11625245619819907,
      "learning_rate": 3.91566265060241e-05,
      "loss": 1.3795,
      "step": 390
    },
    {
      "epoch": 0.2411272699871901,
      "grad_norm": 0.10327726962763091,
      "learning_rate": 4.0160642570281125e-05,
      "loss": 1.3987,
      "step": 400
    },
    {
      "epoch": 0.2411272699871901,
      "eval_loss": 1.3548544645309448,
      "eval_runtime": 148.2269,
      "eval_samples_per_second": 7.239,
      "eval_steps_per_second": 0.911,
      "step": 400
    },
    {
      "epoch": 0.24715545173686987,
      "grad_norm": 0.10660530950921367,
      "learning_rate": 4.116465863453816e-05,
      "loss": 1.3886,
      "step": 410
    },
    {
      "epoch": 0.2531836334865496,
      "grad_norm": 0.10405582985373843,
      "learning_rate": 4.2168674698795186e-05,
      "loss": 1.3645,
      "step": 420
    },
    {
      "epoch": 0.2592118152362294,
      "grad_norm": 0.3318479326670041,
      "learning_rate": 4.317269076305221e-05,
      "loss": 1.3591,
      "step": 430
    },
    {
      "epoch": 0.2652399969859091,
      "grad_norm": 0.10840544026201794,
      "learning_rate": 4.417670682730924e-05,
      "loss": 1.3805,
      "step": 440
    },
    {
      "epoch": 0.2712681787355889,
      "grad_norm": 0.10730056620740543,
      "learning_rate": 4.5180722891566266e-05,
      "loss": 1.3888,
      "step": 450
    },
    {
      "epoch": 0.2772963604852686,
      "grad_norm": 0.10699620793474768,
      "learning_rate": 4.61847389558233e-05,
      "loss": 1.3935,
      "step": 460
    },
    {
      "epoch": 0.2833245422349484,
      "grad_norm": 0.10595493402596641,
      "learning_rate": 4.718875502008032e-05,
      "loss": 1.3659,
      "step": 470
    },
    {
      "epoch": 0.28935272398462814,
      "grad_norm": 0.14234040947748414,
      "learning_rate": 4.8192771084337354e-05,
      "loss": 1.371,
      "step": 480
    },
    {
      "epoch": 0.29538090573430786,
      "grad_norm": 0.1095349792774781,
      "learning_rate": 4.919678714859438e-05,
      "loss": 1.3647,
      "step": 490
    },
    {
      "epoch": 0.30140908748398765,
      "grad_norm": 0.10655792946130023,
      "learning_rate": 4.999997536857586e-05,
      "loss": 1.3606,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 4974,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 840529663229952.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
|
|