{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 20.0,
  "eval_steps": 500,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.5,
      "grad_norm": 14.416919708251953,
      "learning_rate": 9.75e-05,
      "loss": 0.3808,
      "step": 10
    },
    {
      "epoch": 1.0,
      "grad_norm": 14.942909240722656,
      "learning_rate": 9.5e-05,
      "loss": 0.1636,
      "step": 20
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.1972920000553131,
      "eval_mse": 0.1972920149564743,
      "eval_runtime": 1.8069,
      "eval_samples_per_second": 21.584,
      "eval_steps_per_second": 2.767,
      "step": 20
    },
    {
      "epoch": 1.5,
      "grad_norm": 2.4087517261505127,
      "learning_rate": 9.250000000000001e-05,
      "loss": 0.2302,
      "step": 30
    },
    {
      "epoch": 2.0,
      "grad_norm": 21.778547286987305,
      "learning_rate": 9e-05,
      "loss": 0.1331,
      "step": 40
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.046439673751592636,
      "eval_mse": 0.046439677476882935,
      "eval_runtime": 1.622,
      "eval_samples_per_second": 24.044,
      "eval_steps_per_second": 3.083,
      "step": 40
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.1269134283065796,
      "learning_rate": 8.75e-05,
      "loss": 0.05,
      "step": 50
    },
    {
      "epoch": 3.0,
      "grad_norm": 4.305675983428955,
      "learning_rate": 8.5e-05,
      "loss": 0.0289,
      "step": 60
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.03575298935174942,
      "eval_mse": 0.03575298190116882,
      "eval_runtime": 1.6208,
      "eval_samples_per_second": 24.063,
      "eval_steps_per_second": 3.085,
      "step": 60
    },
    {
      "epoch": 3.5,
      "grad_norm": 1.7014061212539673,
      "learning_rate": 8.25e-05,
      "loss": 0.0246,
      "step": 70
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.39174097776412964,
      "learning_rate": 8e-05,
      "loss": 0.0221,
      "step": 80
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.03326353803277016,
      "eval_mse": 0.03326353803277016,
      "eval_runtime": 1.6114,
      "eval_samples_per_second": 24.203,
      "eval_steps_per_second": 3.103,
      "step": 80
    },
    {
      "epoch": 4.5,
      "grad_norm": 2.6145267486572266,
      "learning_rate": 7.75e-05,
      "loss": 0.021,
      "step": 90
    },
    {
      "epoch": 5.0,
      "grad_norm": 13.847771644592285,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.0223,
      "step": 100
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.03398064896464348,
      "eval_mse": 0.033980656415224075,
      "eval_runtime": 1.6515,
      "eval_samples_per_second": 23.615,
      "eval_steps_per_second": 3.028,
      "step": 100
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.43660324811935425,
      "learning_rate": 7.25e-05,
      "loss": 0.0131,
      "step": 110
    },
    {
      "epoch": 6.0,
      "grad_norm": 2.1774399280548096,
      "learning_rate": 7e-05,
      "loss": 0.0117,
      "step": 120
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.04457540437579155,
      "eval_mse": 0.04457540065050125,
      "eval_runtime": 1.6081,
      "eval_samples_per_second": 24.252,
      "eval_steps_per_second": 3.109,
      "step": 120
    },
    {
      "epoch": 6.5,
      "grad_norm": 2.61739182472229,
      "learning_rate": 6.750000000000001e-05,
      "loss": 0.0168,
      "step": 130
    },
    {
      "epoch": 7.0,
      "grad_norm": 2.7107529640197754,
      "learning_rate": 6.500000000000001e-05,
      "loss": 0.0107,
      "step": 140
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.03702976927161217,
      "eval_mse": 0.037029776722192764,
      "eval_runtime": 1.6027,
      "eval_samples_per_second": 24.334,
      "eval_steps_per_second": 3.12,
      "step": 140
    },
    {
      "epoch": 7.5,
      "grad_norm": 4.807140350341797,
      "learning_rate": 6.25e-05,
      "loss": 0.0112,
      "step": 150
    },
    {
      "epoch": 8.0,
      "grad_norm": 1.6699814796447754,
      "learning_rate": 6e-05,
      "loss": 0.0096,
      "step": 160
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.03073795698583126,
      "eval_mse": 0.030737943947315216,
      "eval_runtime": 1.678,
      "eval_samples_per_second": 23.242,
      "eval_steps_per_second": 2.98,
      "step": 160
    },
    {
      "epoch": 8.5,
      "grad_norm": 5.444133281707764,
      "learning_rate": 5.7499999999999995e-05,
      "loss": 0.0099,
      "step": 170
    },
    {
      "epoch": 9.0,
      "grad_norm": 1.5312561988830566,
      "learning_rate": 5.500000000000001e-05,
      "loss": 0.0142,
      "step": 180
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.03504549711942673,
      "eval_mse": 0.03504551202058792,
      "eval_runtime": 1.6103,
      "eval_samples_per_second": 24.218,
      "eval_steps_per_second": 3.105,
      "step": 180
    },
    {
      "epoch": 9.5,
      "grad_norm": 1.527550220489502,
      "learning_rate": 5.25e-05,
      "loss": 0.0051,
      "step": 190
    },
    {
      "epoch": 10.0,
      "grad_norm": 1.0232219696044922,
      "learning_rate": 5e-05,
      "loss": 0.0069,
      "step": 200
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.032399099320173264,
      "eval_mse": 0.03239908814430237,
      "eval_runtime": 1.61,
      "eval_samples_per_second": 24.224,
      "eval_steps_per_second": 3.106,
      "step": 200
    },
    {
      "epoch": 10.5,
      "grad_norm": 1.1013288497924805,
      "learning_rate": 4.75e-05,
      "loss": 0.0034,
      "step": 210
    },
    {
      "epoch": 11.0,
      "grad_norm": 0.35051777958869934,
      "learning_rate": 4.5e-05,
      "loss": 0.0028,
      "step": 220
    },
    {
      "epoch": 11.0,
      "eval_loss": 0.02933628484606743,
      "eval_mse": 0.029336294159293175,
      "eval_runtime": 1.7012,
      "eval_samples_per_second": 22.925,
      "eval_steps_per_second": 2.939,
      "step": 220
    },
    {
      "epoch": 11.5,
      "grad_norm": 1.1170843839645386,
      "learning_rate": 4.25e-05,
      "loss": 0.0019,
      "step": 230
    },
    {
      "epoch": 12.0,
      "grad_norm": 1.3299288749694824,
      "learning_rate": 4e-05,
      "loss": 0.0044,
      "step": 240
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.028278259560465813,
      "eval_mse": 0.028278270736336708,
      "eval_runtime": 1.5914,
      "eval_samples_per_second": 24.506,
      "eval_steps_per_second": 3.142,
      "step": 240
    },
    {
      "epoch": 12.5,
      "grad_norm": 1.6604584455490112,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.002,
      "step": 250
    },
    {
      "epoch": 13.0,
      "grad_norm": 1.2441127300262451,
      "learning_rate": 3.5e-05,
      "loss": 0.0011,
      "step": 260
    },
    {
      "epoch": 13.0,
      "eval_loss": 0.029920559376478195,
      "eval_mse": 0.029920564964413643,
      "eval_runtime": 1.6282,
      "eval_samples_per_second": 23.953,
      "eval_steps_per_second": 3.071,
      "step": 260
    },
    {
      "epoch": 13.5,
      "grad_norm": 0.7714802026748657,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 0.0008,
      "step": 270
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.5498138070106506,
      "learning_rate": 3e-05,
      "loss": 0.0005,
      "step": 280
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.027942122891545296,
      "eval_mse": 0.027942117303609848,
      "eval_runtime": 1.5994,
      "eval_samples_per_second": 24.384,
      "eval_steps_per_second": 3.126,
      "step": 280
    },
    {
      "epoch": 14.5,
      "grad_norm": 0.5462870001792908,
      "learning_rate": 2.7500000000000004e-05,
      "loss": 0.0006,
      "step": 290
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.32672354578971863,
      "learning_rate": 2.5e-05,
      "loss": 0.0005,
      "step": 300
    },
    {
      "epoch": 15.0,
      "eval_loss": 0.029117202386260033,
      "eval_mse": 0.02911720797419548,
      "eval_runtime": 1.595,
      "eval_samples_per_second": 24.451,
      "eval_steps_per_second": 3.135,
      "step": 300
    },
    {
      "epoch": 15.5,
      "grad_norm": 0.7088171243667603,
      "learning_rate": 2.25e-05,
      "loss": 0.0012,
      "step": 310
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.3224898874759674,
      "learning_rate": 2e-05,
      "loss": 0.0011,
      "step": 320
    },
    {
      "epoch": 16.0,
      "eval_loss": 0.028802577406167984,
      "eval_mse": 0.028802569955587387,
      "eval_runtime": 1.6242,
      "eval_samples_per_second": 24.012,
      "eval_steps_per_second": 3.078,
      "step": 320
    },
    {
      "epoch": 16.5,
      "grad_norm": 0.2536928951740265,
      "learning_rate": 1.75e-05,
      "loss": 0.0002,
      "step": 330
    },
    {
      "epoch": 17.0,
      "grad_norm": 0.2693057060241699,
      "learning_rate": 1.5e-05,
      "loss": 0.0003,
      "step": 340
    },
    {
      "epoch": 17.0,
      "eval_loss": 0.028974896296858788,
      "eval_mse": 0.02897489443421364,
      "eval_runtime": 1.5937,
      "eval_samples_per_second": 24.472,
      "eval_steps_per_second": 3.137,
      "step": 340
    },
    {
      "epoch": 17.5,
      "grad_norm": 0.22231905162334442,
      "learning_rate": 1.25e-05,
      "loss": 0.0001,
      "step": 350
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.44173210859298706,
      "learning_rate": 1e-05,
      "loss": 0.0001,
      "step": 360
    },
    {
      "epoch": 18.0,
      "eval_loss": 0.029911378398537636,
      "eval_mse": 0.029911383986473083,
      "eval_runtime": 1.5913,
      "eval_samples_per_second": 24.509,
      "eval_steps_per_second": 3.142,
      "step": 360
    },
    {
      "epoch": 18.5,
      "grad_norm": 0.2958744764328003,
      "learning_rate": 7.5e-06,
      "loss": 0.0001,
      "step": 370
    },
    {
      "epoch": 19.0,
      "grad_norm": 0.41316938400268555,
      "learning_rate": 5e-06,
      "loss": 0.0001,
      "step": 380
    },
    {
      "epoch": 19.0,
      "eval_loss": 0.029724078252911568,
      "eval_mse": 0.029724083840847015,
      "eval_runtime": 1.5907,
      "eval_samples_per_second": 24.518,
      "eval_steps_per_second": 3.143,
      "step": 380
    },
    {
      "epoch": 19.5,
      "grad_norm": 0.0391409695148468,
      "learning_rate": 2.5e-06,
      "loss": 0.0,
      "step": 390
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.021498844027519226,
      "learning_rate": 0.0,
      "loss": 0.0,
      "step": 400
    }
  ],
  "logging_steps": 10,
  "max_steps": 400,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}