|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.9489751417357173, |
|
"eval_steps": 500, |
|
"global_step": 284, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.013955516790231139, |
|
"grad_norm": 4.9564723930657655, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3584, |
|
"mean_token_accuracy": 0.916381947696209, |
|
"num_tokens": 794149.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.13955516790231137, |
|
"grad_norm": 0.2761209164773756, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2481, |
|
"mean_token_accuracy": 0.9236801053500838, |
|
"num_tokens": 7995437.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.27911033580462274, |
|
"grad_norm": 0.09969812837236781, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1429, |
|
"mean_token_accuracy": 0.9472733726724982, |
|
"num_tokens": 15966760.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.4186655037069342, |
|
"grad_norm": 0.06494669674949567, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1206, |
|
"mean_token_accuracy": 0.9548009386286139, |
|
"num_tokens": 23944501.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5582206716092455, |
|
"grad_norm": 0.06828323186623608, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1069, |
|
"mean_token_accuracy": 0.959625343978405, |
|
"num_tokens": 31937884.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6977758395115569, |
|
"grad_norm": 0.06165242715272168, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0979, |
|
"mean_token_accuracy": 0.9630069149658084, |
|
"num_tokens": 39961024.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8373310074138683, |
|
"grad_norm": 0.11090642053667181, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0915, |
|
"mean_token_accuracy": 0.965560769662261, |
|
"num_tokens": 47949799.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9768861753161797, |
|
"grad_norm": 0.06459506102599961, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0873, |
|
"mean_token_accuracy": 0.9674661625176668, |
|
"num_tokens": 55962901.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.111644134321849, |
|
"grad_norm": 0.10339694562330957, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0795, |
|
"mean_token_accuracy": 0.9700713963956124, |
|
"num_tokens": 63670548.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.2511993022241605, |
|
"grad_norm": 0.0950394988703407, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0758, |
|
"mean_token_accuracy": 0.9714846231043339, |
|
"num_tokens": 71651053.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.3907544701264718, |
|
"grad_norm": 0.07782860090437102, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0739, |
|
"mean_token_accuracy": 0.9722085015848279, |
|
"num_tokens": 79640732.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5303096380287833, |
|
"grad_norm": 0.07920993179360719, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0736, |
|
"mean_token_accuracy": 0.9723761970177293, |
|
"num_tokens": 87641211.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.6698648059310948, |
|
"grad_norm": 0.07411448710660809, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0722, |
|
"mean_token_accuracy": 0.9729392532259226, |
|
"num_tokens": 95648655.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.809419973833406, |
|
"grad_norm": 0.05976275661821771, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0721, |
|
"mean_token_accuracy": 0.9728728512302041, |
|
"num_tokens": 103620940.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.9489751417357173, |
|
"grad_norm": 0.06809511858490988, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0718, |
|
"mean_token_accuracy": 0.9729661472141743, |
|
"num_tokens": 111618326.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.083733100741387, |
|
"grad_norm": 0.07797576138180032, |
|
"learning_rate": 0.0001, |
|
"loss": 0.063, |
|
"mean_token_accuracy": 0.9762524333586585, |
|
"num_tokens": 119354987.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.223288268643698, |
|
"grad_norm": 0.06579714178222891, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0579, |
|
"mean_token_accuracy": 0.9778176054358483, |
|
"num_tokens": 127354165.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.36284343654601, |
|
"grad_norm": 0.06657618820877312, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0585, |
|
"mean_token_accuracy": 0.9775434693321585, |
|
"num_tokens": 135368633.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.502398604448321, |
|
"grad_norm": 0.0804632921365247, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0601, |
|
"mean_token_accuracy": 0.9770329371094704, |
|
"num_tokens": 143337210.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.6419537723506323, |
|
"grad_norm": 0.06313698724150443, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0602, |
|
"mean_token_accuracy": 0.9769743764773011, |
|
"num_tokens": 151337995.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.7815089402529436, |
|
"grad_norm": 0.06376621347327172, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0611, |
|
"mean_token_accuracy": 0.9767070030793548, |
|
"num_tokens": 159338042.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.921064108155255, |
|
"grad_norm": 0.06559121682492687, |
|
"learning_rate": 0.0001, |
|
"loss": 0.062, |
|
"mean_token_accuracy": 0.9763033416122198, |
|
"num_tokens": 167307219.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.0558220671609244, |
|
"grad_norm": 0.10041231718814958, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0559, |
|
"mean_token_accuracy": 0.9786613514122454, |
|
"num_tokens": 175030018.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.195377235063236, |
|
"grad_norm": 0.08091195651854872, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0473, |
|
"mean_token_accuracy": 0.9815034126862884, |
|
"num_tokens": 183021967.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.3349324029655474, |
|
"grad_norm": 0.07575741175199704, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0489, |
|
"mean_token_accuracy": 0.9808894643560052, |
|
"num_tokens": 191014779.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.4744875708678586, |
|
"grad_norm": 0.07829702223737112, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0498, |
|
"mean_token_accuracy": 0.9806048579514026, |
|
"num_tokens": 199025571.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.61404273877017, |
|
"grad_norm": 0.06942538891461214, |
|
"learning_rate": 0.0001, |
|
"loss": 0.051, |
|
"mean_token_accuracy": 0.9801934180781245, |
|
"num_tokens": 207011679.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.7535979066724816, |
|
"grad_norm": 0.07675336174060962, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0515, |
|
"mean_token_accuracy": 0.9799842674285173, |
|
"num_tokens": 215009066.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.893153074574793, |
|
"grad_norm": 0.07733704641759509, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0527, |
|
"mean_token_accuracy": 0.9795725893229246, |
|
"num_tokens": 223003497.0, |
|
"step": 280 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 284, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 507232006963200.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|