|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 198, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010129787907565685, |
|
"grad_norm": 5.634124615502245, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3589, |
|
"mean_token_accuracy": 0.9127573072910309, |
|
"num_tokens": 586686.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10129787907565685, |
|
"grad_norm": 0.34120253622850233, |
|
"learning_rate": 0.0001, |
|
"loss": 0.2653, |
|
"mean_token_accuracy": 0.9179378294696411, |
|
"num_tokens": 5928997.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2025957581513137, |
|
"grad_norm": 0.15880751179799044, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1607, |
|
"mean_token_accuracy": 0.9390914304181933, |
|
"num_tokens": 11864766.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.30389363722697055, |
|
"grad_norm": 0.10858768982732472, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1389, |
|
"mean_token_accuracy": 0.9470763374119997, |
|
"num_tokens": 17788976.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4051915163026274, |
|
"grad_norm": 0.07843910135917725, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1254, |
|
"mean_token_accuracy": 0.9517961731180549, |
|
"num_tokens": 23762431.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5064893953782843, |
|
"grad_norm": 0.09149076132992681, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1173, |
|
"mean_token_accuracy": 0.9551124922931195, |
|
"num_tokens": 29691182.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6077872744539411, |
|
"grad_norm": 0.0988060674250123, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1126, |
|
"mean_token_accuracy": 0.9569881336763502, |
|
"num_tokens": 35615925.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.709085153529598, |
|
"grad_norm": 0.0747939087236935, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1078, |
|
"mean_token_accuracy": 0.9588578680530191, |
|
"num_tokens": 41567014.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.8103830326052548, |
|
"grad_norm": 0.09381859172196332, |
|
"learning_rate": 0.0001, |
|
"loss": 0.107, |
|
"mean_token_accuracy": 0.9592851245775819, |
|
"num_tokens": 47465256.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.9116809116809117, |
|
"grad_norm": 0.07959394726227396, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1045, |
|
"mean_token_accuracy": 0.9601302666589617, |
|
"num_tokens": 53394634.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.0101297879075657, |
|
"grad_norm": 0.08937759002808963, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1018, |
|
"mean_token_accuracy": 0.9612304681367047, |
|
"num_tokens": 59195734.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.1114276669832226, |
|
"grad_norm": 0.07683886672466819, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0887, |
|
"mean_token_accuracy": 0.9659375650808215, |
|
"num_tokens": 65134645.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.2127255460588793, |
|
"grad_norm": 0.08142019069177424, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0885, |
|
"mean_token_accuracy": 0.965754240937531, |
|
"num_tokens": 71072294.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.3140234251345362, |
|
"grad_norm": 0.07420029680099594, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0892, |
|
"mean_token_accuracy": 0.9654547093436122, |
|
"num_tokens": 77036690.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.415321304210193, |
|
"grad_norm": 0.082199072710958, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0908, |
|
"mean_token_accuracy": 0.9649530470371246, |
|
"num_tokens": 82943314.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.51661918328585, |
|
"grad_norm": 0.07212494518614486, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0906, |
|
"mean_token_accuracy": 0.9651403101161122, |
|
"num_tokens": 88875871.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.617917062361507, |
|
"grad_norm": 0.10352344172757606, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0909, |
|
"mean_token_accuracy": 0.9648977383971215, |
|
"num_tokens": 94820130.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.7192149414371638, |
|
"grad_norm": 0.07698503957302266, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0915, |
|
"mean_token_accuracy": 0.9645912747830152, |
|
"num_tokens": 100733124.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.8205128205128205, |
|
"grad_norm": 0.08462828819382234, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0911, |
|
"mean_token_accuracy": 0.9649000752717256, |
|
"num_tokens": 106690216.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.9218106995884774, |
|
"grad_norm": 0.06837628398080642, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0915, |
|
"mean_token_accuracy": 0.9646391872316599, |
|
"num_tokens": 112618930.0, |
|
"step": 190 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 392, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 309419476910080.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|