{
  "best_metric": 0.8025246858596802,
  "best_model_checkpoint": "miner_id_24/checkpoint-500",
  "epoch": 0.11486331265793706,
  "eval_steps": 50,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.00022972662531587412, "eval_loss": 1.124374270439148, "eval_runtime": 51.9298, "eval_samples_per_second": 35.298, "eval_steps_per_second": 8.839, "step": 1 },
    { "epoch": 0.002297266253158741, "grad_norm": 0.9001293778419495, "learning_rate": 4.12e-05, "loss": 0.9218, "step": 10 },
    { "epoch": 0.004594532506317482, "grad_norm": 1.1467550992965698, "learning_rate": 8.24e-05, "loss": 0.9375, "step": 20 },
    { "epoch": 0.006891798759476223, "grad_norm": 1.1466718912124634, "learning_rate": 0.0001236, "loss": 0.9289, "step": 30 },
    { "epoch": 0.009189065012634964, "grad_norm": 1.5765711069107056, "learning_rate": 0.0001648, "loss": 0.8709, "step": 40 },
    { "epoch": 0.011486331265793705, "grad_norm": 1.228388786315918, "learning_rate": 0.000206, "loss": 0.925, "step": 50 },
    { "epoch": 0.011486331265793705, "eval_loss": 0.8846560120582581, "eval_runtime": 52.2146, "eval_samples_per_second": 35.105, "eval_steps_per_second": 8.791, "step": 50 },
    { "epoch": 0.013783597518952447, "grad_norm": 0.6585016250610352, "learning_rate": 0.0002057490971767619, "loss": 0.7491, "step": 60 },
    { "epoch": 0.01608086377211119, "grad_norm": 0.8562206029891968, "learning_rate": 0.00020499761108038175, "loss": 0.8757, "step": 70 },
    { "epoch": 0.018378130025269928, "grad_norm": 0.7634038925170898, "learning_rate": 0.00020374920287558198, "loss": 0.9325, "step": 80 },
    { "epoch": 0.02067539627842867, "grad_norm": 0.8068410754203796, "learning_rate": 0.00020200995468164684, "loss": 0.842, "step": 90 },
    { "epoch": 0.02297266253158741, "grad_norm": 1.1460894346237183, "learning_rate": 0.00019978833994094855, "loss": 0.8731, "step": 100 },
    { "epoch": 0.02297266253158741, "eval_loss": 0.8611448407173157, "eval_runtime": 51.6983, "eval_samples_per_second": 35.456, "eval_steps_per_second": 8.878, "step": 100 },
    { "epoch": 0.02526992878474615, "grad_norm": 0.5959282517433167, "learning_rate": 0.00019709518213718787, "loss": 0.7564, "step": 110 },
    { "epoch": 0.027567195037904894, "grad_norm": 0.7839557528495789, "learning_rate": 0.00019394360206446948, "loss": 0.8659, "step": 120 },
    { "epoch": 0.029864461291063633, "grad_norm": 0.8548005819320679, "learning_rate": 0.00019034895390411186, "loss": 0.8915, "step": 130 },
    { "epoch": 0.03216172754422238, "grad_norm": 0.7380218505859375, "learning_rate": 0.0001863287504206196, "loss": 0.8667, "step": 140 },
    { "epoch": 0.03445899379738112, "grad_norm": 1.0800361633300781, "learning_rate": 0.00018190257764125471, "loss": 0.848, "step": 150 },
    { "epoch": 0.03445899379738112, "eval_loss": 0.8501473665237427, "eval_runtime": 51.9217, "eval_samples_per_second": 35.303, "eval_steps_per_second": 8.84, "step": 150 },
    { "epoch": 0.036756260050539856, "grad_norm": 0.5680871605873108, "learning_rate": 0.00017709199943488106, "loss": 0.7338, "step": 160 },
    { "epoch": 0.0390535263036986, "grad_norm": 0.6372123956680298, "learning_rate": 0.00017192045245496238, "loss": 0.8033, "step": 170 },
    { "epoch": 0.04135079255685734, "grad_norm": 0.8441762924194336, "learning_rate": 0.00016641313195854277, "loss": 0.9452, "step": 180 },
    { "epoch": 0.04364805881001608, "grad_norm": 0.7282130718231201, "learning_rate": 0.0001605968690574869, "loss": 0.8036, "step": 190 },
    { "epoch": 0.04594532506317482, "grad_norm": 1.0529526472091675, "learning_rate": 0.0001545, "loss": 0.8569, "step": 200 },
    { "epoch": 0.04594532506317482, "eval_loss": 0.8399880528450012, "eval_runtime": 51.6878, "eval_samples_per_second": 35.463, "eval_steps_per_second": 8.88, "step": 200 },
    { "epoch": 0.048242591316333565, "grad_norm": 0.5307555794715881, "learning_rate": 0.00014815222811927496, "loss": 0.747, "step": 210 },
    { "epoch": 0.0505398575694923, "grad_norm": 0.7486038208007812, "learning_rate": 0.00014158447912183896, "loss": 0.8806, "step": 220 },
    { "epoch": 0.052837123822651044, "grad_norm": 0.875223696231842, "learning_rate": 0.00013482875042061958, "loss": 0.9061, "step": 230 },
    { "epoch": 0.05513439007580979, "grad_norm": 0.7279521226882935, "learning_rate": 0.00012791795524676576, "loss": 0.8071, "step": 240 },
    { "epoch": 0.05743165632896853, "grad_norm": 1.203727126121521, "learning_rate": 0.00012088576229969385, "loss": 0.8783, "step": 250 },
    { "epoch": 0.05743165632896853, "eval_loss": 0.8261856436729431, "eval_runtime": 51.6149, "eval_samples_per_second": 35.513, "eval_steps_per_second": 8.893, "step": 250 },
    { "epoch": 0.05972892258212727, "grad_norm": 0.5175067782402039, "learning_rate": 0.0001137664317165683, "loss": 0.6935, "step": 260 },
    { "epoch": 0.06202618883528601, "grad_norm": 0.6327574849128723, "learning_rate": 0.00010659464816035761, "loss": 0.7742, "step": 270 },
    { "epoch": 0.06432345508844475, "grad_norm": 0.7658243775367737, "learning_rate": 9.940535183964242e-05, "loss": 0.8565, "step": 280 },
    { "epoch": 0.06662072134160349, "grad_norm": 0.7925461530685425, "learning_rate": 9.22335682834317e-05, "loss": 0.835, "step": 290 },
    { "epoch": 0.06891798759476224, "grad_norm": 0.9995759129524231, "learning_rate": 8.511423770030617e-05, "loss": 0.8554, "step": 300 },
    { "epoch": 0.06891798759476224, "eval_loss": 0.817738950252533, "eval_runtime": 51.6421, "eval_samples_per_second": 35.494, "eval_steps_per_second": 8.888, "step": 300 },
    { "epoch": 0.07121525384792098, "grad_norm": 0.5488688945770264, "learning_rate": 7.808204475323423e-05, "loss": 0.692, "step": 310 },
    { "epoch": 0.07351252010107971, "grad_norm": 0.6316297054290771, "learning_rate": 7.117124957938042e-05, "loss": 0.8166, "step": 320 },
    { "epoch": 0.07580978635423846, "grad_norm": 0.6976966857910156, "learning_rate": 6.441552087816105e-05, "loss": 0.9222, "step": 330 },
    { "epoch": 0.0781070526073972, "grad_norm": 0.6356428265571594, "learning_rate": 5.784777188072502e-05, "loss": 0.852, "step": 340 },
    { "epoch": 0.08040431886055593, "grad_norm": 0.9787359833717346, "learning_rate": 5.150000000000002e-05, "loss": 0.8186, "step": 350 },
    { "epoch": 0.08040431886055593, "eval_loss": 0.8114416003227234, "eval_runtime": 51.6228, "eval_samples_per_second": 35.508, "eval_steps_per_second": 8.891, "step": 350 },
    { "epoch": 0.08270158511371468, "grad_norm": 0.5780043601989746, "learning_rate": 4.540313094251309e-05, "loss": 0.6796, "step": 360 },
    { "epoch": 0.08499885136687342, "grad_norm": 0.7033482193946838, "learning_rate": 3.958686804145719e-05, "loss": 0.8087, "step": 370 },
    { "epoch": 0.08729611762003216, "grad_norm": 0.5974348783493042, "learning_rate": 3.4079547545037634e-05, "loss": 0.7923, "step": 380 },
    { "epoch": 0.08959338387319091, "grad_norm": 0.8100835084915161, "learning_rate": 2.8908000565118947e-05, "loss": 0.8478, "step": 390 },
    { "epoch": 0.09189065012634964, "grad_norm": 0.816436231136322, "learning_rate": 2.4097422358745275e-05, "loss": 0.8498, "step": 400 },
    { "epoch": 0.09189065012634964, "eval_loss": 0.8052087426185608, "eval_runtime": 51.7512, "eval_samples_per_second": 35.419, "eval_steps_per_second": 8.869, "step": 400 },
    { "epoch": 0.09418791637950838, "grad_norm": 0.5542159080505371, "learning_rate": 1.9671249579380422e-05, "loss": 0.7486, "step": 410 },
    { "epoch": 0.09648518263266713, "grad_norm": 0.5713785886764526, "learning_rate": 1.5651046095888127e-05, "loss": 0.7955, "step": 420 },
    { "epoch": 0.09878244888582587, "grad_norm": 0.7074142098426819, "learning_rate": 1.205639793553052e-05, "loss": 0.8298, "step": 430 },
    { "epoch": 0.1010797151389846, "grad_norm": 0.6469036936759949, "learning_rate": 8.904817862812098e-06, "loss": 0.8189, "step": 440 },
    { "epoch": 0.10337698139214335, "grad_norm": 0.9961839914321899, "learning_rate": 6.211660059051443e-06, "loss": 0.816, "step": 450 },
    { "epoch": 0.10337698139214335, "eval_loss": 0.803685188293457, "eval_runtime": 52.0877, "eval_samples_per_second": 35.191, "eval_steps_per_second": 8.812, "step": 450 },
    { "epoch": 0.10567424764530209, "grad_norm": 0.5176745057106018, "learning_rate": 3.990045318353154e-06, "loss": 0.7438, "step": 460 },
    { "epoch": 0.10797151389846082, "grad_norm": 0.5562933087348938, "learning_rate": 2.250797124418014e-06, "loss": 0.8519, "step": 470 },
    { "epoch": 0.11026878015161957, "grad_norm": 0.634845495223999, "learning_rate": 1.0023889196182526e-06, "loss": 0.8369, "step": 480 },
    { "epoch": 0.11256604640477831, "grad_norm": 0.7085196375846863, "learning_rate": 2.5090282323810766e-07, "loss": 0.8191, "step": 490 },
    { "epoch": 0.11486331265793706, "grad_norm": 1.0037809610366821, "learning_rate": 0.0, "loss": 0.8017, "step": 500 },
    { "epoch": 0.11486331265793706, "eval_loss": 0.8025246858596802, "eval_runtime": 51.8757, "eval_samples_per_second": 35.334, "eval_steps_per_second": 8.848, "step": 500 }
  ],
  "logging_steps": 10,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.39979133403136e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}