{ "best_metric": 0.8025246858596802, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.11486331265793706, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00022972662531587412, "eval_loss": 1.124374270439148, "eval_runtime": 51.9298, "eval_samples_per_second": 35.298, "eval_steps_per_second": 8.839, "step": 1 }, { "epoch": 0.002297266253158741, "grad_norm": 0.9001293778419495, "learning_rate": 4.12e-05, "loss": 0.9218, "step": 10 }, { "epoch": 0.004594532506317482, "grad_norm": 1.1467550992965698, "learning_rate": 8.24e-05, "loss": 0.9375, "step": 20 }, { "epoch": 0.006891798759476223, "grad_norm": 1.1466718912124634, "learning_rate": 0.0001236, "loss": 0.9289, "step": 30 }, { "epoch": 0.009189065012634964, "grad_norm": 1.5765711069107056, "learning_rate": 0.0001648, "loss": 0.8709, "step": 40 }, { "epoch": 0.011486331265793705, "grad_norm": 1.228388786315918, "learning_rate": 0.000206, "loss": 0.925, "step": 50 }, { "epoch": 0.011486331265793705, "eval_loss": 0.8846560120582581, "eval_runtime": 52.2146, "eval_samples_per_second": 35.105, "eval_steps_per_second": 8.791, "step": 50 }, { "epoch": 0.013783597518952447, "grad_norm": 0.6585016250610352, "learning_rate": 0.0002057490971767619, "loss": 0.7491, "step": 60 }, { "epoch": 0.01608086377211119, "grad_norm": 0.8562206029891968, "learning_rate": 0.00020499761108038175, "loss": 0.8757, "step": 70 }, { "epoch": 0.018378130025269928, "grad_norm": 0.7634038925170898, "learning_rate": 0.00020374920287558198, "loss": 0.9325, "step": 80 }, { "epoch": 0.02067539627842867, "grad_norm": 0.8068410754203796, "learning_rate": 0.00020200995468164684, "loss": 0.842, "step": 90 }, { "epoch": 0.02297266253158741, "grad_norm": 1.1460894346237183, "learning_rate": 0.00019978833994094855, "loss": 0.8731, "step": 100 }, { "epoch": 0.02297266253158741, "eval_loss": 0.8611448407173157, "eval_runtime": 51.6983, "eval_samples_per_second": 35.456, "eval_steps_per_second": 8.878, "step": 100 }, { "epoch": 0.02526992878474615, "grad_norm": 0.5959282517433167, "learning_rate": 0.00019709518213718787, "loss": 0.7564, "step": 110 }, { "epoch": 0.027567195037904894, "grad_norm": 0.7839557528495789, "learning_rate": 0.00019394360206446948, "loss": 0.8659, "step": 120 }, { "epoch": 0.029864461291063633, "grad_norm": 0.8548005819320679, "learning_rate": 0.00019034895390411186, "loss": 0.8915, "step": 130 }, { "epoch": 0.03216172754422238, "grad_norm": 0.7380218505859375, "learning_rate": 0.0001863287504206196, "loss": 0.8667, "step": 140 }, { "epoch": 0.03445899379738112, "grad_norm": 1.0800361633300781, "learning_rate": 0.00018190257764125471, "loss": 0.848, "step": 150 }, { "epoch": 0.03445899379738112, "eval_loss": 0.8501473665237427, "eval_runtime": 51.9217, "eval_samples_per_second": 35.303, "eval_steps_per_second": 8.84, "step": 150 }, { "epoch": 0.036756260050539856, "grad_norm": 0.5680871605873108, "learning_rate": 0.00017709199943488106, "loss": 0.7338, "step": 160 }, { "epoch": 0.0390535263036986, "grad_norm": 0.6372123956680298, "learning_rate": 0.00017192045245496238, "loss": 0.8033, "step": 170 }, { "epoch": 0.04135079255685734, "grad_norm": 0.8441762924194336, "learning_rate": 0.00016641313195854277, "loss": 0.9452, "step": 180 }, { "epoch": 0.04364805881001608, "grad_norm": 0.7282130718231201, "learning_rate": 0.0001605968690574869, "loss": 0.8036, "step": 190 }, { "epoch": 0.04594532506317482, "grad_norm": 1.0529526472091675, "learning_rate": 0.0001545, "loss": 0.8569, "step": 200 }, { "epoch": 0.04594532506317482, "eval_loss": 0.8399880528450012, "eval_runtime": 51.6878, "eval_samples_per_second": 35.463, "eval_steps_per_second": 8.88, "step": 200 }, { "epoch": 0.048242591316333565, "grad_norm": 0.5307555794715881, "learning_rate": 0.00014815222811927496, "loss": 0.747, "step": 210 }, { "epoch": 0.0505398575694923, "grad_norm": 0.7486038208007812, "learning_rate": 0.00014158447912183896, "loss": 0.8806, "step": 220 }, { "epoch": 0.052837123822651044, "grad_norm": 0.875223696231842, "learning_rate": 0.00013482875042061958, "loss": 0.9061, "step": 230 }, { "epoch": 0.05513439007580979, "grad_norm": 0.7279521226882935, "learning_rate": 0.00012791795524676576, "loss": 0.8071, "step": 240 }, { "epoch": 0.05743165632896853, "grad_norm": 1.203727126121521, "learning_rate": 0.00012088576229969385, "loss": 0.8783, "step": 250 }, { "epoch": 0.05743165632896853, "eval_loss": 0.8261856436729431, "eval_runtime": 51.6149, "eval_samples_per_second": 35.513, "eval_steps_per_second": 8.893, "step": 250 }, { "epoch": 0.05972892258212727, "grad_norm": 0.5175067782402039, "learning_rate": 0.0001137664317165683, "loss": 0.6935, "step": 260 }, { "epoch": 0.06202618883528601, "grad_norm": 0.6327574849128723, "learning_rate": 0.00010659464816035761, "loss": 0.7742, "step": 270 }, { "epoch": 0.06432345508844475, "grad_norm": 0.7658243775367737, "learning_rate": 9.940535183964242e-05, "loss": 0.8565, "step": 280 }, { "epoch": 0.06662072134160349, "grad_norm": 0.7925461530685425, "learning_rate": 9.22335682834317e-05, "loss": 0.835, "step": 290 }, { "epoch": 0.06891798759476224, "grad_norm": 0.9995759129524231, "learning_rate": 8.511423770030617e-05, "loss": 0.8554, "step": 300 }, { "epoch": 0.06891798759476224, "eval_loss": 0.817738950252533, "eval_runtime": 51.6421, "eval_samples_per_second": 35.494, "eval_steps_per_second": 8.888, "step": 300 }, { "epoch": 0.07121525384792098, "grad_norm": 0.5488688945770264, "learning_rate": 7.808204475323423e-05, "loss": 0.692, "step": 310 }, { "epoch": 0.07351252010107971, "grad_norm": 0.6316297054290771, "learning_rate": 7.117124957938042e-05, "loss": 0.8166, "step": 320 }, { "epoch": 0.07580978635423846, "grad_norm": 0.6976966857910156, "learning_rate": 6.441552087816105e-05, "loss": 0.9222, "step": 330 }, { "epoch": 0.0781070526073972, "grad_norm": 0.6356428265571594, "learning_rate": 5.784777188072502e-05, "loss": 0.852, "step": 340 }, { "epoch": 0.08040431886055593, "grad_norm": 0.9787359833717346, "learning_rate": 5.150000000000002e-05, "loss": 0.8186, "step": 350 }, { "epoch": 0.08040431886055593, "eval_loss": 0.8114416003227234, "eval_runtime": 51.6228, "eval_samples_per_second": 35.508, "eval_steps_per_second": 8.891, "step": 350 }, { "epoch": 0.08270158511371468, "grad_norm": 0.5780043601989746, "learning_rate": 4.540313094251309e-05, "loss": 0.6796, "step": 360 }, { "epoch": 0.08499885136687342, "grad_norm": 0.7033482193946838, "learning_rate": 3.958686804145719e-05, "loss": 0.8087, "step": 370 }, { "epoch": 0.08729611762003216, "grad_norm": 0.5974348783493042, "learning_rate": 3.4079547545037634e-05, "loss": 0.7923, "step": 380 }, { "epoch": 0.08959338387319091, "grad_norm": 0.8100835084915161, "learning_rate": 2.8908000565118947e-05, "loss": 0.8478, "step": 390 }, { "epoch": 0.09189065012634964, "grad_norm": 0.816436231136322, "learning_rate": 2.4097422358745275e-05, "loss": 0.8498, "step": 400 }, { "epoch": 0.09189065012634964, "eval_loss": 0.8052087426185608, "eval_runtime": 51.7512, "eval_samples_per_second": 35.419, "eval_steps_per_second": 8.869, "step": 400 }, { "epoch": 0.09418791637950838, "grad_norm": 0.5542159080505371, "learning_rate": 1.9671249579380422e-05, "loss": 0.7486, "step": 410 }, { "epoch": 0.09648518263266713, "grad_norm": 0.5713785886764526, "learning_rate": 1.5651046095888127e-05, "loss": 0.7955, "step": 420 }, { "epoch": 0.09878244888582587, "grad_norm": 0.7074142098426819, "learning_rate": 1.205639793553052e-05, "loss": 0.8298, "step": 430 }, { "epoch": 0.1010797151389846, "grad_norm": 0.6469036936759949, "learning_rate": 8.904817862812098e-06, "loss": 0.8189, "step": 440 }, { "epoch": 0.10337698139214335, "grad_norm": 0.9961839914321899, "learning_rate": 6.211660059051443e-06, "loss": 0.816, "step": 450 }, { "epoch": 0.10337698139214335, "eval_loss": 0.803685188293457, "eval_runtime": 52.0877, "eval_samples_per_second": 35.191, "eval_steps_per_second": 8.812, "step": 450 }, { "epoch": 0.10567424764530209, "grad_norm": 0.5176745057106018, "learning_rate": 3.990045318353154e-06, "loss": 0.7438, "step": 460 }, { "epoch": 0.10797151389846082, "grad_norm": 0.5562933087348938, "learning_rate": 2.250797124418014e-06, "loss": 0.8519, "step": 470 }, { "epoch": 0.11026878015161957, "grad_norm": 0.634845495223999, "learning_rate": 1.0023889196182526e-06, "loss": 0.8369, "step": 480 }, { "epoch": 0.11256604640477831, "grad_norm": 0.7085196375846863, "learning_rate": 2.5090282323810766e-07, "loss": 0.8191, "step": 490 }, { "epoch": 0.11486331265793706, "grad_norm": 1.0037809610366821, "learning_rate": 0.0, "loss": 0.8017, "step": 500 }, { "epoch": 0.11486331265793706, "eval_loss": 0.8025246858596802, "eval_runtime": 51.8757, "eval_samples_per_second": 35.334, "eval_steps_per_second": 8.848, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.39979133403136e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }