{ "best_metric": 10.305556297302246, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 3.0125786163522013, "eval_steps": 50, "global_step": 119, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025157232704402517, "grad_norm": 0.1715589165687561, "learning_rate": 1.0100000000000002e-05, "loss": 10.3802, "step": 1 }, { "epoch": 0.025157232704402517, "eval_loss": 10.382518768310547, "eval_runtime": 0.1746, "eval_samples_per_second": 3058.484, "eval_steps_per_second": 97.367, "step": 1 }, { "epoch": 0.050314465408805034, "grad_norm": 0.18782849609851837, "learning_rate": 2.0200000000000003e-05, "loss": 10.3817, "step": 2 }, { "epoch": 0.07547169811320754, "grad_norm": 0.22927959263324738, "learning_rate": 3.0299999999999998e-05, "loss": 10.379, "step": 3 }, { "epoch": 0.10062893081761007, "grad_norm": 0.338342547416687, "learning_rate": 4.0400000000000006e-05, "loss": 10.3812, "step": 4 }, { "epoch": 0.12578616352201258, "grad_norm": 0.34414851665496826, "learning_rate": 5.05e-05, "loss": 10.3802, "step": 5 }, { "epoch": 0.1509433962264151, "grad_norm": 0.46170228719711304, "learning_rate": 6.0599999999999996e-05, "loss": 10.3843, "step": 6 }, { "epoch": 0.1761006289308176, "grad_norm": 0.1982971578836441, "learning_rate": 7.07e-05, "loss": 10.3814, "step": 7 }, { "epoch": 0.20125786163522014, "grad_norm": 0.20252886414527893, "learning_rate": 8.080000000000001e-05, "loss": 10.3824, "step": 8 }, { "epoch": 0.22641509433962265, "grad_norm": 0.23484252393245697, "learning_rate": 9.09e-05, "loss": 10.3728, "step": 9 }, { "epoch": 0.25157232704402516, "grad_norm": 0.3196582496166229, "learning_rate": 0.000101, "loss": 10.3746, "step": 10 }, { "epoch": 0.27672955974842767, "grad_norm": 0.35150107741355896, "learning_rate": 0.00010007339449541285, "loss": 10.3727, "step": 11 }, { "epoch": 0.3018867924528302, "grad_norm": 0.4719482362270355, "learning_rate": 9.914678899082569e-05, "loss": 10.3699, "step": 12 }, { "epoch": 0.3270440251572327, "grad_norm": 0.2938939034938812, "learning_rate": 9.822018348623854e-05, "loss": 10.376, "step": 13 }, { "epoch": 0.3522012578616352, "grad_norm": 0.2226066291332245, "learning_rate": 9.729357798165138e-05, "loss": 10.3751, "step": 14 }, { "epoch": 0.37735849056603776, "grad_norm": 0.24701771140098572, "learning_rate": 9.636697247706423e-05, "loss": 10.3662, "step": 15 }, { "epoch": 0.4025157232704403, "grad_norm": 0.3606414794921875, "learning_rate": 9.544036697247707e-05, "loss": 10.3621, "step": 16 }, { "epoch": 0.4276729559748428, "grad_norm": 0.3855360448360443, "learning_rate": 9.451376146788992e-05, "loss": 10.3593, "step": 17 }, { "epoch": 0.4528301886792453, "grad_norm": 0.4999494254589081, "learning_rate": 9.358715596330275e-05, "loss": 10.3569, "step": 18 }, { "epoch": 0.4779874213836478, "grad_norm": 0.4514361619949341, "learning_rate": 9.266055045871561e-05, "loss": 10.3608, "step": 19 }, { "epoch": 0.5031446540880503, "grad_norm": 0.26520147919654846, "learning_rate": 9.173394495412844e-05, "loss": 10.3659, "step": 20 }, { "epoch": 0.5283018867924528, "grad_norm": 0.26291126012802124, "learning_rate": 9.08073394495413e-05, "loss": 10.3592, "step": 21 }, { "epoch": 0.5534591194968553, "grad_norm": 0.36332985758781433, "learning_rate": 8.988073394495413e-05, "loss": 10.3518, "step": 22 }, { "epoch": 0.5786163522012578, "grad_norm": 0.40622854232788086, "learning_rate": 8.895412844036697e-05, "loss": 10.344, "step": 23 }, { "epoch": 0.6037735849056604, "grad_norm": 0.5071566700935364, "learning_rate": 8.802752293577982e-05, "loss": 10.3411, "step": 24 }, { "epoch": 0.6289308176100629, "grad_norm": 0.6803290843963623, "learning_rate": 8.710091743119266e-05, "loss": 10.329, "step": 25 }, { "epoch": 0.6540880503144654, "grad_norm": 0.3055112063884735, "learning_rate": 8.617431192660551e-05, "loss": 10.3552, "step": 26 }, { "epoch": 0.6792452830188679, "grad_norm": 0.2761920392513275, "learning_rate": 8.524770642201835e-05, "loss": 10.3511, "step": 27 }, { "epoch": 0.7044025157232704, "grad_norm": 0.25242385268211365, "learning_rate": 8.43211009174312e-05, "loss": 10.3422, "step": 28 }, { "epoch": 0.7295597484276729, "grad_norm": 0.35044631361961365, "learning_rate": 8.339449541284404e-05, "loss": 10.3301, "step": 29 }, { "epoch": 0.7547169811320755, "grad_norm": 0.4619623124599457, "learning_rate": 8.246788990825688e-05, "loss": 10.3278, "step": 30 }, { "epoch": 0.779874213836478, "grad_norm": 0.5502440333366394, "learning_rate": 8.154128440366973e-05, "loss": 10.3209, "step": 31 }, { "epoch": 0.8050314465408805, "grad_norm": 0.31955239176750183, "learning_rate": 8.061467889908257e-05, "loss": 10.3387, "step": 32 }, { "epoch": 0.8301886792452831, "grad_norm": 0.26196402311325073, "learning_rate": 7.968807339449542e-05, "loss": 10.3445, "step": 33 }, { "epoch": 0.8553459119496856, "grad_norm": 0.17065182328224182, "learning_rate": 7.876146788990826e-05, "loss": 10.337, "step": 34 }, { "epoch": 0.8805031446540881, "grad_norm": 0.28439250588417053, "learning_rate": 7.78348623853211e-05, "loss": 10.3194, "step": 35 }, { "epoch": 0.9056603773584906, "grad_norm": 0.3914329409599304, "learning_rate": 7.690825688073395e-05, "loss": 10.3172, "step": 36 }, { "epoch": 0.9308176100628931, "grad_norm": 0.4504752457141876, "learning_rate": 7.598165137614679e-05, "loss": 10.308, "step": 37 }, { "epoch": 0.9559748427672956, "grad_norm": 0.2524828612804413, "learning_rate": 7.505504587155964e-05, "loss": 10.3287, "step": 38 }, { "epoch": 0.9811320754716981, "grad_norm": 0.17636598646640778, "learning_rate": 7.412844036697248e-05, "loss": 10.3251, "step": 39 }, { "epoch": 1.0125786163522013, "grad_norm": 0.4300397038459778, "learning_rate": 7.320183486238533e-05, "loss": 16.2943, "step": 40 }, { "epoch": 1.0377358490566038, "grad_norm": 0.19085323810577393, "learning_rate": 7.227522935779817e-05, "loss": 9.4535, "step": 41 }, { "epoch": 1.0628930817610063, "grad_norm": 0.1255641132593155, "learning_rate": 7.134862385321102e-05, "loss": 10.7832, "step": 42 }, { "epoch": 1.0880503144654088, "grad_norm": 0.1516360491514206, "learning_rate": 7.042201834862386e-05, "loss": 10.5093, "step": 43 }, { "epoch": 1.1132075471698113, "grad_norm": 0.2405787855386734, "learning_rate": 6.949541284403669e-05, "loss": 10.0203, "step": 44 }, { "epoch": 1.1383647798742138, "grad_norm": 0.3532116115093231, "learning_rate": 6.856880733944955e-05, "loss": 10.5763, "step": 45 }, { "epoch": 1.1635220125786163, "grad_norm": 0.2831573188304901, "learning_rate": 6.764220183486238e-05, "loss": 9.7205, "step": 46 }, { "epoch": 1.1886792452830188, "grad_norm": 0.18701517581939697, "learning_rate": 6.671559633027524e-05, "loss": 10.387, "step": 47 }, { "epoch": 1.2138364779874213, "grad_norm": 0.12146440893411636, "learning_rate": 6.578899082568807e-05, "loss": 10.4843, "step": 48 }, { "epoch": 1.2389937106918238, "grad_norm": 0.12593397498130798, "learning_rate": 6.486238532110092e-05, "loss": 10.5579, "step": 49 }, { "epoch": 1.2641509433962264, "grad_norm": 0.1917344033718109, "learning_rate": 6.393577981651376e-05, "loss": 9.7609, "step": 50 }, { "epoch": 1.2641509433962264, "eval_loss": 10.313277244567871, "eval_runtime": 0.1754, "eval_samples_per_second": 3044.681, "eval_steps_per_second": 96.928, "step": 50 }, { "epoch": 1.2893081761006289, "grad_norm": 0.34799277782440186, "learning_rate": 6.300917431192661e-05, "loss": 11.1836, "step": 51 }, { "epoch": 1.3144654088050314, "grad_norm": 0.31355059146881104, "learning_rate": 6.208256880733945e-05, "loss": 6.6889, "step": 52 }, { "epoch": 1.3396226415094339, "grad_norm": 0.23540031909942627, "learning_rate": 6.11559633027523e-05, "loss": 13.2285, "step": 53 }, { "epoch": 1.3647798742138364, "grad_norm": 0.14588120579719543, "learning_rate": 6.0229357798165135e-05, "loss": 10.7637, "step": 54 }, { "epoch": 1.389937106918239, "grad_norm": 0.12895844876766205, "learning_rate": 5.930275229357799e-05, "loss": 10.2831, "step": 55 }, { "epoch": 1.4150943396226414, "grad_norm": 0.16842244565486908, "learning_rate": 5.8376146788990825e-05, "loss": 9.4583, "step": 56 }, { "epoch": 1.440251572327044, "grad_norm": 0.380400151014328, "learning_rate": 5.744954128440368e-05, "loss": 11.541, "step": 57 }, { "epoch": 1.4654088050314464, "grad_norm": 0.2583652436733246, "learning_rate": 5.6522935779816515e-05, "loss": 7.7146, "step": 58 }, { "epoch": 1.490566037735849, "grad_norm": 0.19550327956676483, "learning_rate": 5.559633027522936e-05, "loss": 12.1234, "step": 59 }, { "epoch": 1.5157232704402515, "grad_norm": 0.14996886253356934, "learning_rate": 5.46697247706422e-05, "loss": 10.5247, "step": 60 }, { "epoch": 1.540880503144654, "grad_norm": 0.13765446841716766, "learning_rate": 5.374311926605505e-05, "loss": 10.2347, "step": 61 }, { "epoch": 1.5660377358490565, "grad_norm": 0.15372247993946075, "learning_rate": 5.281651376146789e-05, "loss": 10.319, "step": 62 }, { "epoch": 1.591194968553459, "grad_norm": 0.2650212049484253, "learning_rate": 5.188990825688074e-05, "loss": 10.4112, "step": 63 }, { "epoch": 1.6163522012578615, "grad_norm": 0.2876375913619995, "learning_rate": 5.096330275229358e-05, "loss": 10.5638, "step": 64 }, { "epoch": 1.641509433962264, "grad_norm": 0.21278758347034454, "learning_rate": 5.0036697247706424e-05, "loss": 10.398, "step": 65 }, { "epoch": 1.6666666666666665, "grad_norm": 0.1513729989528656, "learning_rate": 4.911009174311927e-05, "loss": 9.6802, "step": 66 }, { "epoch": 1.691823899371069, "grad_norm": 0.16482418775558472, "learning_rate": 4.8183486238532114e-05, "loss": 10.7964, "step": 67 }, { "epoch": 1.7169811320754715, "grad_norm": 0.16112814843654633, "learning_rate": 4.725688073394496e-05, "loss": 10.3939, "step": 68 }, { "epoch": 1.742138364779874, "grad_norm": 0.175452321767807, "learning_rate": 4.6330275229357804e-05, "loss": 10.1161, "step": 69 }, { "epoch": 1.7672955974842768, "grad_norm": 0.26655417680740356, "learning_rate": 4.540366972477065e-05, "loss": 10.3799, "step": 70 }, { "epoch": 1.7924528301886793, "grad_norm": 0.22874386608600616, "learning_rate": 4.447706422018349e-05, "loss": 10.1276, "step": 71 }, { "epoch": 1.8176100628930818, "grad_norm": 0.18383730947971344, "learning_rate": 4.355045871559633e-05, "loss": 9.9519, "step": 72 }, { "epoch": 1.8427672955974843, "grad_norm": 0.13625063002109528, "learning_rate": 4.262385321100918e-05, "loss": 10.7582, "step": 73 }, { "epoch": 1.8679245283018868, "grad_norm": 0.13668720424175262, "learning_rate": 4.169724770642202e-05, "loss": 10.263, "step": 74 }, { "epoch": 1.8930817610062893, "grad_norm": 0.14149360358715057, "learning_rate": 4.077064220183487e-05, "loss": 9.9526, "step": 75 }, { "epoch": 1.9182389937106918, "grad_norm": 0.31288641691207886, "learning_rate": 3.984403669724771e-05, "loss": 11.2683, "step": 76 }, { "epoch": 1.9433962264150944, "grad_norm": 0.2737663984298706, "learning_rate": 3.891743119266055e-05, "loss": 6.941, "step": 77 }, { "epoch": 1.9685534591194969, "grad_norm": 0.17450223863124847, "learning_rate": 3.7990825688073395e-05, "loss": 13.6963, "step": 78 }, { "epoch": 1.9937106918238994, "grad_norm": 0.256649374961853, "learning_rate": 3.706422018348624e-05, "loss": 14.5839, "step": 79 }, { "epoch": 2.0251572327044025, "grad_norm": 0.1788261979818344, "learning_rate": 3.6137614678899085e-05, "loss": 10.3272, "step": 80 }, { "epoch": 2.050314465408805, "grad_norm": 0.15341153740882874, "learning_rate": 3.521100917431193e-05, "loss": 10.331, "step": 81 }, { "epoch": 2.0754716981132075, "grad_norm": 0.16519047319889069, "learning_rate": 3.4284403669724775e-05, "loss": 10.3231, "step": 82 }, { "epoch": 2.10062893081761, "grad_norm": 0.17606724798679352, "learning_rate": 3.335779816513762e-05, "loss": 10.3009, "step": 83 }, { "epoch": 2.1257861635220126, "grad_norm": 0.29273226857185364, "learning_rate": 3.243119266055046e-05, "loss": 10.2937, "step": 84 }, { "epoch": 2.150943396226415, "grad_norm": 0.33806756138801575, "learning_rate": 3.1504587155963303e-05, "loss": 10.2861, "step": 85 }, { "epoch": 2.1761006289308176, "grad_norm": 0.17116469144821167, "learning_rate": 3.057798165137615e-05, "loss": 10.322, "step": 86 }, { "epoch": 2.20125786163522, "grad_norm": 0.15034303069114685, "learning_rate": 2.9651376146788993e-05, "loss": 10.3318, "step": 87 }, { "epoch": 2.2264150943396226, "grad_norm": 0.15952639281749725, "learning_rate": 2.872477064220184e-05, "loss": 10.3268, "step": 88 }, { "epoch": 2.251572327044025, "grad_norm": 0.16502609848976135, "learning_rate": 2.779816513761468e-05, "loss": 10.3057, "step": 89 }, { "epoch": 2.2767295597484276, "grad_norm": 0.2700464427471161, "learning_rate": 2.6871559633027525e-05, "loss": 10.2954, "step": 90 }, { "epoch": 2.30188679245283, "grad_norm": 0.2575024366378784, "learning_rate": 2.594495412844037e-05, "loss": 10.2855, "step": 91 }, { "epoch": 2.3270440251572326, "grad_norm": 0.15534816682338715, "learning_rate": 2.5018348623853212e-05, "loss": 10.3186, "step": 92 }, { "epoch": 2.352201257861635, "grad_norm": 0.15319371223449707, "learning_rate": 2.4091743119266057e-05, "loss": 10.3259, "step": 93 }, { "epoch": 2.3773584905660377, "grad_norm": 0.13836082816123962, "learning_rate": 2.3165137614678902e-05, "loss": 10.3322, "step": 94 }, { "epoch": 2.40251572327044, "grad_norm": 0.14459159970283508, "learning_rate": 2.2238532110091743e-05, "loss": 10.3131, "step": 95 }, { "epoch": 2.4276729559748427, "grad_norm": 0.15748228132724762, "learning_rate": 2.131192660550459e-05, "loss": 10.2985, "step": 96 }, { "epoch": 2.452830188679245, "grad_norm": 0.247804194688797, "learning_rate": 2.0385321100917433e-05, "loss": 10.2892, "step": 97 }, { "epoch": 2.4779874213836477, "grad_norm": 0.17554815113544464, "learning_rate": 1.9458715596330275e-05, "loss": 10.3041, "step": 98 }, { "epoch": 2.50314465408805, "grad_norm": 0.1803831309080124, "learning_rate": 1.853211009174312e-05, "loss": 10.3223, "step": 99 }, { "epoch": 2.5283018867924527, "grad_norm": 0.1464659571647644, "learning_rate": 1.7605504587155965e-05, "loss": 10.3293, "step": 100 }, { "epoch": 2.5283018867924527, "eval_loss": 10.305556297302246, "eval_runtime": 0.1774, "eval_samples_per_second": 3010.845, "eval_steps_per_second": 95.851, "step": 100 }, { "epoch": 2.5534591194968552, "grad_norm": 0.1466299146413803, "learning_rate": 1.667889908256881e-05, "loss": 10.3158, "step": 101 }, { "epoch": 2.5786163522012577, "grad_norm": 0.17850784957408905, "learning_rate": 1.5752293577981652e-05, "loss": 10.2993, "step": 102 }, { "epoch": 2.6037735849056602, "grad_norm": 0.2629879117012024, "learning_rate": 1.4825688073394497e-05, "loss": 10.2901, "step": 103 }, { "epoch": 2.6289308176100628, "grad_norm": 0.3722081184387207, "learning_rate": 1.389908256880734e-05, "loss": 10.2754, "step": 104 }, { "epoch": 2.6540880503144653, "grad_norm": 0.17005372047424316, "learning_rate": 1.2972477064220185e-05, "loss": 10.3255, "step": 105 }, { "epoch": 2.6792452830188678, "grad_norm": 0.14880169928073883, "learning_rate": 1.2045871559633028e-05, "loss": 10.3339, "step": 106 }, { "epoch": 2.7044025157232703, "grad_norm": 0.18930739164352417, "learning_rate": 1.1119266055045872e-05, "loss": 10.3178, "step": 107 }, { "epoch": 2.729559748427673, "grad_norm": 0.18744437396526337, "learning_rate": 1.0192660550458717e-05, "loss": 10.2987, "step": 108 }, { "epoch": 2.7547169811320753, "grad_norm": 0.27432578802108765, "learning_rate": 9.26605504587156e-06, "loss": 10.2927, "step": 109 }, { "epoch": 2.779874213836478, "grad_norm": 0.31490325927734375, "learning_rate": 8.339449541284405e-06, "loss": 10.2855, "step": 110 }, { "epoch": 2.8050314465408803, "grad_norm": 0.17064593732357025, "learning_rate": 7.412844036697248e-06, "loss": 10.3155, "step": 111 }, { "epoch": 2.830188679245283, "grad_norm": 0.15425540506839752, "learning_rate": 6.4862385321100925e-06, "loss": 10.3282, "step": 112 }, { "epoch": 2.8553459119496853, "grad_norm": 0.14992254972457886, "learning_rate": 5.559633027522936e-06, "loss": 10.3263, "step": 113 }, { "epoch": 2.880503144654088, "grad_norm": 0.1558142602443695, "learning_rate": 4.63302752293578e-06, "loss": 10.3093, "step": 114 }, { "epoch": 2.9056603773584904, "grad_norm": 0.2051534205675125, "learning_rate": 3.706422018348624e-06, "loss": 10.2948, "step": 115 }, { "epoch": 2.930817610062893, "grad_norm": 0.2544957101345062, "learning_rate": 2.779816513761468e-06, "loss": 10.2848, "step": 116 }, { "epoch": 2.9559748427672954, "grad_norm": 0.13796408474445343, "learning_rate": 1.853211009174312e-06, "loss": 10.3145, "step": 117 }, { "epoch": 2.981132075471698, "grad_norm": 0.12653085589408875, "learning_rate": 9.26605504587156e-07, "loss": 10.3105, "step": 118 }, { "epoch": 3.0125786163522013, "grad_norm": 0.185125932097435, "learning_rate": 0.0, "loss": 16.0677, "step": 119 } ], "logging_steps": 1, "max_steps": 119, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 206642570330112.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }