{ "best_metric": 0.9760225669957687, "best_model_checkpoint": "melanoma-v3\\checkpoint-12523", "epoch": 7.0, "eval_steps": 500, "global_step": 12523, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027948574622694244, "grad_norm": 9.020837783813477, "learning_rate": 1.8355945730247407e-05, "loss": 0.6846, "step": 50 }, { "epoch": 0.05589714924538849, "grad_norm": 15.282539367675781, "learning_rate": 3.790901835594573e-05, "loss": 0.6027, "step": 100 }, { "epoch": 0.08384572386808273, "grad_norm": 17.592166900634766, "learning_rate": 5.78611332801277e-05, "loss": 0.4504, "step": 150 }, { "epoch": 0.11179429849077697, "grad_norm": 11.997602462768555, "learning_rate": 7.781324820430965e-05, "loss": 0.3523, "step": 200 }, { "epoch": 0.13974287311347122, "grad_norm": 16.85348129272461, "learning_rate": 9.776536312849161e-05, "loss": 0.2612, "step": 250 }, { "epoch": 0.16769144773616546, "grad_norm": 5.394933700561523, "learning_rate": 0.00011771747805267359, "loss": 0.2493, "step": 300 }, { "epoch": 0.1956400223588597, "grad_norm": 15.290493965148926, "learning_rate": 0.00013766959297685554, "loss": 0.2092, "step": 350 }, { "epoch": 0.22358859698155395, "grad_norm": 15.310522079467773, "learning_rate": 0.00015762170790103753, "loss": 0.1995, "step": 400 }, { "epoch": 0.2515371716042482, "grad_norm": 20.243497848510742, "learning_rate": 0.00017757382282521946, "loss": 0.2205, "step": 450 }, { "epoch": 0.27948574622694244, "grad_norm": 5.062504291534424, "learning_rate": 0.00019752593774940145, "loss": 0.2124, "step": 500 }, { "epoch": 0.3074343208496367, "grad_norm": 5.295910358428955, "learning_rate": 0.00021747805267358338, "loss": 0.2178, "step": 550 }, { "epoch": 0.3353828954723309, "grad_norm": 1.7261399030685425, "learning_rate": 0.00023743016759776537, "loss": 0.1946, "step": 600 }, { "epoch": 0.36333147009502514, "grad_norm": 5.8339338302612305, "learning_rate": 0.00025738228252194736, "loss": 0.2097, "step": 650 }, { "epoch": 0.3912800447177194, "grad_norm": 2.142676830291748, "learning_rate": 0.0002773343974461293, "loss": 0.2165, "step": 700 }, { "epoch": 0.41922861934041367, "grad_norm": 2.970517873764038, "learning_rate": 0.00029728651237031123, "loss": 0.2051, "step": 750 }, { "epoch": 0.4471771939631079, "grad_norm": 4.551942348480225, "learning_rate": 0.0003172386272944932, "loss": 0.2288, "step": 800 }, { "epoch": 0.47512576858580213, "grad_norm": 5.390867710113525, "learning_rate": 0.0003371907422186752, "loss": 0.2091, "step": 850 }, { "epoch": 0.5030743432084964, "grad_norm": 1.3094195127487183, "learning_rate": 0.00035714285714285714, "loss": 0.2302, "step": 900 }, { "epoch": 0.5310229178311906, "grad_norm": 2.8628153800964355, "learning_rate": 0.00037709497206703913, "loss": 0.2244, "step": 950 }, { "epoch": 0.5589714924538849, "grad_norm": 2.0258989334106445, "learning_rate": 0.00039704708699122107, "loss": 0.1956, "step": 1000 }, { "epoch": 0.5869200670765791, "grad_norm": 2.2154078483581543, "learning_rate": 0.000416999201915403, "loss": 0.2345, "step": 1050 }, { "epoch": 0.6148686416992734, "grad_norm": 1.6475298404693604, "learning_rate": 0.000436951316839585, "loss": 0.2137, "step": 1100 }, { "epoch": 0.6428172163219675, "grad_norm": 2.1503453254699707, "learning_rate": 0.000456903431763767, "loss": 0.2324, "step": 1150 }, { "epoch": 0.6707657909446618, "grad_norm": 1.5340570211410522, "learning_rate": 0.00047685554668794897, "loss": 0.2288, "step": 1200 }, { "epoch": 0.6987143655673561, 
"grad_norm": 2.333324432373047, "learning_rate": 0.0004968076616121309, "loss": 0.2061, "step": 1250 }, { "epoch": 0.7266629401900503, "grad_norm": 1.712575912475586, "learning_rate": 0.000498136645962733, "loss": 0.2189, "step": 1300 }, { "epoch": 0.7546115148127446, "grad_norm": 1.3168742656707764, "learning_rate": 0.0004959183673469388, "loss": 0.1978, "step": 1350 }, { "epoch": 0.7825600894354388, "grad_norm": 4.792356967926025, "learning_rate": 0.0004937000887311446, "loss": 0.2333, "step": 1400 }, { "epoch": 0.810508664058133, "grad_norm": 1.0282349586486816, "learning_rate": 0.0004914818101153504, "loss": 0.23, "step": 1450 }, { "epoch": 0.8384572386808273, "grad_norm": 0.6257219910621643, "learning_rate": 0.0004892635314995564, "loss": 0.2082, "step": 1500 }, { "epoch": 0.8664058133035215, "grad_norm": 1.1651033163070679, "learning_rate": 0.00048704525288376224, "loss": 0.1976, "step": 1550 }, { "epoch": 0.8943543879262158, "grad_norm": 1.0239338874816895, "learning_rate": 0.00048482697426796806, "loss": 0.1945, "step": 1600 }, { "epoch": 0.92230296254891, "grad_norm": 1.616534948348999, "learning_rate": 0.0004826086956521739, "loss": 0.1801, "step": 1650 }, { "epoch": 0.9502515371716043, "grad_norm": 1.4642572402954102, "learning_rate": 0.00048039041703637974, "loss": 0.1888, "step": 1700 }, { "epoch": 0.9782001117942984, "grad_norm": 5.763888359069824, "learning_rate": 0.00047817213842058566, "loss": 0.1798, "step": 1750 }, { "epoch": 1.0, "eval_f1": 0.8859060402684564, "eval_loss": 0.2657495439052582, "eval_runtime": 17.3445, "eval_samples_per_second": 211.017, "eval_steps_per_second": 6.63, "step": 1789 }, { "epoch": 1.0061486864169928, "grad_norm": 0.4531048536300659, "learning_rate": 0.0004759538598047915, "loss": 0.1844, "step": 1800 }, { "epoch": 1.034097261039687, "grad_norm": 1.737192153930664, "learning_rate": 0.00047373558118899735, "loss": 0.2026, "step": 1850 }, { "epoch": 1.0620458356623812, "grad_norm": 1.7833003997802734, "learning_rate": 0.00047151730257320317, "loss": 0.1724, "step": 1900 }, { "epoch": 1.0899944102850754, "grad_norm": 1.5082063674926758, "learning_rate": 0.0004692990239574091, "loss": 0.1891, "step": 1950 }, { "epoch": 1.1179429849077698, "grad_norm": 1.3961833715438843, "learning_rate": 0.00046708074534161496, "loss": 0.1737, "step": 2000 }, { "epoch": 1.145891559530464, "grad_norm": 0.9128915667533875, "learning_rate": 0.0004648624667258208, "loss": 0.1584, "step": 2050 }, { "epoch": 1.1738401341531581, "grad_norm": 1.8712306022644043, "learning_rate": 0.00046264418811002664, "loss": 0.1765, "step": 2100 }, { "epoch": 1.2017887087758523, "grad_norm": 1.0604172945022583, "learning_rate": 0.00046042590949423246, "loss": 0.2011, "step": 2150 }, { "epoch": 1.2297372833985467, "grad_norm": 2.5036442279815674, "learning_rate": 0.0004582076308784384, "loss": 0.1873, "step": 2200 }, { "epoch": 1.2576858580212409, "grad_norm": 2.0800774097442627, "learning_rate": 0.0004559893522626442, "loss": 0.1795, "step": 2250 }, { "epoch": 1.2856344326439353, "grad_norm": 1.3409812450408936, "learning_rate": 0.00045377107364685007, "loss": 0.1708, "step": 2300 }, { "epoch": 1.3135830072666295, "grad_norm": 1.0831228494644165, "learning_rate": 0.0004515527950310559, "loss": 0.1738, "step": 2350 }, { "epoch": 1.3415315818893236, "grad_norm": 0.5806594491004944, "learning_rate": 0.00044933451641526175, "loss": 0.1659, "step": 2400 }, { "epoch": 1.3694801565120178, "grad_norm": 0.9568463563919067, "learning_rate": 0.0004471162377994676, "loss": 0.1621, "step": 2450 }, 
{ "epoch": 1.397428731134712, "grad_norm": 0.8637075424194336, "learning_rate": 0.0004448979591836735, "loss": 0.2007, "step": 2500 }, { "epoch": 1.4253773057574064, "grad_norm": 1.1704365015029907, "learning_rate": 0.0004426796805678793, "loss": 0.1556, "step": 2550 }, { "epoch": 1.4533258803801006, "grad_norm": 0.8730977773666382, "learning_rate": 0.0004404614019520852, "loss": 0.1741, "step": 2600 }, { "epoch": 1.481274455002795, "grad_norm": 0.7711090445518494, "learning_rate": 0.00043824312333629105, "loss": 0.1628, "step": 2650 }, { "epoch": 1.5092230296254892, "grad_norm": 0.6016321182250977, "learning_rate": 0.0004360248447204969, "loss": 0.1393, "step": 2700 }, { "epoch": 1.5371716042481833, "grad_norm": 0.9047902822494507, "learning_rate": 0.00043380656610470273, "loss": 0.1607, "step": 2750 }, { "epoch": 1.5651201788708775, "grad_norm": 1.0270776748657227, "learning_rate": 0.0004315882874889086, "loss": 0.1736, "step": 2800 }, { "epoch": 1.5930687534935717, "grad_norm": 0.7460580468177795, "learning_rate": 0.0004293700088731145, "loss": 0.1627, "step": 2850 }, { "epoch": 1.621017328116266, "grad_norm": 0.8114007115364075, "learning_rate": 0.00042715173025732034, "loss": 0.1756, "step": 2900 }, { "epoch": 1.6489659027389603, "grad_norm": 0.7761090397834778, "learning_rate": 0.0004249334516415262, "loss": 0.1491, "step": 2950 }, { "epoch": 1.6769144773616547, "grad_norm": 0.7432196140289307, "learning_rate": 0.00042271517302573203, "loss": 0.1663, "step": 3000 }, { "epoch": 1.7048630519843488, "grad_norm": 0.5656224489212036, "learning_rate": 0.0004204968944099379, "loss": 0.1808, "step": 3050 }, { "epoch": 1.732811626607043, "grad_norm": 1.192177653312683, "learning_rate": 0.00041827861579414377, "loss": 0.1605, "step": 3100 }, { "epoch": 1.7607602012297372, "grad_norm": 0.8815297484397888, "learning_rate": 0.00041606033717834964, "loss": 0.1595, "step": 3150 }, { "epoch": 1.7887087758524314, "grad_norm": 0.7288667559623718, "learning_rate": 0.00041384205856255545, "loss": 0.1529, "step": 3200 }, { "epoch": 1.8166573504751258, "grad_norm": 0.41470789909362793, "learning_rate": 0.0004116237799467613, "loss": 0.1661, "step": 3250 }, { "epoch": 1.8446059250978202, "grad_norm": 0.5929239392280579, "learning_rate": 0.00040940550133096714, "loss": 0.1658, "step": 3300 }, { "epoch": 1.8725544997205144, "grad_norm": 0.5527342557907104, "learning_rate": 0.00040718722271517306, "loss": 0.1686, "step": 3350 }, { "epoch": 1.9005030743432085, "grad_norm": 0.7223379611968994, "learning_rate": 0.0004049689440993789, "loss": 0.1616, "step": 3400 }, { "epoch": 1.9284516489659027, "grad_norm": 1.6212414503097534, "learning_rate": 0.00040275066548358475, "loss": 0.1455, "step": 3450 }, { "epoch": 1.9564002235885969, "grad_norm": 0.5442690849304199, "learning_rate": 0.00040053238686779056, "loss": 0.1666, "step": 3500 }, { "epoch": 1.984348798211291, "grad_norm": 0.4307806193828583, "learning_rate": 0.00039831410825199643, "loss": 0.1634, "step": 3550 }, { "epoch": 2.0, "eval_f1": 0.8839446782922429, "eval_loss": 0.26385268568992615, "eval_runtime": 15.9608, "eval_samples_per_second": 229.312, "eval_steps_per_second": 7.205, "step": 3578 }, { "epoch": 2.0122973728339857, "grad_norm": 1.0025759935379028, "learning_rate": 0.00039609582963620236, "loss": 0.16, "step": 3600 }, { "epoch": 2.04024594745668, "grad_norm": 0.7586847543716431, "learning_rate": 0.00039387755102040817, "loss": 0.18, "step": 3650 }, { "epoch": 2.068194522079374, "grad_norm": 1.3456194400787354, "learning_rate": 
0.00039165927240461404, "loss": 0.1344, "step": 3700 }, { "epoch": 2.096143096702068, "grad_norm": 0.5425747632980347, "learning_rate": 0.00038944099378881986, "loss": 0.1386, "step": 3750 }, { "epoch": 2.1240916713247624, "grad_norm": 0.8643823266029358, "learning_rate": 0.0003872227151730258, "loss": 0.1609, "step": 3800 }, { "epoch": 2.1520402459474566, "grad_norm": 0.6121143698692322, "learning_rate": 0.0003850044365572316, "loss": 0.1433, "step": 3850 }, { "epoch": 2.1799888205701508, "grad_norm": 1.2089327573776245, "learning_rate": 0.00038278615794143747, "loss": 0.1635, "step": 3900 }, { "epoch": 2.207937395192845, "grad_norm": 0.845815122127533, "learning_rate": 0.0003805678793256433, "loss": 0.1633, "step": 3950 }, { "epoch": 2.2358859698155396, "grad_norm": 0.8113688230514526, "learning_rate": 0.00037834960070984915, "loss": 0.1497, "step": 4000 }, { "epoch": 2.2638345444382337, "grad_norm": 1.681497573852539, "learning_rate": 0.000376131322094055, "loss": 0.1402, "step": 4050 }, { "epoch": 2.291783119060928, "grad_norm": 0.9330260157585144, "learning_rate": 0.0003739130434782609, "loss": 0.1561, "step": 4100 }, { "epoch": 2.319731693683622, "grad_norm": 1.3728777170181274, "learning_rate": 0.0003716947648624667, "loss": 0.1228, "step": 4150 }, { "epoch": 2.3476802683063163, "grad_norm": 1.523483395576477, "learning_rate": 0.0003694764862466726, "loss": 0.1327, "step": 4200 }, { "epoch": 2.3756288429290104, "grad_norm": 0.6407974362373352, "learning_rate": 0.00036725820763087845, "loss": 0.1499, "step": 4250 }, { "epoch": 2.4035774175517046, "grad_norm": 0.5848202109336853, "learning_rate": 0.0003650399290150843, "loss": 0.1082, "step": 4300 }, { "epoch": 2.4315259921743992, "grad_norm": 0.21722891926765442, "learning_rate": 0.0003628216503992902, "loss": 0.1498, "step": 4350 }, { "epoch": 2.4594745667970934, "grad_norm": 0.7241097092628479, "learning_rate": 0.000360603371783496, "loss": 0.1354, "step": 4400 }, { "epoch": 2.4874231414197876, "grad_norm": 1.2298479080200195, "learning_rate": 0.00035838509316770187, "loss": 0.1409, "step": 4450 }, { "epoch": 2.5153717160424818, "grad_norm": 0.6272071003913879, "learning_rate": 0.00035616681455190774, "loss": 0.1358, "step": 4500 }, { "epoch": 2.543320290665176, "grad_norm": 0.9323145151138306, "learning_rate": 0.0003539485359361136, "loss": 0.1316, "step": 4550 }, { "epoch": 2.5712688652878706, "grad_norm": 0.44958174228668213, "learning_rate": 0.0003517302573203194, "loss": 0.1283, "step": 4600 }, { "epoch": 2.5992174399105643, "grad_norm": 0.30582261085510254, "learning_rate": 0.0003495119787045253, "loss": 0.1225, "step": 4650 }, { "epoch": 2.627166014533259, "grad_norm": 0.9854662418365479, "learning_rate": 0.00034729370008873117, "loss": 0.1527, "step": 4700 }, { "epoch": 2.655114589155953, "grad_norm": 0.3222338557243347, "learning_rate": 0.00034507542147293703, "loss": 0.1485, "step": 4750 }, { "epoch": 2.6830631637786473, "grad_norm": 0.7984679341316223, "learning_rate": 0.00034285714285714285, "loss": 0.15, "step": 4800 }, { "epoch": 2.7110117384013415, "grad_norm": 0.27581000328063965, "learning_rate": 0.0003406388642413487, "loss": 0.1423, "step": 4850 }, { "epoch": 2.7389603130240356, "grad_norm": 0.36392349004745483, "learning_rate": 0.00033842058562555454, "loss": 0.12, "step": 4900 }, { "epoch": 2.7669088876467303, "grad_norm": 0.8806673884391785, "learning_rate": 0.00033620230700976046, "loss": 0.1353, "step": 4950 }, { "epoch": 2.794857462269424, "grad_norm": 0.4529147446155548, "learning_rate": 
0.0003339840283939663, "loss": 0.1524, "step": 5000 }, { "epoch": 2.8228060368921186, "grad_norm": 0.7661668658256531, "learning_rate": 0.00033176574977817214, "loss": 0.1431, "step": 5050 }, { "epoch": 2.850754611514813, "grad_norm": 0.7224089503288269, "learning_rate": 0.000329547471162378, "loss": 0.1328, "step": 5100 }, { "epoch": 2.878703186137507, "grad_norm": 0.7597375512123108, "learning_rate": 0.00032732919254658383, "loss": 0.1322, "step": 5150 }, { "epoch": 2.906651760760201, "grad_norm": 1.273445725440979, "learning_rate": 0.00032511091393078975, "loss": 0.1234, "step": 5200 }, { "epoch": 2.9346003353828953, "grad_norm": 1.428731918334961, "learning_rate": 0.00032289263531499557, "loss": 0.1476, "step": 5250 }, { "epoch": 2.96254891000559, "grad_norm": 0.4086410701274872, "learning_rate": 0.00032067435669920144, "loss": 0.1288, "step": 5300 }, { "epoch": 2.9904974846282837, "grad_norm": 1.4084666967391968, "learning_rate": 0.00031845607808340725, "loss": 0.1403, "step": 5350 }, { "epoch": 3.0, "eval_f1": 0.922771152754117, "eval_loss": 0.1881607472896576, "eval_runtime": 15.5665, "eval_samples_per_second": 235.12, "eval_steps_per_second": 7.388, "step": 5367 }, { "epoch": 3.0184460592509783, "grad_norm": 1.1583423614501953, "learning_rate": 0.0003162377994676132, "loss": 0.1348, "step": 5400 }, { "epoch": 3.0463946338736725, "grad_norm": 0.5038284063339233, "learning_rate": 0.000314019520851819, "loss": 0.1332, "step": 5450 }, { "epoch": 3.0743432084963667, "grad_norm": 1.139427900314331, "learning_rate": 0.00031180124223602486, "loss": 0.1071, "step": 5500 }, { "epoch": 3.102291783119061, "grad_norm": 0.632847011089325, "learning_rate": 0.0003095829636202307, "loss": 0.129, "step": 5550 }, { "epoch": 3.130240357741755, "grad_norm": 0.9477543830871582, "learning_rate": 0.00030736468500443655, "loss": 0.1296, "step": 5600 }, { "epoch": 3.1581889323644496, "grad_norm": 0.7813945412635803, "learning_rate": 0.0003051464063886424, "loss": 0.1356, "step": 5650 }, { "epoch": 3.186137506987144, "grad_norm": 1.0547813177108765, "learning_rate": 0.0003029281277728483, "loss": 0.1223, "step": 5700 }, { "epoch": 3.214086081609838, "grad_norm": 0.8783809542655945, "learning_rate": 0.0003007098491570541, "loss": 0.1086, "step": 5750 }, { "epoch": 3.242034656232532, "grad_norm": 1.7557278871536255, "learning_rate": 0.00029849157054126, "loss": 0.1176, "step": 5800 }, { "epoch": 3.2699832308552264, "grad_norm": 0.8250858783721924, "learning_rate": 0.0002962732919254659, "loss": 0.1244, "step": 5850 }, { "epoch": 3.2979318054779205, "grad_norm": 0.5830790996551514, "learning_rate": 0.0002940550133096717, "loss": 0.1255, "step": 5900 }, { "epoch": 3.3258803801006147, "grad_norm": 0.39319124817848206, "learning_rate": 0.0002918367346938776, "loss": 0.1145, "step": 5950 }, { "epoch": 3.3538289547233093, "grad_norm": 0.7558562755584717, "learning_rate": 0.0002896184560780834, "loss": 0.1449, "step": 6000 }, { "epoch": 3.3817775293460035, "grad_norm": 0.686841607093811, "learning_rate": 0.00028740017746228927, "loss": 0.1269, "step": 6050 }, { "epoch": 3.4097261039686977, "grad_norm": 0.3600720167160034, "learning_rate": 0.00028518189884649514, "loss": 0.1095, "step": 6100 }, { "epoch": 3.437674678591392, "grad_norm": 0.38374069333076477, "learning_rate": 0.000282963620230701, "loss": 0.11, "step": 6150 }, { "epoch": 3.465623253214086, "grad_norm": 0.736842691898346, "learning_rate": 0.0002807453416149068, "loss": 0.1099, "step": 6200 }, { "epoch": 3.49357182783678, "grad_norm": 0.21129830181598663, 
"learning_rate": 0.0002785270629991127, "loss": 0.1291, "step": 6250 }, { "epoch": 3.5215204024594744, "grad_norm": 0.6699479222297668, "learning_rate": 0.0002763087843833185, "loss": 0.1018, "step": 6300 }, { "epoch": 3.549468977082169, "grad_norm": 1.1297788619995117, "learning_rate": 0.00027409050576752443, "loss": 0.0986, "step": 6350 }, { "epoch": 3.577417551704863, "grad_norm": 0.7633982300758362, "learning_rate": 0.00027187222715173025, "loss": 0.1249, "step": 6400 }, { "epoch": 3.6053661263275574, "grad_norm": 1.3231158256530762, "learning_rate": 0.0002696539485359361, "loss": 0.1178, "step": 6450 }, { "epoch": 3.6333147009502516, "grad_norm": 0.5314428806304932, "learning_rate": 0.00026743566992014193, "loss": 0.1094, "step": 6500 }, { "epoch": 3.6612632755729457, "grad_norm": 0.3701280355453491, "learning_rate": 0.00026521739130434786, "loss": 0.12, "step": 6550 }, { "epoch": 3.68921185019564, "grad_norm": 0.6473478674888611, "learning_rate": 0.00026299911268855367, "loss": 0.106, "step": 6600 }, { "epoch": 3.717160424818334, "grad_norm": 0.4988769590854645, "learning_rate": 0.00026078083407275954, "loss": 0.1396, "step": 6650 }, { "epoch": 3.7451089994410287, "grad_norm": 1.0782082080841064, "learning_rate": 0.0002585625554569654, "loss": 0.1144, "step": 6700 }, { "epoch": 3.773057574063723, "grad_norm": 0.846603274345398, "learning_rate": 0.00025634427684117123, "loss": 0.1142, "step": 6750 }, { "epoch": 3.801006148686417, "grad_norm": 1.2522852420806885, "learning_rate": 0.00025412599822537715, "loss": 0.1086, "step": 6800 }, { "epoch": 3.8289547233091112, "grad_norm": 0.30715683102607727, "learning_rate": 0.00025190771960958297, "loss": 0.1023, "step": 6850 }, { "epoch": 3.8569032979318054, "grad_norm": 1.1746628284454346, "learning_rate": 0.00024968944099378884, "loss": 0.1182, "step": 6900 }, { "epoch": 3.8848518725544996, "grad_norm": 2.2346272468566895, "learning_rate": 0.0002474711623779947, "loss": 0.1207, "step": 6950 }, { "epoch": 3.9128004471771938, "grad_norm": 1.6879889965057373, "learning_rate": 0.0002452528837622005, "loss": 0.101, "step": 7000 }, { "epoch": 3.9407490217998884, "grad_norm": 0.777055025100708, "learning_rate": 0.00024303460514640642, "loss": 0.1051, "step": 7050 }, { "epoch": 3.9686975964225826, "grad_norm": 1.065346598625183, "learning_rate": 0.00024081632653061226, "loss": 0.1055, "step": 7100 }, { "epoch": 3.9966461710452768, "grad_norm": 2.3775737285614014, "learning_rate": 0.0002385980479148181, "loss": 0.097, "step": 7150 }, { "epoch": 4.0, "eval_f1": 0.9374824782730586, "eval_loss": 0.15188030898571014, "eval_runtime": 15.6619, "eval_samples_per_second": 233.688, "eval_steps_per_second": 7.343, "step": 7156 }, { "epoch": 4.024594745667971, "grad_norm": 0.8017207980155945, "learning_rate": 0.00023637976929902397, "loss": 0.1102, "step": 7200 }, { "epoch": 4.052543320290665, "grad_norm": 2.158339023590088, "learning_rate": 0.00023416149068322982, "loss": 0.1033, "step": 7250 }, { "epoch": 4.08049189491336, "grad_norm": 0.8033800721168518, "learning_rate": 0.00023194321206743569, "loss": 0.1054, "step": 7300 }, { "epoch": 4.1084404695360535, "grad_norm": 1.1149848699569702, "learning_rate": 0.00022972493345164153, "loss": 0.0884, "step": 7350 }, { "epoch": 4.136389044158748, "grad_norm": 0.18090881407260895, "learning_rate": 0.0002275066548358474, "loss": 0.091, "step": 7400 }, { "epoch": 4.164337618781442, "grad_norm": 0.6261000633239746, "learning_rate": 0.00022528837622005324, "loss": 0.0842, "step": 7450 }, { "epoch": 4.192286193404136, 
"grad_norm": 0.44622424244880676, "learning_rate": 0.00022307009760425908, "loss": 0.0933, "step": 7500 }, { "epoch": 4.220234768026831, "grad_norm": 1.2650983333587646, "learning_rate": 0.00022085181898846495, "loss": 0.0982, "step": 7550 }, { "epoch": 4.248183342649525, "grad_norm": 2.512014150619507, "learning_rate": 0.0002186335403726708, "loss": 0.0959, "step": 7600 }, { "epoch": 4.276131917272219, "grad_norm": 0.9572253227233887, "learning_rate": 0.00021641526175687667, "loss": 0.1197, "step": 7650 }, { "epoch": 4.304080491894913, "grad_norm": 0.8612358570098877, "learning_rate": 0.0002141969831410825, "loss": 0.0898, "step": 7700 }, { "epoch": 4.332029066517608, "grad_norm": 0.13563434779644012, "learning_rate": 0.00021197870452528838, "loss": 0.1268, "step": 7750 }, { "epoch": 4.3599776411403015, "grad_norm": 0.9207750558853149, "learning_rate": 0.00020976042590949425, "loss": 0.1138, "step": 7800 }, { "epoch": 4.387926215762996, "grad_norm": 0.15072673559188843, "learning_rate": 0.00020754214729370012, "loss": 0.0854, "step": 7850 }, { "epoch": 4.41587479038569, "grad_norm": 0.5902436375617981, "learning_rate": 0.00020532386867790596, "loss": 0.0967, "step": 7900 }, { "epoch": 4.4438233650083845, "grad_norm": 2.2286317348480225, "learning_rate": 0.0002031055900621118, "loss": 0.0939, "step": 7950 }, { "epoch": 4.471771939631079, "grad_norm": 0.6109320521354675, "learning_rate": 0.00020088731144631767, "loss": 0.0887, "step": 8000 }, { "epoch": 4.499720514253773, "grad_norm": 1.340773582458496, "learning_rate": 0.00019866903283052352, "loss": 0.0837, "step": 8050 }, { "epoch": 4.5276690888764675, "grad_norm": 1.1809900999069214, "learning_rate": 0.00019645075421472938, "loss": 0.0765, "step": 8100 }, { "epoch": 4.555617663499161, "grad_norm": 0.8299195170402527, "learning_rate": 0.00019423247559893523, "loss": 0.0972, "step": 8150 }, { "epoch": 4.583566238121856, "grad_norm": 0.18145732581615448, "learning_rate": 0.0001920141969831411, "loss": 0.075, "step": 8200 }, { "epoch": 4.61151481274455, "grad_norm": 0.2446570098400116, "learning_rate": 0.00018979591836734694, "loss": 0.0857, "step": 8250 }, { "epoch": 4.639463387367244, "grad_norm": 1.534759759902954, "learning_rate": 0.00018757763975155278, "loss": 0.0793, "step": 8300 }, { "epoch": 4.667411961989939, "grad_norm": 1.6062694787979126, "learning_rate": 0.00018535936113575865, "loss": 0.0931, "step": 8350 }, { "epoch": 4.6953605366126325, "grad_norm": 2.1251513957977295, "learning_rate": 0.0001831410825199645, "loss": 0.1003, "step": 8400 }, { "epoch": 4.723309111235327, "grad_norm": 0.9858797788619995, "learning_rate": 0.00018092280390417036, "loss": 0.082, "step": 8450 }, { "epoch": 4.751257685858021, "grad_norm": 1.1654304265975952, "learning_rate": 0.0001787045252883762, "loss": 0.0603, "step": 8500 }, { "epoch": 4.7792062604807155, "grad_norm": 1.731586217880249, "learning_rate": 0.00017648624667258208, "loss": 0.0593, "step": 8550 }, { "epoch": 4.807154835103409, "grad_norm": 0.524991512298584, "learning_rate": 0.00017426796805678795, "loss": 0.0928, "step": 8600 }, { "epoch": 4.835103409726104, "grad_norm": 0.76012122631073, "learning_rate": 0.00017204968944099382, "loss": 0.0901, "step": 8650 }, { "epoch": 4.8630519843487985, "grad_norm": 1.038218379020691, "learning_rate": 0.00016983141082519966, "loss": 0.0712, "step": 8700 }, { "epoch": 4.891000558971492, "grad_norm": 0.7694293260574341, "learning_rate": 0.0001676131322094055, "loss": 0.0801, "step": 8750 }, { "epoch": 4.918949133594187, "grad_norm": 
1.4149705171585083, "learning_rate": 0.00016539485359361137, "loss": 0.1054, "step": 8800 }, { "epoch": 4.946897708216881, "grad_norm": 0.35135123133659363, "learning_rate": 0.00016317657497781721, "loss": 0.0917, "step": 8850 }, { "epoch": 4.974846282839575, "grad_norm": 1.8134843111038208, "learning_rate": 0.00016095829636202308, "loss": 0.0946, "step": 8900 }, { "epoch": 5.0, "eval_f1": 0.9494430162810625, "eval_loss": 0.12213503569364548, "eval_runtime": 16.0206, "eval_samples_per_second": 228.456, "eval_steps_per_second": 7.178, "step": 8945 }, { "epoch": 5.00279485746227, "grad_norm": 0.8835757374763489, "learning_rate": 0.00015874001774622893, "loss": 0.0792, "step": 8950 }, { "epoch": 5.0307434320849636, "grad_norm": 0.5846070647239685, "learning_rate": 0.0001565217391304348, "loss": 0.0834, "step": 9000 }, { "epoch": 5.058692006707658, "grad_norm": 0.4132053256034851, "learning_rate": 0.00015430346051464064, "loss": 0.0622, "step": 9050 }, { "epoch": 5.086640581330352, "grad_norm": 3.0904829502105713, "learning_rate": 0.00015208518189884648, "loss": 0.0626, "step": 9100 }, { "epoch": 5.1145891559530465, "grad_norm": 2.243579387664795, "learning_rate": 0.00014986690328305235, "loss": 0.0795, "step": 9150 }, { "epoch": 5.14253773057574, "grad_norm": 1.4317448139190674, "learning_rate": 0.0001476486246672582, "loss": 0.0642, "step": 9200 }, { "epoch": 5.170486305198435, "grad_norm": 0.3954031765460968, "learning_rate": 0.00014543034605146406, "loss": 0.0748, "step": 9250 }, { "epoch": 5.1984348798211295, "grad_norm": 0.7129592895507812, "learning_rate": 0.0001432120674356699, "loss": 0.0631, "step": 9300 }, { "epoch": 5.226383454443823, "grad_norm": 0.7119801044464111, "learning_rate": 0.0001409937888198758, "loss": 0.0672, "step": 9350 }, { "epoch": 5.254332029066518, "grad_norm": 1.8398321866989136, "learning_rate": 0.00013877551020408165, "loss": 0.0627, "step": 9400 }, { "epoch": 5.282280603689212, "grad_norm": 1.1668697595596313, "learning_rate": 0.0001365572315882875, "loss": 0.0661, "step": 9450 }, { "epoch": 5.310229178311906, "grad_norm": 0.6019765734672546, "learning_rate": 0.00013433895297249336, "loss": 0.0615, "step": 9500 }, { "epoch": 5.3381777529346, "grad_norm": 2.365870237350464, "learning_rate": 0.0001321206743566992, "loss": 0.0582, "step": 9550 }, { "epoch": 5.366126327557295, "grad_norm": 0.10710655897855759, "learning_rate": 0.00012990239574090507, "loss": 0.052, "step": 9600 }, { "epoch": 5.394074902179989, "grad_norm": 0.6959599852561951, "learning_rate": 0.0001276841171251109, "loss": 0.0558, "step": 9650 }, { "epoch": 5.422023476802683, "grad_norm": 0.11615774035453796, "learning_rate": 0.00012546583850931678, "loss": 0.0625, "step": 9700 }, { "epoch": 5.4499720514253776, "grad_norm": 0.057329438626766205, "learning_rate": 0.00012324755989352263, "loss": 0.0455, "step": 9750 }, { "epoch": 5.477920626048071, "grad_norm": 3.285327196121216, "learning_rate": 0.00012102928127772848, "loss": 0.0674, "step": 9800 }, { "epoch": 5.505869200670766, "grad_norm": 2.17206072807312, "learning_rate": 0.00011881100266193434, "loss": 0.0678, "step": 9850 }, { "epoch": 5.5338177752934605, "grad_norm": 0.2885390520095825, "learning_rate": 0.0001165927240461402, "loss": 0.0512, "step": 9900 }, { "epoch": 5.561766349916154, "grad_norm": 2.1970298290252686, "learning_rate": 0.00011437444543034606, "loss": 0.0598, "step": 9950 }, { "epoch": 5.589714924538849, "grad_norm": 0.1274396926164627, "learning_rate": 0.00011215616681455192, "loss": 0.0717, "step": 10000 }, { "epoch": 
5.617663499161543, "grad_norm": 0.7299688458442688, "learning_rate": 0.00010993788819875776, "loss": 0.0797, "step": 10050 }, { "epoch": 5.645612073784237, "grad_norm": 0.5048921704292297, "learning_rate": 0.00010771960958296362, "loss": 0.0592, "step": 10100 }, { "epoch": 5.673560648406931, "grad_norm": 1.5433499813079834, "learning_rate": 0.00010550133096716947, "loss": 0.0702, "step": 10150 }, { "epoch": 5.701509223029626, "grad_norm": 0.23040899634361267, "learning_rate": 0.00010328305235137533, "loss": 0.0654, "step": 10200 }, { "epoch": 5.729457797652319, "grad_norm": 1.693513035774231, "learning_rate": 0.00010106477373558119, "loss": 0.0687, "step": 10250 }, { "epoch": 5.757406372275014, "grad_norm": 4.076319694519043, "learning_rate": 9.884649511978706e-05, "loss": 0.0639, "step": 10300 }, { "epoch": 5.785354946897709, "grad_norm": 0.48212429881095886, "learning_rate": 9.662821650399291e-05, "loss": 0.0604, "step": 10350 }, { "epoch": 5.813303521520402, "grad_norm": 0.04306049272418022, "learning_rate": 9.440993788819877e-05, "loss": 0.0495, "step": 10400 }, { "epoch": 5.841252096143097, "grad_norm": 0.23358555138111115, "learning_rate": 9.223602484472051e-05, "loss": 0.0615, "step": 10450 }, { "epoch": 5.869200670765791, "grad_norm": 1.8047716617584229, "learning_rate": 9.001774622892636e-05, "loss": 0.0531, "step": 10500 }, { "epoch": 5.897149245388485, "grad_norm": 0.1505836844444275, "learning_rate": 8.779946761313221e-05, "loss": 0.0529, "step": 10550 }, { "epoch": 5.92509782001118, "grad_norm": 0.42754456400871277, "learning_rate": 8.558118899733806e-05, "loss": 0.0567, "step": 10600 }, { "epoch": 5.953046394633874, "grad_norm": 1.9926128387451172, "learning_rate": 8.336291038154392e-05, "loss": 0.0473, "step": 10650 }, { "epoch": 5.980994969256568, "grad_norm": 0.3347943425178528, "learning_rate": 8.114463176574978e-05, "loss": 0.0688, "step": 10700 }, { "epoch": 6.0, "eval_f1": 0.9666100735710244, "eval_loss": 0.08690152317285538, "eval_runtime": 15.4749, "eval_samples_per_second": 236.512, "eval_steps_per_second": 7.431, "step": 10734 }, { "epoch": 6.008943543879262, "grad_norm": 0.8442544341087341, "learning_rate": 7.892635314995563e-05, "loss": 0.0456, "step": 10750 }, { "epoch": 6.036892118501957, "grad_norm": 1.1793649196624756, "learning_rate": 7.67080745341615e-05, "loss": 0.0434, "step": 10800 }, { "epoch": 6.06484069312465, "grad_norm": 0.9821094870567322, "learning_rate": 7.448979591836736e-05, "loss": 0.0455, "step": 10850 }, { "epoch": 6.092789267747345, "grad_norm": 0.5856317281723022, "learning_rate": 7.227151730257321e-05, "loss": 0.0365, "step": 10900 }, { "epoch": 6.120737842370039, "grad_norm": 0.5461094379425049, "learning_rate": 7.005323868677906e-05, "loss": 0.0342, "step": 10950 }, { "epoch": 6.148686416992733, "grad_norm": 2.0931904315948486, "learning_rate": 6.783496007098491e-05, "loss": 0.0423, "step": 11000 }, { "epoch": 6.176634991615428, "grad_norm": 0.046953145414590836, "learning_rate": 6.561668145519077e-05, "loss": 0.0523, "step": 11050 }, { "epoch": 6.204583566238122, "grad_norm": 1.0641871690750122, "learning_rate": 6.339840283939662e-05, "loss": 0.0421, "step": 11100 }, { "epoch": 6.232532140860816, "grad_norm": 0.4122409224510193, "learning_rate": 6.118012422360248e-05, "loss": 0.0441, "step": 11150 }, { "epoch": 6.26048071548351, "grad_norm": 0.2935519218444824, "learning_rate": 5.8961845607808344e-05, "loss": 0.0354, "step": 11200 }, { "epoch": 6.288429290106205, "grad_norm": 0.8135849833488464, "learning_rate": 5.6743566992014193e-05, 
"loss": 0.0333, "step": 11250 }, { "epoch": 6.316377864728899, "grad_norm": 1.1806564331054688, "learning_rate": 5.4525288376220056e-05, "loss": 0.0614, "step": 11300 }, { "epoch": 6.344326439351593, "grad_norm": 0.5362429022789001, "learning_rate": 5.230700976042591e-05, "loss": 0.0453, "step": 11350 }, { "epoch": 6.372275013974288, "grad_norm": 0.10853467881679535, "learning_rate": 5.008873114463176e-05, "loss": 0.0371, "step": 11400 }, { "epoch": 6.400223588596981, "grad_norm": 2.1878767013549805, "learning_rate": 4.787045252883762e-05, "loss": 0.0418, "step": 11450 }, { "epoch": 6.428172163219676, "grad_norm": 1.1209208965301514, "learning_rate": 4.565217391304348e-05, "loss": 0.0257, "step": 11500 }, { "epoch": 6.45612073784237, "grad_norm": 1.4330588579177856, "learning_rate": 4.343389529724934e-05, "loss": 0.0595, "step": 11550 }, { "epoch": 6.484069312465064, "grad_norm": 1.9309511184692383, "learning_rate": 4.1215616681455187e-05, "loss": 0.0357, "step": 11600 }, { "epoch": 6.512017887087758, "grad_norm": 0.004909028764814138, "learning_rate": 3.899733806566105e-05, "loss": 0.0332, "step": 11650 }, { "epoch": 6.539966461710453, "grad_norm": 0.5627462267875671, "learning_rate": 3.6779059449866906e-05, "loss": 0.0416, "step": 11700 }, { "epoch": 6.567915036333147, "grad_norm": 4.456967830657959, "learning_rate": 3.456078083407276e-05, "loss": 0.045, "step": 11750 }, { "epoch": 6.595863610955841, "grad_norm": 2.8317513465881348, "learning_rate": 3.234250221827861e-05, "loss": 0.0305, "step": 11800 }, { "epoch": 6.623812185578536, "grad_norm": 2.5559463500976562, "learning_rate": 3.012422360248447e-05, "loss": 0.0297, "step": 11850 }, { "epoch": 6.651760760201229, "grad_norm": 0.5523830056190491, "learning_rate": 2.790594498669033e-05, "loss": 0.0325, "step": 11900 }, { "epoch": 6.679709334823924, "grad_norm": 0.04988823086023331, "learning_rate": 2.5687666370896183e-05, "loss": 0.0313, "step": 11950 }, { "epoch": 6.707657909446619, "grad_norm": 1.5902944803237915, "learning_rate": 2.3469387755102043e-05, "loss": 0.0369, "step": 12000 }, { "epoch": 6.735606484069312, "grad_norm": 1.3617613315582275, "learning_rate": 2.1251109139307896e-05, "loss": 0.0338, "step": 12050 }, { "epoch": 6.763555058692007, "grad_norm": 0.13116395473480225, "learning_rate": 1.9032830523513755e-05, "loss": 0.0254, "step": 12100 }, { "epoch": 6.791503633314701, "grad_norm": 0.7528799176216125, "learning_rate": 1.6814551907719608e-05, "loss": 0.0322, "step": 12150 }, { "epoch": 6.819452207937395, "grad_norm": 1.894806146621704, "learning_rate": 1.4596273291925467e-05, "loss": 0.0254, "step": 12200 }, { "epoch": 6.847400782560089, "grad_norm": 0.5565615892410278, "learning_rate": 1.2377994676131324e-05, "loss": 0.0284, "step": 12250 }, { "epoch": 6.875349357182784, "grad_norm": 0.45591992139816284, "learning_rate": 1.015971606033718e-05, "loss": 0.0356, "step": 12300 }, { "epoch": 6.9032979318054775, "grad_norm": 1.1384029388427734, "learning_rate": 7.941437444543036e-06, "loss": 0.0308, "step": 12350 }, { "epoch": 6.931246506428172, "grad_norm": 0.24669720232486725, "learning_rate": 5.7231588287488905e-06, "loss": 0.0385, "step": 12400 }, { "epoch": 6.959195081050867, "grad_norm": 2.1776230335235596, "learning_rate": 3.5048802129547475e-06, "loss": 0.0271, "step": 12450 }, { "epoch": 6.98714365567356, "grad_norm": 0.8091051578521729, "learning_rate": 1.2866015971606034e-06, "loss": 0.0303, "step": 12500 }, { "epoch": 7.0, "eval_f1": 0.9760225669957687, "eval_loss": 0.06743501126766205, "eval_runtime": 16.4262, 
"eval_samples_per_second": 222.815, "eval_steps_per_second": 7.001, "step": 12523 }, { "epoch": 7.0, "step": 12523, "total_flos": 2.1045216318570566e+19, "train_loss": 0.12371139868869004, "train_runtime": 2341.9199, "train_samples_per_second": 171.111, "train_steps_per_second": 5.347 } ], "logging_steps": 50, "max_steps": 12523, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1045216318570566e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }