{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.008678295582747549, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.678295582747549e-05, "eval_loss": 1.574290156364441, "eval_runtime": 661.121, "eval_samples_per_second": 29.355, "eval_steps_per_second": 3.67, "step": 1 }, { "epoch": 0.0002603488674824265, "grad_norm": 5.837701797485352, "learning_rate": 1.5e-05, "loss": 6.4896, "step": 3 }, { "epoch": 0.000520697734964853, "grad_norm": 5.339621067047119, "learning_rate": 3e-05, "loss": 6.5447, "step": 6 }, { "epoch": 0.0007810466024472793, "grad_norm": 5.166632175445557, "learning_rate": 4.5e-05, "loss": 6.1585, "step": 9 }, { "epoch": 0.0007810466024472793, "eval_loss": 1.4268044233322144, "eval_runtime": 666.8349, "eval_samples_per_second": 29.103, "eval_steps_per_second": 3.638, "step": 9 }, { "epoch": 0.001041395469929706, "grad_norm": 3.3273203372955322, "learning_rate": 4.993910125649561e-05, "loss": 5.1469, "step": 12 }, { "epoch": 0.0013017443374121322, "grad_norm": 3.0238025188446045, "learning_rate": 4.962019382530521e-05, "loss": 4.947, "step": 15 }, { "epoch": 0.0015620932048945586, "grad_norm": 2.8387701511383057, "learning_rate": 4.9031542398457974e-05, "loss": 4.3529, "step": 18 }, { "epoch": 0.0015620932048945586, "eval_loss": 1.0735480785369873, "eval_runtime": 667.1413, "eval_samples_per_second": 29.09, "eval_steps_per_second": 3.636, "step": 18 }, { "epoch": 0.0018224420723769851, "grad_norm": 3.0103814601898193, "learning_rate": 4.817959636416969e-05, "loss": 4.4745, "step": 21 }, { "epoch": 0.002082790939859412, "grad_norm": 2.6641955375671387, "learning_rate": 4.707368982147318e-05, "loss": 3.9389, "step": 24 }, { "epoch": 0.002343139807341838, "grad_norm": 2.33498215675354, "learning_rate": 4.572593931387604e-05, "loss": 3.6797, "step": 27 }, { "epoch": 0.002343139807341838, "eval_loss": 0.8915460705757141, "eval_runtime": 667.0273, "eval_samples_per_second": 29.095, "eval_steps_per_second": 3.637, "step": 27 }, { "epoch": 0.0026034886748242643, "grad_norm": 2.407578229904175, "learning_rate": 4.415111107797445e-05, "loss": 3.656, "step": 30 }, { "epoch": 0.002863837542306691, "grad_norm": 2.5195510387420654, "learning_rate": 4.2366459261474933e-05, "loss": 3.4146, "step": 33 }, { "epoch": 0.0031241864097891173, "grad_norm": 2.285335063934326, "learning_rate": 4.039153688314145e-05, "loss": 3.0848, "step": 36 }, { "epoch": 0.0031241864097891173, "eval_loss": 0.7998289465904236, "eval_runtime": 665.3923, "eval_samples_per_second": 29.166, "eval_steps_per_second": 3.646, "step": 36 }, { "epoch": 0.003384535277271544, "grad_norm": 2.3626725673675537, "learning_rate": 3.824798160583012e-05, "loss": 3.2387, "step": 39 }, { "epoch": 0.0036448841447539702, "grad_norm": 2.3358616828918457, "learning_rate": 3.5959278669726935e-05, "loss": 2.9941, "step": 42 }, { "epoch": 0.003905233012236397, "grad_norm": 2.2276687622070312, "learning_rate": 3.355050358314172e-05, "loss": 2.9752, "step": 45 }, { "epoch": 0.003905233012236397, "eval_loss": 0.7518236041069031, "eval_runtime": 666.2992, "eval_samples_per_second": 29.127, "eval_steps_per_second": 3.641, "step": 45 }, { "epoch": 0.004165581879718824, "grad_norm": 2.41260027885437, "learning_rate": 3.104804738999169e-05, "loss": 2.82, "step": 48 }, { "epoch": 0.0044259307472012495, "grad_norm": 2.4345157146453857, "learning_rate": 2.8479327524001636e-05, "loss": 2.9344, "step": 51 }, { "epoch": 
0.004686279614683676, "grad_norm": 2.6772620677948, "learning_rate": 2.587248741756253e-05, "loss": 2.8991, "step": 54 }, { "epoch": 0.004686279614683676, "eval_loss": 0.7258293628692627, "eval_runtime": 667.3649, "eval_samples_per_second": 29.08, "eval_steps_per_second": 3.635, "step": 54 }, { "epoch": 0.004946628482166103, "grad_norm": 2.7180323600769043, "learning_rate": 2.3256088156396868e-05, "loss": 2.8935, "step": 57 }, { "epoch": 0.005206977349648529, "grad_norm": 2.3361828327178955, "learning_rate": 2.0658795558326743e-05, "loss": 2.8249, "step": 60 }, { "epoch": 0.005467326217130955, "grad_norm": 2.265934467315674, "learning_rate": 1.8109066104575023e-05, "loss": 2.8911, "step": 63 }, { "epoch": 0.005467326217130955, "eval_loss": 0.709642767906189, "eval_runtime": 666.8626, "eval_samples_per_second": 29.102, "eval_steps_per_second": 3.638, "step": 63 }, { "epoch": 0.005727675084613382, "grad_norm": 2.4326114654541016, "learning_rate": 1.56348351646022e-05, "loss": 2.9059, "step": 66 }, { "epoch": 0.005988023952095809, "grad_norm": 2.208724021911621, "learning_rate": 1.3263210930352737e-05, "loss": 2.7232, "step": 69 }, { "epoch": 0.006248372819578235, "grad_norm": 2.313538074493408, "learning_rate": 1.1020177413231334e-05, "loss": 2.79, "step": 72 }, { "epoch": 0.006248372819578235, "eval_loss": 0.700292706489563, "eval_runtime": 666.7836, "eval_samples_per_second": 29.105, "eval_steps_per_second": 3.638, "step": 72 }, { "epoch": 0.006508721687060661, "grad_norm": 2.1600091457366943, "learning_rate": 8.930309757836517e-06, "loss": 2.7683, "step": 75 }, { "epoch": 0.006769070554543088, "grad_norm": 2.334808588027954, "learning_rate": 7.016504991533726e-06, "loss": 2.7657, "step": 78 }, { "epoch": 0.007029419422025514, "grad_norm": 2.3936593532562256, "learning_rate": 5.299731159831953e-06, "loss": 2.8566, "step": 81 }, { "epoch": 0.007029419422025514, "eval_loss": 0.6951488256454468, "eval_runtime": 666.3577, "eval_samples_per_second": 29.124, "eval_steps_per_second": 3.641, "step": 81 }, { "epoch": 0.0072897682895079405, "grad_norm": 2.512799024581909, "learning_rate": 3.798797596089351e-06, "loss": 2.7116, "step": 84 }, { "epoch": 0.007550117156990367, "grad_norm": 2.50602388381958, "learning_rate": 2.5301488425208296e-06, "loss": 2.7318, "step": 87 }, { "epoch": 0.007810466024472794, "grad_norm": 2.2158546447753906, "learning_rate": 1.5076844803522922e-06, "loss": 2.8176, "step": 90 }, { "epoch": 0.007810466024472794, "eval_loss": 0.6929637789726257, "eval_runtime": 666.938, "eval_samples_per_second": 29.099, "eval_steps_per_second": 3.638, "step": 90 }, { "epoch": 0.00807081489195522, "grad_norm": 2.417557954788208, "learning_rate": 7.426068431000882e-07, "loss": 2.7837, "step": 93 }, { "epoch": 0.008331163759437647, "grad_norm": 2.59735107421875, "learning_rate": 2.4329828146074095e-07, "loss": 2.846, "step": 96 }, { "epoch": 0.008591512626920072, "grad_norm": 2.75762677192688, "learning_rate": 1.522932452260595e-08, "loss": 2.8172, "step": 99 }, { "epoch": 0.008591512626920072, "eval_loss": 0.6925280690193176, "eval_runtime": 667.4952, "eval_samples_per_second": 29.074, "eval_steps_per_second": 3.634, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.2442104332288e+16, "train_batch_size": 
8, "trial_name": null, "trial_params": null }