{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9893390191897655, "eval_steps": 100, "global_step": 174, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 648.7421875, "epoch": 0.017057569296375266, "grad_norm": 0.06589560955762863, "kl": 0.0, "learning_rate": 1.6666666666666665e-07, "loss": 0.0062, "reward": 0.703125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.69921875, "rewards/reflection_reward_pos": 0.00390625, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 648.5087890625, "epoch": 0.08528784648187633, "grad_norm": 0.06861326843500137, "kl": 0.00011852383613586426, "learning_rate": 8.333333333333334e-07, "loss": -0.0011, "reward": 0.6640625, "reward_std": 0.09115048055537045, "rewards/accuracy_reward": 0.6640625, "rewards/reflection_reward_pos": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 638.33359375, "epoch": 0.17057569296375266, "grad_norm": 0.06051831692457199, "kl": 0.00015828609466552734, "learning_rate": 1.6666666666666669e-06, "loss": 0.01, "reward": 0.67734375, "reward_std": 0.10275145107880235, "rewards/accuracy_reward": 0.67734375, "rewards/reflection_reward_pos": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 622.8640625, "epoch": 0.255863539445629, "grad_norm": 0.09521586447954178, "kl": 0.0001518726348876953, "learning_rate": 2.5e-06, "loss": 0.0024, "reward": 0.71015625, "reward_std": 0.12042912095785141, "rewards/accuracy_reward": 0.709375, "rewards/reflection_reward_pos": 0.00078125, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 607.86640625, "epoch": 0.3411513859275053, "grad_norm": 0.08084629476070404, "kl": 0.00022783279418945313, "learning_rate": 2.9987834972573546e-06, "loss": 0.0036, "reward": 0.70546875, "reward_std": 0.12042912160977721, "rewards/accuracy_reward": 0.70390625, "rewards/reflection_reward_pos": 0.0015625, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 645.02421875, "epoch": 0.42643923240938164, "grad_norm": 0.08875104784965515, "kl": 0.00043997764587402346, "learning_rate": 2.9851204919417464e-06, "loss": 0.0049, "reward": 0.65859375, "reward_std": 0.11379999481141567, "rewards/accuracy_reward": 0.65859375, "rewards/reflection_reward_pos": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 670.86953125, "epoch": 0.511727078891258, "grad_norm": 0.07565722614526749, "kl": 0.0007645606994628907, "learning_rate": 2.956412726139078e-06, "loss": 0.0067, "reward": 0.67421875, "reward_std": 0.10054174307733774, "rewards/accuracy_reward": 0.67265625, "rewards/reflection_reward_pos": 0.0015625, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 659.73203125, "epoch": 0.5970149253731343, "grad_norm": 0.07428640872240067, "kl": 0.0010577201843261718, "learning_rate": 2.9129510189868974e-06, "loss": 0.0062, "reward": 0.65, "reward_std": 0.08838834529742598, "rewards/accuracy_reward": 0.65, "rewards/reflection_reward_pos": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 645.309375, "epoch": 0.6823027718550106, "grad_norm": 0.08162426203489304, "kl": 0.0015665054321289062, "learning_rate": 2.8551756519155732e-06, "loss": 0.0098, "reward": 0.6921875, "reward_std": 0.10385630577802658, "rewards/accuracy_reward": 0.6921875, "rewards/reflection_reward_pos": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 655.0703125, "epoch": 0.767590618336887, "grad_norm": 0.08563440293073654, "kl": 0.0018520355224609375, "learning_rate": 2.7836719084521715e-06, "loss": 0.0064, "reward": 0.65859375, "reward_std": 0.12484853798523546, "rewards/accuracy_reward": 0.65859375, "rewards/reflection_reward_pos": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 634.13515625, "epoch": 0.8528784648187633, "grad_norm": 0.08154138922691345, "kl": 0.0025421142578125, "learning_rate": 2.699164145105252e-06, "loss": 0.0071, "reward": 0.6671875, "reward_std": 0.11490484932437539, "rewards/accuracy_reward": 0.66484375, "rewards/reflection_reward_pos": 0.00234375, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 648.434375, "epoch": 0.9381663113006397, "grad_norm": 0.09137308597564697, "kl": 0.0026458740234375, "learning_rate": 2.602508453394493e-06, "loss": 0.0071, "reward": 0.6796875, "reward_std": 0.1303728088736534, "rewards/accuracy_reward": 0.6796875, "rewards/reflection_reward_pos": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 622.5903686523437, "epoch": 1.0341151385927505, "grad_norm": 0.10423822700977325, "kl": 0.0027740478515625, "learning_rate": 2.4946839873611927e-06, "loss": 0.0048, "reward": 0.6671875, "reward_std": 0.11932426644489169, "rewards/accuracy_reward": 0.6671875, "rewards/reflection_reward_pos": 0.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 608.39921875, "epoch": 1.1194029850746268, "grad_norm": 0.10257695615291595, "kl": 0.0032474517822265623, "learning_rate": 2.3767830444148337e-06, "loss": 0.0086, "reward": 0.68515625, "reward_std": 0.10496116010472178, "rewards/accuracy_reward": 0.68515625, "rewards/reflection_reward_pos": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 617.39140625, "epoch": 1.2046908315565032, "grad_norm": 0.0765712708234787, "kl": 0.0033966064453125, "learning_rate": 2.25e-06, "loss": 0.005, "reward": 0.68046875, "reward_std": 0.10275145145133138, "rewards/accuracy_reward": 0.68046875, "rewards/reflection_reward_pos": 0.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 620.7875, "epoch": 1.2899786780383795, "grad_norm": 0.1029704362154007, "kl": 0.0029888153076171875, "learning_rate": 2.1156192081791355e-06, "loss": 0.0023, "reward": 0.6828125, "reward_std": 0.12595339212566614, "rewards/accuracy_reward": 0.68203125, "rewards/reflection_reward_pos": 0.00078125, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 615.33828125, "epoch": 1.375266524520256, "grad_norm": 0.08894416689872742, "kl": 0.0030879974365234375, "learning_rate": 1.975001990702209e-06, "loss": 0.0046, "reward": 0.71484375, "reward_std": 0.12042912067845464, "rewards/accuracy_reward": 0.71484375, "rewards/reflection_reward_pos": 0.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 629.1375, "epoch": 1.4605543710021323, "grad_norm": 0.08637527376413345, "kl": 0.0033039093017578126, "learning_rate": 1.829572846368326e-06, "loss": 0.0055, "reward": 0.690625, "reward_std": 0.11490484941750764, "rewards/accuracy_reward": 0.68984375, "rewards/reflection_reward_pos": 0.00078125, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 652.8375, "epoch": 1.5458422174840085, "grad_norm": 0.0866626650094986, "kl": 0.0031810760498046874, "learning_rate": 1.6808050203829845e-06, "loss": 0.002, "reward": 0.6484375, "reward_std": 0.09501747125759721, "rewards/accuracy_reward": 0.64765625, "rewards/reflection_reward_pos": 0.00078125, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 646.7421875, "epoch": 1.6311300639658848, "grad_norm": 0.09559585154056549, "kl": 0.0030200958251953127, "learning_rate": 1.5302055798981605e-06, "loss": 0.0044, "reward": 0.65625, "reward_std": 0.11048543220385909, "rewards/accuracy_reward": 0.65625, "rewards/reflection_reward_pos": 0.0, "step": 95 }, { "epoch": 1.716417910447761, "grad_norm": 0.06905966252088547, "learning_rate": 1.3793001469249112e-06, "loss": 0.0084, "step": 100 }, { "epoch": 1.716417910447761, "eval_clip_ratio": 0.0, "eval_completion_length": 625.3352635782747, "eval_kl": 0.0029512838052865417, "eval_loss": 0.0050412570126354694, "eval_reward": 0.6336861022364217, "eval_reward_std": 0.12947621855872887, "eval_rewards/accuracy_reward": 0.6328873801916933, "eval_rewards/reflection_reward_pos": 0.0007987220447284345, "eval_runtime": 6061.8369, "eval_samples_per_second": 0.825, "eval_steps_per_second": 0.026, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 643.7375, "epoch": 1.8017057569296375, "grad_norm": 0.08424519002437592, "kl": 0.0029918670654296873, "learning_rate": 1.2296174432791415e-06, "loss": 0.0065, "reward": 0.676171875, "reward_std": 0.11103785866871477, "rewards/accuracy_reward": 0.676171875, "rewards/reflection_reward_pos": 0.0, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 644.7671875, "epoch": 1.886993603411514, "grad_norm": 0.08362549543380737, "kl": 0.00287628173828125, "learning_rate": 1.0826738041253211e-06, "loss": 0.0102, "reward": 0.6671875, "reward_std": 0.12374368365854024, "rewards/accuracy_reward": 0.6671875, "rewards/reflection_reward_pos": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 633.04375, "epoch": 1.9722814498933903, "grad_norm": 0.0973709300160408, "kl": 0.003045654296875, "learning_rate": 9.399578170010685e-07, "loss": 0.0006, "reward": 0.67265625, "reward_std": 0.1182194116525352, "rewards/accuracy_reward": 0.67265625, "rewards/reflection_reward_pos": 0.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 651.9445343017578, "epoch": 2.068230277185501, "grad_norm": 0.07164692878723145, "kl": 0.0029529571533203126, "learning_rate": 8.029152419343472e-07, "loss": 0.0074, "reward": 0.67265625, "reward_std": 0.10496116001158953, "rewards/accuracy_reward": 0.67265625, "rewards/reflection_reward_pos": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 639.64375, "epoch": 2.1535181236673773, "grad_norm": 0.07949467748403549, "kl": 0.0030269622802734375, "learning_rate": 6.729343654174626e-07, "loss": 0.0081, "reward": 0.6953125, "reward_std": 0.11490484857931733, "rewards/accuracy_reward": 0.6953125, "rewards/reflection_reward_pos": 0.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 656.5984375, "epoch": 2.2388059701492535, "grad_norm": 0.08217156678438187, "kl": 0.0028797149658203124, "learning_rate": 5.513319366069343e-07, "loss": 0.0076, "reward": 0.6890625, "reward_std": 0.11711455835029483, "rewards/accuracy_reward": 0.6890625, "rewards/reflection_reward_pos": 0.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 657.54296875, "epoch": 2.3240938166311302, "grad_norm": 0.08467955142259598, "kl": 0.0028537750244140626, "learning_rate": 4.3933982822017883e-07, "loss": 0.0046, "reward": 0.67109375, "reward_std": 0.10938057713210583, "rewards/accuracy_reward": 0.6703125, "rewards/reflection_reward_pos": 0.00078125, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 642.66484375, "epoch": 2.4093816631130065, "grad_norm": 0.08187657594680786, "kl": 0.0027801513671875, "learning_rate": 3.380925572585183e-07, "loss": 0.008, "reward": 0.6625, "reward_std": 0.11711455713957548, "rewards/accuracy_reward": 0.6625, "rewards/reflection_reward_pos": 0.0, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 622.1828125, "epoch": 2.4946695095948828, "grad_norm": 0.08926448225975037, "kl": 0.00284576416015625, "learning_rate": 2.4861579197570804e-07, "loss": 0.0096, "reward": 0.678125, "reward_std": 0.11711455807089806, "rewards/accuracy_reward": 0.678125, "rewards/reflection_reward_pos": 0.0, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 636.60546875, "epoch": 2.579957356076759, "grad_norm": 0.09873559325933456, "kl": 0.00290679931640625, "learning_rate": 1.718159615201853e-07, "loss": 0.0088, "reward": 0.66328125, "reward_std": 0.12263882830739022, "rewards/accuracy_reward": 0.6625, "rewards/reflection_reward_pos": 0.00078125, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 641.1921875, "epoch": 2.6652452025586353, "grad_norm": 0.08141663670539856, "kl": 0.00279998779296875, "learning_rate": 1.0847107350878571e-07, "loss": 0.0056, "reward": 0.66875, "reward_std": 0.11269514048472047, "rewards/accuracy_reward": 0.66796875, "rewards/reflection_reward_pos": 0.00078125, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 635.07578125, "epoch": 2.750533049040512, "grad_norm": 0.07290147989988327, "kl": 0.002706146240234375, "learning_rate": 5.922283255294164e-08, "loss": 0.0028, "reward": 0.7171875, "reward_std": 0.09722718009725213, "rewards/accuracy_reward": 0.7171875, "rewards/reflection_reward_pos": 0.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 633.3, "epoch": 2.835820895522388, "grad_norm": 0.07411785423755646, "kl": 0.0027303695678710938, "learning_rate": 2.4570139579284723e-08, "loss": 0.0062, "reward": 0.6953125, "reward_std": 0.10385630559176207, "rewards/accuracy_reward": 0.6953125, "rewards/reflection_reward_pos": 0.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 639.90546875, "epoch": 2.9211087420042645, "grad_norm": 0.09646094590425491, "kl": 0.0026676177978515623, "learning_rate": 4.864037798685106e-09, "loss": 0.0018, "reward": 0.69296875, "reward_std": 0.10717086931690574, "rewards/accuracy_reward": 0.69296875, "rewards/reflection_reward_pos": 0.0, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 644.8330116271973, "epoch": 2.9893390191897655, "kl": 0.0027103424072265625, "reward": 0.669921875, "reward_std": 0.11877183895558119, "rewards/accuracy_reward": 0.669921875, "rewards/reflection_reward_pos": 0.0, "step": 174, "total_flos": 0.0, "train_loss": 0.005782647762212089, "train_runtime": 41644.7787, "train_samples_per_second": 0.54, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 174, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }