{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984,
  "eval_steps": 10000,
  "global_step": 78,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "advantage": 2.050441083312035,
      "completion_length": 549.2318519592285,
      "epoch": 0.064,
      "grad_norm": 0.3377276062965393,
      "kl": 0.00016334056854248047,
      "learning_rate": 1.875e-06,
      "loss": 0.0,
      "reward": 1.4327381163835526,
      "reward_std": 0.2913255948573351,
      "rewards/accuracy_reward": 0.43273810148239134,
      "rewards/format_reward": 1.0,
      "step": 5
    },
    {
      "advantage": 1.8731657922267915,
      "completion_length": 535.2622138977051,
      "epoch": 0.128,
      "grad_norm": 0.2055761218070984,
      "kl": 0.001010751724243164,
      "learning_rate": 2.993961440992859e-06,
      "loss": 0.0,
      "reward": 1.500892886519432,
      "reward_std": 0.25598252601921556,
      "rewards/accuracy_reward": 0.5008928671479225,
      "rewards/format_reward": 1.0,
      "step": 10
    },
    {
      "advantage": 1.8132987678050996,
      "completion_length": 546.9449508666992,
      "epoch": 0.192,
      "grad_norm": 0.22043277323246002,
      "kl": 0.00526275634765625,
      "learning_rate": 2.9265847744427307e-06,
      "loss": 0.0002,
      "reward": 1.535119077563286,
      "reward_std": 0.23049755450338125,
      "rewards/accuracy_reward": 0.5351190570741892,
      "rewards/format_reward": 1.0,
      "step": 15
    },
    {
      "advantage": 1.9496307998895646,
      "completion_length": 559.6901908874512,
      "epoch": 0.256,
      "grad_norm": 0.18047218024730682,
      "kl": 0.006356048583984375,
      "learning_rate": 2.7876731904027993e-06,
      "loss": 0.0003,
      "reward": 1.5279762238264083,
      "reward_std": 0.21949078943580388,
      "rewards/accuracy_reward": 0.5279762007296085,
      "rewards/format_reward": 1.0,
      "step": 20
    },
    {
      "advantage": 1.9445811256766319,
      "completion_length": 561.3247123718262,
      "epoch": 0.32,
      "grad_norm": 0.17815344035625458,
      "kl": 0.00570068359375,
      "learning_rate": 2.584192295741087e-06,
      "loss": 0.0002,
      "reward": 1.5446428924798965,
      "reward_std": 0.23059090217575431,
      "rewards/accuracy_reward": 0.5446428753435612,
      "rewards/format_reward": 1.0,
      "step": 25
    },
    {
      "advantage": 1.7600619524717331,
      "completion_length": 547.6345359802247,
      "epoch": 0.384,
      "grad_norm": 0.18666820228099823,
      "kl": 0.00573883056640625,
      "learning_rate": 2.3263454721781537e-06,
      "loss": 0.0002,
      "reward": 1.5431547909975052,
      "reward_std": 0.21657919492572547,
      "rewards/accuracy_reward": 0.5431547746062279,
      "rewards/format_reward": 1.0,
      "step": 30
    },
    {
      "advantage": 1.7993007361888886,
      "completion_length": 546.9157859802247,
      "epoch": 0.448,
      "grad_norm": 0.1700781136751175,
      "kl": 0.00593109130859375,
      "learning_rate": 2.027062236122014e-06,
      "loss": 0.0002,
      "reward": 1.5273809880018234,
      "reward_std": 0.2096662785857916,
      "rewards/accuracy_reward": 0.5273809637874365,
      "rewards/format_reward": 1.0,
      "step": 35
    },
    {
      "advantage": 1.896444058418274,
      "completion_length": 552.3208389282227,
      "epoch": 0.512,
      "grad_norm": 0.1920849233865738,
      "kl": 0.005233001708984375,
      "learning_rate": 1.7013498987264833e-06,
      "loss": 0.0002,
      "reward": 1.5434524178504945,
      "reward_std": 0.22238524220883846,
      "rewards/accuracy_reward": 0.5434523917734623,
      "rewards/format_reward": 1.0,
      "step": 40
    },
    {
      "advantage": 1.870384192466736,
      "completion_length": 558.927098083496,
      "epoch": 0.576,
      "grad_norm": 0.18718139827251434,
      "kl": 0.0053081512451171875,
      "learning_rate": 1.3655410366448499e-06,
      "loss": 0.0002,
      "reward": 1.5154762089252471,
      "reward_std": 0.21234574727714062,
      "rewards/accuracy_reward": 0.5154761992394924,
      "rewards/format_reward": 1.0,
      "step": 45
    },
    {
      "advantage": 1.9293582290410995,
      "completion_length": 554.5389984130859,
      "epoch": 0.64,
      "grad_norm": 0.2384500652551651,
      "kl": 0.0057861328125,
      "learning_rate": 1.036474508437579e-06,
      "loss": 0.0002,
      "reward": 1.5339285999536514,
      "reward_std": 0.2257133638486266,
      "rewards/accuracy_reward": 0.5339285805821419,
      "rewards/format_reward": 1.0,
      "step": 50
    },
    {
      "advantage": 1.9427187949419022,
      "completion_length": 556.450008392334,
      "epoch": 0.704,
      "grad_norm": 0.19755816459655762,
      "kl": 0.005307769775390625,
      "learning_rate": 7.30651083891141e-07,
      "loss": 0.0002,
      "reward": 1.5452381312847137,
      "reward_std": 0.2197699649259448,
      "rewards/accuracy_reward": 0.545238108932972,
      "rewards/format_reward": 1.0,
      "step": 55
    },
    {
      "advantage": 1.8897637754678727,
      "completion_length": 560.4640045166016,
      "epoch": 0.768,
      "grad_norm": 0.24272561073303223,
      "kl": 0.005287933349609375,
      "learning_rate": 4.63406026519703e-07,
      "loss": 0.0002,
      "reward": 1.525297647714615,
      "reward_std": 0.24526160284876825,
      "rewards/accuracy_reward": 0.5252976305782795,
      "rewards/format_reward": 1.0,
      "step": 60
    },
    {
      "advantage": 1.9503421396017075,
      "completion_length": 557.6199478149414,
      "epoch": 0.832,
      "grad_norm": 0.18535475432872772,
      "kl": 0.005422210693359375,
      "learning_rate": 2.48140119418046e-07,
      "loss": 0.0002,
      "reward": 1.517559552192688,
      "reward_std": 0.21927062328904867,
      "rewards/accuracy_reward": 0.5175595372915268,
      "rewards/format_reward": 1.0,
      "step": 65
    },
    {
      "advantage": 1.9777270182967186,
      "completion_length": 550.9339363098145,
      "epoch": 0.896,
      "grad_norm": 0.23573845624923706,
      "kl": 0.00623931884765625,
      "learning_rate": 9.564769404039419e-08,
      "loss": 0.0002,
      "reward": 1.5324405014514924,
      "reward_std": 0.22246364038437605,
      "rewards/accuracy_reward": 0.5324404895305633,
      "rewards/format_reward": 1.0,
      "step": 70
    },
    {
      "advantage": 1.8454515248537064,
      "completion_length": 566.992268371582,
      "epoch": 0.96,
      "grad_norm": 0.18028907477855682,
      "kl": 0.00513458251953125,
      "learning_rate": 1.357535734809795e-08,
      "loss": 0.0002,
      "reward": 1.515476217865944,
      "reward_std": 0.22508260188624263,
      "rewards/accuracy_reward": 0.5154761977493763,
      "rewards/format_reward": 1.0,
      "step": 75
    },
    {
      "advantage": 2.0446028262376785,
      "completion_length": 550.128978729248,
      "epoch": 0.9984,
      "kl": 0.005794525146484375,
      "reward": 1.536210338274638,
      "reward_std": 0.22843041829764843,
      "rewards/accuracy_reward": 0.5362103283405304,
      "rewards/format_reward": 1.0,
      "step": 78,
      "total_flos": 0.0,
      "train_loss": 0.0001987359993598567,
      "train_runtime": 7692.2558,
      "train_samples_per_second": 0.975,
      "train_steps_per_second": 0.01
    }
  ],
  "logging_steps": 5,
  "max_steps": 78,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}