Qwen-2.5-7B-Simple-RL / trainer_state.json
lzy337's picture
Model save
a3b4118 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 100,
"global_step": 156,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 604.3152841567993,
"epoch": 0.032,
"grad_norm": 0.14503227174282074,
"kl": 0.00012428760528564452,
"learning_rate": 9.375e-07,
"loss": 0.0,
"reward": 0.6548611164093018,
"reward_std": 0.3155763540416956,
"rewards/accuracy_reward": 0.6541666720062494,
"rewards/format_reward": 0.0006944444496184588,
"step": 5
},
{
"completion_length": 643.951396560669,
"epoch": 0.064,
"grad_norm": 0.08788644522428513,
"kl": 0.00025680065155029295,
"learning_rate": 1.875e-06,
"loss": 0.0,
"reward": 0.6215277845971287,
"reward_std": 0.30458943024277685,
"rewards/accuracy_reward": 0.6208333398215473,
"rewards/format_reward": 0.0006944444496184588,
"step": 10
},
{
"completion_length": 606.6493129730225,
"epoch": 0.096,
"grad_norm": 0.10090751200914383,
"kl": 0.0006760239601135253,
"learning_rate": 2.8125e-06,
"loss": 0.0,
"reward": 0.6791666748467833,
"reward_std": 0.27497591376304625,
"rewards/accuracy_reward": 0.6791666748467833,
"rewards/format_reward": 0.0,
"step": 15
},
{
"completion_length": 628.4958393096924,
"epoch": 0.128,
"grad_norm": 0.10319401323795319,
"kl": 0.0019372463226318359,
"learning_rate": 2.993961440992859e-06,
"loss": 0.0001,
"reward": 0.7437500089406968,
"reward_std": 0.23143401462584734,
"rewards/accuracy_reward": 0.7437500089406968,
"rewards/format_reward": 0.0,
"step": 20
},
{
"completion_length": 637.5111164093017,
"epoch": 0.16,
"grad_norm": 0.0020248512737452984,
"kl": 0.13670992851257324,
"learning_rate": 2.9695130976348534e-06,
"loss": 0.0055,
"reward": 0.7256944484543055,
"reward_std": 0.23655044846236706,
"rewards/accuracy_reward": 0.7256944484543055,
"rewards/format_reward": 0.0,
"step": 25
},
{
"completion_length": 628.5451442718506,
"epoch": 0.192,
"grad_norm": 0.09042873978614807,
"kl": 0.0024117946624755858,
"learning_rate": 2.9265847744427307e-06,
"loss": 0.0001,
"reward": 0.7555555619299412,
"reward_std": 0.20260455049574375,
"rewards/accuracy_reward": 0.7555555619299412,
"rewards/format_reward": 0.0,
"step": 30
},
{
"completion_length": 615.7291702270508,
"epoch": 0.224,
"grad_norm": 0.07207166403532028,
"kl": 0.0032260894775390627,
"learning_rate": 2.865716319988224e-06,
"loss": 0.0001,
"reward": 0.7326388933230191,
"reward_std": 0.21108066253364086,
"rewards/accuracy_reward": 0.7326388933230191,
"rewards/format_reward": 0.0,
"step": 35
},
{
"completion_length": 626.9736179351806,
"epoch": 0.256,
"grad_norm": 0.054428163915872574,
"kl": 0.0026816368103027345,
"learning_rate": 2.7876731904027993e-06,
"loss": 0.0001,
"reward": 0.7451388964429497,
"reward_std": 0.2157622052356601,
"rewards/accuracy_reward": 0.7451388964429497,
"rewards/format_reward": 0.0,
"step": 40
},
{
"completion_length": 594.1208389282226,
"epoch": 0.288,
"grad_norm": 0.08016934990882874,
"kl": 0.002810811996459961,
"learning_rate": 2.6934368233226715e-06,
"loss": 0.0001,
"reward": 0.7694444504100829,
"reward_std": 0.1868146162480116,
"rewards/accuracy_reward": 0.7694444504100829,
"rewards/format_reward": 0.0,
"step": 45
},
{
"completion_length": 614.3645858764648,
"epoch": 0.32,
"grad_norm": 0.11283142864704132,
"kl": 0.002644062042236328,
"learning_rate": 2.584192295741087e-06,
"loss": 0.0001,
"reward": 0.7666666714474559,
"reward_std": 0.18280234448611737,
"rewards/accuracy_reward": 0.7666666714474559,
"rewards/format_reward": 0.0,
"step": 50
},
{
"completion_length": 587.173616027832,
"epoch": 0.352,
"grad_norm": 0.052785150706768036,
"kl": 0.003171491622924805,
"learning_rate": 2.461313420977536e-06,
"loss": 0.0001,
"reward": 0.7930555634200573,
"reward_std": 0.17841423936188222,
"rewards/accuracy_reward": 0.7930555634200573,
"rewards/format_reward": 0.0,
"step": 55
},
{
"completion_length": 593.7201442718506,
"epoch": 0.384,
"grad_norm": 0.1886148303747177,
"kl": 0.004308557510375977,
"learning_rate": 2.3263454721781537e-06,
"loss": 0.0002,
"reward": 0.777083340100944,
"reward_std": 0.18415420949459077,
"rewards/accuracy_reward": 0.777083340100944,
"rewards/format_reward": 0.0,
"step": 60
},
{
"completion_length": 600.0354248046875,
"epoch": 0.416,
"grad_norm": 0.019889166578650475,
"kl": 0.0036341667175292967,
"learning_rate": 2.18098574960932e-06,
"loss": 0.0001,
"reward": 0.7277777815237642,
"reward_std": 0.18065068628638983,
"rewards/accuracy_reward": 0.7277777815237642,
"rewards/format_reward": 0.0,
"step": 65
},
{
"completion_length": 598.8555599212647,
"epoch": 0.448,
"grad_norm": 0.050428278744220734,
"kl": 0.016699600219726562,
"learning_rate": 2.027062236122014e-06,
"loss": 0.0007,
"reward": 0.733333338610828,
"reward_std": 0.18704659640789031,
"rewards/accuracy_reward": 0.733333338610828,
"rewards/format_reward": 0.0,
"step": 70
},
{
"completion_length": 589.2645866394043,
"epoch": 0.48,
"grad_norm": 0.5229154229164124,
"kl": 0.004603099822998047,
"learning_rate": 1.866510609206841e-06,
"loss": 0.0002,
"reward": 0.7680555649101735,
"reward_std": 0.20577130634337665,
"rewards/accuracy_reward": 0.7680555649101735,
"rewards/format_reward": 0.0,
"step": 75
},
{
"completion_length": 613.6562576293945,
"epoch": 0.512,
"grad_norm": 0.03132776543498039,
"kl": 0.0037424564361572266,
"learning_rate": 1.7013498987264833e-06,
"loss": 0.0001,
"reward": 0.7611111178994179,
"reward_std": 0.1536103853955865,
"rewards/accuracy_reward": 0.7611111178994179,
"rewards/format_reward": 0.0,
"step": 80
},
{
"completion_length": 580.3000034332275,
"epoch": 0.544,
"grad_norm": 0.041480712592601776,
"kl": 0.0032805442810058595,
"learning_rate": 1.5336570964437077e-06,
"loss": 0.0001,
"reward": 0.7500000051222742,
"reward_std": 0.18460483234375716,
"rewards/accuracy_reward": 0.7500000051222742,
"rewards/format_reward": 0.0,
"step": 85
},
{
"completion_length": 577.7583400726319,
"epoch": 0.576,
"grad_norm": 0.0553722009062767,
"kl": 0.003495025634765625,
"learning_rate": 1.3655410366448499e-06,
"loss": 0.0001,
"reward": 0.7687500093132258,
"reward_std": 0.17976610995829106,
"rewards/accuracy_reward": 0.7687500093132258,
"rewards/format_reward": 0.0,
"step": 90
},
{
"completion_length": 585.31389503479,
"epoch": 0.608,
"grad_norm": 0.05203121528029442,
"kl": 0.003870105743408203,
"learning_rate": 1.199115876325091e-06,
"loss": 0.0002,
"reward": 0.7888888973742724,
"reward_std": 0.15750880874693393,
"rewards/accuracy_reward": 0.7888888973742724,
"rewards/format_reward": 0.0,
"step": 95
},
{
"completion_length": 565.354870223999,
"epoch": 0.64,
"grad_norm": 0.06253661960363388,
"kl": 0.003356456756591797,
"learning_rate": 1.036474508437579e-06,
"loss": 0.0001,
"reward": 0.7930555619299412,
"reward_std": 0.15678047761321068,
"rewards/accuracy_reward": 0.7930555619299412,
"rewards/format_reward": 0.0,
"step": 100
},
{
"epoch": 0.64,
"eval_completion_length": 580.157574237716,
"eval_kl": 0.007348269420822866,
"eval_loss": 0.00029379926854744554,
"eval_reward": 0.6814303872967167,
"eval_reward_std": 0.20519427888108788,
"eval_rewards/accuracy_reward": 0.6814303872967167,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 15488.5186,
"eval_samples_per_second": 0.323,
"eval_steps_per_second": 0.108,
"step": 100
},
{
"completion_length": 574.4409788131713,
"epoch": 0.672,
"grad_norm": 0.034538425505161285,
"kl": 0.003883838653564453,
"learning_rate": 8.796622425502193e-07,
"loss": 0.0002,
"reward": 0.7833333427086473,
"reward_std": 0.1512267516925931,
"rewards/accuracy_reward": 0.7833333427086473,
"rewards/format_reward": 0.0,
"step": 105
},
{
"completion_length": 611.8923690795898,
"epoch": 0.704,
"grad_norm": 0.004378489218652248,
"kl": 0.0033061027526855467,
"learning_rate": 7.30651083891141e-07,
"loss": 0.0001,
"reward": 0.7430555634200573,
"reward_std": 0.19789394550025463,
"rewards/accuracy_reward": 0.7430555634200573,
"rewards/format_reward": 0.0,
"step": 110
},
{
"completion_length": 608.2687576293945,
"epoch": 0.736,
"grad_norm": 0.0850713923573494,
"kl": 0.0032596588134765625,
"learning_rate": 5.913149342387704e-07,
"loss": 0.0001,
"reward": 0.7694444509223104,
"reward_std": 0.1801286071538925,
"rewards/accuracy_reward": 0.7694444509223104,
"rewards/format_reward": 0.0,
"step": 115
},
{
"completion_length": 572.259033203125,
"epoch": 0.768,
"grad_norm": 0.06911196559667587,
"kl": 0.0034391403198242186,
"learning_rate": 4.63406026519703e-07,
"loss": 0.0001,
"reward": 0.7722222298849374,
"reward_std": 0.15622506141662598,
"rewards/accuracy_reward": 0.7722222298849374,
"rewards/format_reward": 0.0,
"step": 120
},
{
"completion_length": 604.2020915985107,
"epoch": 0.8,
"grad_norm": 0.09203875064849854,
"kl": 0.003432178497314453,
"learning_rate": 3.4853288946298335e-07,
"loss": 0.0001,
"reward": 0.7388888964429497,
"reward_std": 0.18809502944350243,
"rewards/accuracy_reward": 0.7388888964429497,
"rewards/format_reward": 0.0,
"step": 125
},
{
"completion_length": 582.3465305328369,
"epoch": 0.832,
"grad_norm": 0.04850884899497032,
"kl": 0.0036045074462890624,
"learning_rate": 2.48140119418046e-07,
"loss": 0.0001,
"reward": 0.7687500065192581,
"reward_std": 0.15752214156091213,
"rewards/accuracy_reward": 0.7687500065192581,
"rewards/format_reward": 0.0,
"step": 130
},
{
"completion_length": 599.1916728973389,
"epoch": 0.864,
"grad_norm": 0.03212558850646019,
"kl": 0.0031602859497070314,
"learning_rate": 1.634902137174483e-07,
"loss": 0.0001,
"reward": 0.7312500088475644,
"reward_std": 0.1761287273839116,
"rewards/accuracy_reward": 0.7312500088475644,
"rewards/format_reward": 0.0,
"step": 135
},
{
"completion_length": 577.2263946533203,
"epoch": 0.896,
"grad_norm": 0.03874632343649864,
"kl": 0.003940582275390625,
"learning_rate": 9.564769404039419e-08,
"loss": 0.0002,
"reward": 0.7729166727513075,
"reward_std": 0.189943253621459,
"rewards/accuracy_reward": 0.7729166727513075,
"rewards/format_reward": 0.0,
"step": 140
},
{
"completion_length": 575.8805583953857,
"epoch": 0.928,
"grad_norm": 0.07863741368055344,
"kl": 0.004055213928222656,
"learning_rate": 4.546571943496969e-08,
"loss": 0.0002,
"reward": 0.7590277843177319,
"reward_std": 0.19089928902685643,
"rewards/accuracy_reward": 0.7590277843177319,
"rewards/format_reward": 0.0,
"step": 145
},
{
"completion_length": 585.861116027832,
"epoch": 0.96,
"grad_norm": 0.02937915176153183,
"kl": 0.004036521911621094,
"learning_rate": 1.357535734809795e-08,
"loss": 0.0002,
"reward": 0.7618055611848831,
"reward_std": 0.169761879183352,
"rewards/accuracy_reward": 0.7618055611848831,
"rewards/format_reward": 0.0,
"step": 150
},
{
"completion_length": 580.2944515228271,
"epoch": 0.992,
"grad_norm": 0.04815535247325897,
"kl": 0.003546333312988281,
"learning_rate": 3.77647586240204e-10,
"loss": 0.0001,
"reward": 0.7680555626749992,
"reward_std": 0.18228026237338782,
"rewards/accuracy_reward": 0.7680555626749992,
"rewards/format_reward": 0.0,
"step": 155
},
{
"completion_length": 629.1527805328369,
"epoch": 0.9984,
"kl": 0.002917766571044922,
"reward": 0.7187500055879354,
"reward_std": 0.1746763400733471,
"rewards/accuracy_reward": 0.7187500055879354,
"rewards/format_reward": 0.0,
"step": 156,
"total_flos": 0.0,
"train_loss": 0.0003127560741445333,
"train_runtime": 45856.1745,
"train_samples_per_second": 0.164,
"train_steps_per_second": 0.003
}
],
"logging_steps": 5,
"max_steps": 156,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}