jingluo commited on
Commit
24cd100
·
verified ·
1 Parent(s): 68b0885

Model save

Browse files
README.md CHANGED
@@ -26,7 +26,7 @@ print(output["generated_text"])
26
 
27
  ## Training procedure
28
 
29
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/luojing020713-siat/huggingface/runs/h0e5120i)
30
 
31
 
32
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/luojing020713-siat/huggingface/runs/i24cg4sm)
30
 
31
 
32
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.019510615981001962,
4
- "train_runtime": 40590.3738,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 0.554,
7
  "train_steps_per_second": 0.004
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.005782647762212089,
4
+ "train_runtime": 41644.7787,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 0.54,
7
  "train_steps_per_second": 0.004
8
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1eea86c220d76b32b2b9cc665dd8fec3d8a18681bdb03f0c0e420a3b5c108fbc
3
  size 4957560304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a295261b7109e67173c7e7151d15f10df25b1a1a0842779d40df942c3caccdf
3
  size 4957560304
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e40b6202a36e1c10cff24667c72d0716696a37d80ba3fd243368c348d390b70b
3
  size 1836696752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7065c71486bc879a9100559062b65218d55d9b73b619409d90ec664d49b7ce79
3
  size 1836696752
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.019510615981001962,
4
- "train_runtime": 40590.3738,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 0.554,
7
  "train_steps_per_second": 0.004
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.005782647762212089,
4
+ "train_runtime": 41644.7787,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 0.54,
7
  "train_steps_per_second": 0.004
8
  }
trainer_state.json CHANGED
@@ -10,552 +10,516 @@
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
13
- "completion_length": 598.53515625,
14
  "epoch": 0.017057569296375266,
15
- "grad_norm": 0.21371020376682281,
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666665e-07,
18
- "loss": 0.0226,
19
- "reward": 0.6796875,
20
- "reward_std": 0.25411649234592915,
21
- "rewards/accuracy_reward": 0.53125,
22
- "rewards/format_reward": 0.1484375,
23
- "rewards/reflection_reward_pos": 0.0,
24
  "step": 1
25
  },
26
  {
27
  "clip_ratio": 0.0,
28
- "completion_length": 573.52734375,
29
  "epoch": 0.08528784648187633,
30
- "grad_norm": 0.22427839040756226,
31
- "kl": 0.00026004016399383545,
32
  "learning_rate": 8.333333333333334e-07,
33
- "loss": 0.0238,
34
- "reward": 0.6337890625,
35
- "reward_std": 0.24444902036339045,
36
- "rewards/accuracy_reward": 0.4833984375,
37
- "rewards/format_reward": 0.1494140625,
38
- "rewards/reflection_reward_pos": 0.0009765625,
39
  "step": 5
40
  },
41
  {
42
  "clip_ratio": 0.0,
43
- "completion_length": 530.23671875,
44
  "epoch": 0.17057569296375266,
45
- "grad_norm": 0.24708664417266846,
46
- "kl": 0.0064833641052246095,
47
  "learning_rate": 1.6666666666666669e-06,
48
- "loss": 0.0162,
49
- "reward": 0.6828125,
50
- "reward_std": 0.22759998859837652,
51
- "rewards/accuracy_reward": 0.51171875,
52
- "rewards/format_reward": 0.1703125,
53
- "rewards/reflection_reward_pos": 0.00078125,
54
  "step": 10
55
  },
56
  {
57
  "clip_ratio": 0.0,
58
- "completion_length": 543.28359375,
59
  "epoch": 0.255863539445629,
60
- "grad_norm": 0.20078016817569733,
61
- "kl": 0.001663541793823242,
62
  "learning_rate": 2.5e-06,
63
- "loss": 0.0205,
64
- "reward": 0.68671875,
65
- "reward_std": 0.22207571836188436,
66
- "rewards/accuracy_reward": 0.51640625,
67
- "rewards/format_reward": 0.16796875,
68
- "rewards/reflection_reward_pos": 0.00234375,
69
  "step": 15
70
  },
71
  {
72
  "clip_ratio": 0.0,
73
- "completion_length": 515.36875,
74
  "epoch": 0.3411513859275053,
75
- "grad_norm": 0.48252072930336,
76
- "kl": 0.008718490600585938,
77
  "learning_rate": 2.9987834972573546e-06,
78
- "loss": 0.0157,
79
- "reward": 0.70234375,
80
- "reward_std": 0.23533396841958165,
81
- "rewards/accuracy_reward": 0.53046875,
82
- "rewards/format_reward": 0.16875,
83
- "rewards/reflection_reward_pos": 0.003125,
84
  "step": 20
85
  },
86
  {
87
  "clip_ratio": 0.0,
88
- "completion_length": 563.7859375,
89
  "epoch": 0.42643923240938164,
90
- "grad_norm": 0.25852227210998535,
91
- "kl": 0.015869140625,
92
  "learning_rate": 2.9851204919417464e-06,
93
- "loss": 0.0099,
94
- "reward": 0.63984375,
95
- "reward_std": 0.22870484348386527,
96
- "rewards/accuracy_reward": 0.49140625,
97
- "rewards/format_reward": 0.1453125,
98
- "rewards/reflection_reward_pos": 0.003125,
99
  "step": 25
100
  },
101
  {
102
  "clip_ratio": 0.0,
103
- "completion_length": 568.0296875,
104
  "epoch": 0.511727078891258,
105
- "grad_norm": 0.17840448021888733,
106
- "kl": 0.017840576171875,
107
  "learning_rate": 2.956412726139078e-06,
108
- "loss": 0.0123,
109
- "reward": 0.6546875,
110
- "reward_std": 0.23422911390662193,
111
- "rewards/accuracy_reward": 0.51875,
112
- "rewards/format_reward": 0.13515625,
113
- "rewards/reflection_reward_pos": 0.00078125,
114
  "step": 30
115
  },
116
  {
117
  "clip_ratio": 0.0,
118
- "completion_length": 571.41328125,
119
  "epoch": 0.5970149253731343,
120
- "grad_norm": 0.20260392129421234,
121
- "kl": 0.022509765625,
122
  "learning_rate": 2.9129510189868974e-06,
123
- "loss": 0.0141,
124
- "reward": 0.64609375,
125
- "reward_std": 0.19997863341122865,
126
- "rewards/accuracy_reward": 0.48125,
127
- "rewards/format_reward": 0.16328125,
128
- "rewards/reflection_reward_pos": 0.0015625,
129
  "step": 35
130
  },
131
  {
132
  "clip_ratio": 0.0,
133
- "completion_length": 544.46328125,
134
  "epoch": 0.6823027718550106,
135
- "grad_norm": 0.3073066174983978,
136
- "kl": 0.0449493408203125,
137
  "learning_rate": 2.8551756519155732e-06,
138
- "loss": 0.0171,
139
- "reward": 0.6875,
140
- "reward_std": 0.21876115268096327,
141
- "rewards/accuracy_reward": 0.50078125,
142
- "rewards/format_reward": 0.1859375,
143
- "rewards/reflection_reward_pos": 0.00078125,
144
  "step": 40
145
  },
146
  {
147
  "clip_ratio": 0.0,
148
- "completion_length": 570.12890625,
149
  "epoch": 0.767590618336887,
150
- "grad_norm": 0.26227954030036926,
151
- "kl": 0.091302490234375,
152
  "learning_rate": 2.7836719084521715e-06,
153
- "loss": 0.0133,
154
- "reward": 0.65390625,
155
- "reward_std": 0.22428542636334897,
156
- "rewards/accuracy_reward": 0.49140625,
157
- "rewards/format_reward": 0.159375,
158
- "rewards/reflection_reward_pos": 0.003125,
159
  "step": 45
160
  },
161
  {
162
  "clip_ratio": 0.0,
163
- "completion_length": 548.32421875,
164
  "epoch": 0.8528784648187633,
165
- "grad_norm": 0.15913987159729004,
166
- "kl": 0.046868896484375,
167
  "learning_rate": 2.699164145105252e-06,
168
- "loss": 0.0176,
169
- "reward": 0.646875,
170
- "reward_std": 0.24085824135690928,
171
- "rewards/accuracy_reward": 0.46953125,
172
- "rewards/format_reward": 0.17578125,
173
- "rewards/reflection_reward_pos": 0.0015625,
174
  "step": 50
175
  },
176
  {
177
  "clip_ratio": 0.0,
178
- "completion_length": 539.67109375,
179
  "epoch": 0.9381663113006397,
180
- "grad_norm": 0.34673067927360535,
181
- "kl": 0.0633026123046875,
182
  "learning_rate": 2.602508453394493e-06,
183
- "loss": 0.0218,
184
- "reward": 0.68203125,
185
- "reward_std": 0.23975338581949474,
186
- "rewards/accuracy_reward": 0.50390625,
187
- "rewards/format_reward": 0.17578125,
188
- "rewards/reflection_reward_pos": 0.00234375,
189
  "step": 55
190
  },
191
  {
192
  "clip_ratio": 0.0,
193
- "completion_length": 533.0619819641113,
194
  "epoch": 1.0341151385927505,
195
- "grad_norm": 0.17685599625110626,
196
- "kl": 0.079449462890625,
197
  "learning_rate": 2.4946839873611927e-06,
198
- "loss": 0.0214,
199
- "reward": 0.66875,
200
- "reward_std": 0.223180572129786,
201
- "rewards/accuracy_reward": 0.51640625,
202
- "rewards/format_reward": 0.15078125,
203
- "rewards/reflection_reward_pos": 0.0015625,
204
  "step": 60
205
  },
206
  {
207
  "clip_ratio": 0.0,
208
- "completion_length": 531.99140625,
209
  "epoch": 1.1194029850746268,
210
- "grad_norm": 0.2092791497707367,
211
- "kl": 0.0752899169921875,
212
  "learning_rate": 2.3767830444148337e-06,
213
- "loss": 0.0198,
214
- "reward": 0.6484375,
215
- "reward_std": 0.24306795094162226,
216
- "rewards/accuracy_reward": 0.48828125,
217
- "rewards/format_reward": 0.1546875,
218
- "rewards/reflection_reward_pos": 0.00546875,
219
  "step": 65
220
  },
221
  {
222
  "clip_ratio": 0.0,
223
- "completion_length": 534.46328125,
224
  "epoch": 1.2046908315565032,
225
- "grad_norm": 0.16183458268642426,
226
- "kl": 0.06942138671875,
227
  "learning_rate": 2.25e-06,
228
- "loss": 0.0175,
229
- "reward": 0.69609375,
230
- "reward_std": 0.21765630040317774,
231
- "rewards/accuracy_reward": 0.5046875,
232
- "rewards/format_reward": 0.18984375,
233
- "rewards/reflection_reward_pos": 0.0015625,
234
  "step": 70
235
  },
236
  {
237
  "clip_ratio": 0.0,
238
- "completion_length": 550.5828125,
239
  "epoch": 1.2899786780383795,
240
- "grad_norm": 0.31448841094970703,
241
- "kl": 0.170635986328125,
242
  "learning_rate": 2.1156192081791355e-06,
243
- "loss": 0.0202,
244
- "reward": 0.6625,
245
- "reward_std": 0.22980969864875078,
246
- "rewards/accuracy_reward": 0.49765625,
247
- "rewards/format_reward": 0.1609375,
248
- "rewards/reflection_reward_pos": 0.00390625,
249
  "step": 75
250
  },
251
  {
252
  "clip_ratio": 0.0,
253
- "completion_length": 531.7296875,
254
  "epoch": 1.375266524520256,
255
- "grad_norm": 2.1899828910827637,
256
- "kl": 0.155224609375,
257
  "learning_rate": 1.975001990702209e-06,
258
- "loss": 0.0176,
259
- "reward": 0.6984375,
260
- "reward_std": 0.23422911493107676,
261
- "rewards/accuracy_reward": 0.52890625,
262
- "rewards/format_reward": 0.16875,
263
- "rewards/reflection_reward_pos": 0.00078125,
264
  "step": 80
265
  },
266
  {
267
  "clip_ratio": 0.0,
268
- "completion_length": 544.83046875,
269
  "epoch": 1.4605543710021323,
270
- "grad_norm": 0.4367416501045227,
271
- "kl": 0.205291748046875,
272
  "learning_rate": 1.829572846368326e-06,
273
- "loss": 0.0195,
274
- "reward": 0.675,
275
- "reward_std": 0.26074561905115845,
276
- "rewards/accuracy_reward": 0.509375,
277
- "rewards/format_reward": 0.16484375,
278
  "rewards/reflection_reward_pos": 0.00078125,
279
  "step": 85
280
  },
281
  {
282
  "clip_ratio": 0.0,
283
- "completion_length": 552.22890625,
284
  "epoch": 1.5458422174840085,
285
- "grad_norm": 0.24191254377365112,
286
- "kl": 0.123126220703125,
287
  "learning_rate": 1.6808050203829845e-06,
288
- "loss": 0.0164,
289
- "reward": 0.66015625,
290
- "reward_std": 0.21986600933596492,
291
- "rewards/accuracy_reward": 0.51015625,
292
- "rewards/format_reward": 0.14765625,
293
- "rewards/reflection_reward_pos": 0.00234375,
294
  "step": 90
295
  },
296
  {
297
  "clip_ratio": 0.0,
298
- "completion_length": 558.94765625,
299
  "epoch": 1.6311300639658848,
300
- "grad_norm": 0.27353477478027344,
301
- "kl": 0.246923828125,
302
  "learning_rate": 1.5302055798981605e-06,
303
- "loss": 0.0191,
304
- "reward": 0.6890625,
305
- "reward_std": 0.22539028134196998,
306
- "rewards/accuracy_reward": 0.51484375,
307
- "rewards/format_reward": 0.171875,
308
- "rewards/reflection_reward_pos": 0.00234375,
309
  "step": 95
310
  },
311
  {
312
  "epoch": 1.716417910447761,
313
- "grad_norm": 0.6983962059020996,
314
  "learning_rate": 1.3793001469249112e-06,
315
- "loss": 0.0255,
316
  "step": 100
317
  },
318
  {
319
  "epoch": 1.716417910447761,
320
  "eval_clip_ratio": 0.0,
321
- "eval_completion_length": 535.5470247603834,
322
- "eval_kl": 0.17427428614217252,
323
- "eval_loss": 0.01942022331058979,
324
- "eval_reward": 0.6449680511182109,
325
- "eval_reward_std": 0.24596245542835124,
326
- "eval_rewards/accuracy_reward": 0.461461661341853,
327
- "eval_rewards/format_reward": 0.1814097444089457,
328
- "eval_rewards/reflection_reward_pos": 0.0020966453674121405,
329
- "eval_runtime": 5724.0277,
330
- "eval_samples_per_second": 0.874,
331
- "eval_steps_per_second": 0.027,
332
  "step": 100
333
  },
334
  {
335
  "clip_ratio": 0.0,
336
- "completion_length": 561.4390625,
337
  "epoch": 1.8017057569296375,
338
- "grad_norm": 0.3275902569293976,
339
- "kl": 0.16116943359375,
340
  "learning_rate": 1.2296174432791415e-06,
341
- "loss": 0.0212,
342
- "reward": 0.686328125,
343
- "reward_std": 0.2524592101573944,
344
- "rewards/accuracy_reward": 0.516015625,
345
- "rewards/format_reward": 0.16796875,
346
- "rewards/reflection_reward_pos": 0.00234375,
347
  "step": 105
348
  },
349
  {
350
  "clip_ratio": 0.0,
351
- "completion_length": 557.31796875,
352
  "epoch": 1.886993603411514,
353
- "grad_norm": 0.6603850722312927,
354
- "kl": 0.16229248046875,
355
  "learning_rate": 1.0826738041253211e-06,
356
- "loss": 0.0244,
357
- "reward": 0.64921875,
358
- "reward_std": 0.22649513594806195,
359
- "rewards/accuracy_reward": 0.484375,
360
- "rewards/format_reward": 0.1625,
361
- "rewards/reflection_reward_pos": 0.00234375,
362
  "step": 110
363
  },
364
  {
365
  "clip_ratio": 0.0,
366
- "completion_length": 553.05,
367
  "epoch": 1.9722814498933903,
368
- "grad_norm": 0.3981146216392517,
369
- "kl": 0.19359130859375,
370
  "learning_rate": 9.399578170010685e-07,
371
- "loss": 0.0234,
372
- "reward": 0.665625,
373
- "reward_std": 0.2563262009993196,
374
- "rewards/accuracy_reward": 0.49296875,
375
- "rewards/format_reward": 0.16953125,
376
- "rewards/reflection_reward_pos": 0.003125,
377
  "step": 115
378
  },
379
  {
380
  "clip_ratio": 0.0,
381
- "completion_length": 575.7098999023438,
382
  "epoch": 2.068230277185501,
383
- "grad_norm": 0.39923951029777527,
384
- "kl": 0.27755126953125,
385
  "learning_rate": 8.029152419343472e-07,
386
- "loss": 0.016,
387
- "reward": 0.67421875,
388
- "reward_std": 0.21986600682139396,
389
- "rewards/accuracy_reward": 0.49609375,
390
- "rewards/format_reward": 0.17734375,
391
- "rewards/reflection_reward_pos": 0.00078125,
392
  "step": 120
393
  },
394
  {
395
  "clip_ratio": 0.0,
396
- "completion_length": 534.37890625,
397
  "epoch": 2.1535181236673773,
398
- "grad_norm": 0.7698010802268982,
399
- "kl": 0.29091796875,
400
  "learning_rate": 6.729343654174626e-07,
401
- "loss": 0.0192,
402
- "reward": 0.67578125,
403
- "reward_std": 0.24859222043305634,
404
- "rewards/accuracy_reward": 0.4953125,
405
- "rewards/format_reward": 0.17890625,
406
- "rewards/reflection_reward_pos": 0.0015625,
407
  "step": 125
408
  },
409
  {
410
  "clip_ratio": 0.0,
411
- "completion_length": 565.246875,
412
  "epoch": 2.2388059701492535,
413
- "grad_norm": 0.5925753116607666,
414
- "kl": 0.27186279296875,
415
  "learning_rate": 5.513319366069343e-07,
416
- "loss": 0.0179,
417
- "reward": 0.66640625,
418
- "reward_std": 0.24417280461639165,
419
- "rewards/accuracy_reward": 0.51328125,
420
- "rewards/format_reward": 0.15234375,
421
- "rewards/reflection_reward_pos": 0.00078125,
422
  "step": 130
423
  },
424
  {
425
  "clip_ratio": 0.0,
426
- "completion_length": 553.93984375,
427
  "epoch": 2.3240938166311302,
428
- "grad_norm": 0.45327264070510864,
429
- "kl": 0.29208984375,
430
  "learning_rate": 4.3933982822017883e-07,
431
- "loss": 0.0209,
432
- "reward": 0.6453125,
433
- "reward_std": 0.20550290141254662,
434
- "rewards/accuracy_reward": 0.46953125,
435
- "rewards/format_reward": 0.17421875,
436
- "rewards/reflection_reward_pos": 0.0015625,
437
  "step": 135
438
  },
439
  {
440
  "clip_ratio": 0.0,
441
- "completion_length": 544.51875,
442
  "epoch": 2.4093816631130065,
443
- "grad_norm": 11.81939697265625,
444
- "kl": 0.615234375,
445
  "learning_rate": 3.380925572585183e-07,
446
- "loss": 0.0236,
447
- "reward": 0.65546875,
448
- "reward_std": 0.22649513762444257,
449
- "rewards/accuracy_reward": 0.48359375,
450
- "rewards/format_reward": 0.16953125,
451
- "rewards/reflection_reward_pos": 0.00234375,
452
  "step": 140
453
  },
454
  {
455
  "clip_ratio": 0.0,
456
- "completion_length": 531.19765625,
457
  "epoch": 2.4946695095948828,
458
- "grad_norm": 0.4690285325050354,
459
- "kl": 0.2049560546875,
460
  "learning_rate": 2.4861579197570804e-07,
461
- "loss": 0.0214,
462
- "reward": 0.65546875,
463
- "reward_std": 0.2375436789356172,
464
- "rewards/accuracy_reward": 0.48359375,
465
- "rewards/format_reward": 0.17109375,
466
- "rewards/reflection_reward_pos": 0.00078125,
467
  "step": 145
468
  },
469
  {
470
  "clip_ratio": 0.0,
471
- "completion_length": 557.59921875,
472
  "epoch": 2.579957356076759,
473
- "grad_norm": 0.4469051957130432,
474
- "kl": 0.2426513671875,
475
  "learning_rate": 1.718159615201853e-07,
476
- "loss": 0.0271,
477
- "reward": 0.62734375,
478
- "reward_std": 0.2264951358549297,
479
- "rewards/accuracy_reward": 0.4875,
480
- "rewards/format_reward": 0.13828125,
481
- "rewards/reflection_reward_pos": 0.0015625,
482
  "step": 150
483
  },
484
  {
485
  "clip_ratio": 0.0,
486
- "completion_length": 544.9953125,
487
  "epoch": 2.6652452025586353,
488
- "grad_norm": 0.5272353887557983,
489
- "kl": 0.239892578125,
490
  "learning_rate": 1.0847107350878571e-07,
491
- "loss": 0.0235,
492
- "reward": 0.68046875,
493
- "reward_std": 0.23533396869897844,
494
- "rewards/accuracy_reward": 0.5125,
495
- "rewards/format_reward": 0.1671875,
496
  "rewards/reflection_reward_pos": 0.00078125,
497
  "step": 155
498
  },
499
  {
500
  "clip_ratio": 0.0,
501
- "completion_length": 545.94296875,
502
  "epoch": 2.750533049040512,
503
- "grad_norm": 0.6531479358673096,
504
- "kl": 0.25457763671875,
505
  "learning_rate": 5.922283255294164e-08,
506
- "loss": 0.0213,
507
- "reward": 0.70234375,
508
- "reward_std": 0.26847960213199257,
509
- "rewards/accuracy_reward": 0.52421875,
510
- "rewards/format_reward": 0.17734375,
511
- "rewards/reflection_reward_pos": 0.00078125,
512
  "step": 160
513
  },
514
  {
515
  "clip_ratio": 0.0,
516
- "completion_length": 551.03359375,
517
  "epoch": 2.835820895522388,
518
- "grad_norm": 0.6810430884361267,
519
- "kl": 0.2883544921875,
520
  "learning_rate": 2.4570139579284723e-08,
521
- "loss": 0.0245,
522
- "reward": 0.671875,
523
- "reward_std": 0.22097086254507303,
524
- "rewards/accuracy_reward": 0.5046875,
525
- "rewards/format_reward": 0.1640625,
526
- "rewards/reflection_reward_pos": 0.003125,
527
  "step": 165
528
  },
529
  {
530
  "clip_ratio": 0.0,
531
- "completion_length": 562.71015625,
532
  "epoch": 2.9211087420042645,
533
- "grad_norm": 0.718024492263794,
534
- "kl": 0.270947265625,
535
  "learning_rate": 4.864037798685106e-09,
536
- "loss": 0.017,
537
- "reward": 0.68359375,
538
- "reward_std": 0.23754367623478173,
539
- "rewards/accuracy_reward": 0.5265625,
540
- "rewards/format_reward": 0.15390625,
541
- "rewards/reflection_reward_pos": 0.003125,
542
  "step": 170
543
  },
544
  {
545
  "clip_ratio": 0.0,
546
- "completion_length": 597.6367235183716,
547
  "epoch": 2.9893390191897655,
548
- "kl": 0.2867584228515625,
549
- "reward": 0.63671875,
550
- "reward_std": 0.20992232125718147,
551
- "rewards/accuracy_reward": 0.4658203125,
552
- "rewards/format_reward": 0.16796875,
553
- "rewards/reflection_reward_pos": 0.0029296875,
554
  "step": 174,
555
  "total_flos": 0.0,
556
- "train_loss": 0.019510615981001962,
557
- "train_runtime": 40590.3738,
558
- "train_samples_per_second": 0.554,
559
  "train_steps_per_second": 0.004
560
  }
561
  ],
 
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
13
+ "completion_length": 648.7421875,
14
  "epoch": 0.017057569296375266,
15
+ "grad_norm": 0.06589560955762863,
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666665e-07,
18
+ "loss": 0.0062,
19
+ "reward": 0.703125,
20
+ "reward_std": 0.0883883461356163,
21
+ "rewards/accuracy_reward": 0.69921875,
22
+ "rewards/reflection_reward_pos": 0.00390625,
 
23
  "step": 1
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
+ "completion_length": 648.5087890625,
28
  "epoch": 0.08528784648187633,
29
+ "grad_norm": 0.06861326843500137,
30
+ "kl": 0.00011852383613586426,
31
  "learning_rate": 8.333333333333334e-07,
32
+ "loss": -0.0011,
33
+ "reward": 0.6640625,
34
+ "reward_std": 0.09115048055537045,
35
+ "rewards/accuracy_reward": 0.6640625,
36
+ "rewards/reflection_reward_pos": 0.0,
 
37
  "step": 5
38
  },
39
  {
40
  "clip_ratio": 0.0,
41
+ "completion_length": 638.33359375,
42
  "epoch": 0.17057569296375266,
43
+ "grad_norm": 0.06051831692457199,
44
+ "kl": 0.00015828609466552734,
45
  "learning_rate": 1.6666666666666669e-06,
46
+ "loss": 0.01,
47
+ "reward": 0.67734375,
48
+ "reward_std": 0.10275145107880235,
49
+ "rewards/accuracy_reward": 0.67734375,
50
+ "rewards/reflection_reward_pos": 0.0,
 
51
  "step": 10
52
  },
53
  {
54
  "clip_ratio": 0.0,
55
+ "completion_length": 622.8640625,
56
  "epoch": 0.255863539445629,
57
+ "grad_norm": 0.09521586447954178,
58
+ "kl": 0.0001518726348876953,
59
  "learning_rate": 2.5e-06,
60
+ "loss": 0.0024,
61
+ "reward": 0.71015625,
62
+ "reward_std": 0.12042912095785141,
63
+ "rewards/accuracy_reward": 0.709375,
64
+ "rewards/reflection_reward_pos": 0.00078125,
 
65
  "step": 15
66
  },
67
  {
68
  "clip_ratio": 0.0,
69
+ "completion_length": 607.86640625,
70
  "epoch": 0.3411513859275053,
71
+ "grad_norm": 0.08084629476070404,
72
+ "kl": 0.00022783279418945313,
73
  "learning_rate": 2.9987834972573546e-06,
74
+ "loss": 0.0036,
75
+ "reward": 0.70546875,
76
+ "reward_std": 0.12042912160977721,
77
+ "rewards/accuracy_reward": 0.70390625,
78
+ "rewards/reflection_reward_pos": 0.0015625,
 
79
  "step": 20
80
  },
81
  {
82
  "clip_ratio": 0.0,
83
+ "completion_length": 645.02421875,
84
  "epoch": 0.42643923240938164,
85
+ "grad_norm": 0.08875104784965515,
86
+ "kl": 0.00043997764587402346,
87
  "learning_rate": 2.9851204919417464e-06,
88
+ "loss": 0.0049,
89
+ "reward": 0.65859375,
90
+ "reward_std": 0.11379999481141567,
91
+ "rewards/accuracy_reward": 0.65859375,
92
+ "rewards/reflection_reward_pos": 0.0,
 
93
  "step": 25
94
  },
95
  {
96
  "clip_ratio": 0.0,
97
+ "completion_length": 670.86953125,
98
  "epoch": 0.511727078891258,
99
+ "grad_norm": 0.07565722614526749,
100
+ "kl": 0.0007645606994628907,
101
  "learning_rate": 2.956412726139078e-06,
102
+ "loss": 0.0067,
103
+ "reward": 0.67421875,
104
+ "reward_std": 0.10054174307733774,
105
+ "rewards/accuracy_reward": 0.67265625,
106
+ "rewards/reflection_reward_pos": 0.0015625,
 
107
  "step": 30
108
  },
109
  {
110
  "clip_ratio": 0.0,
111
+ "completion_length": 659.73203125,
112
  "epoch": 0.5970149253731343,
113
+ "grad_norm": 0.07428640872240067,
114
+ "kl": 0.0010577201843261718,
115
  "learning_rate": 2.9129510189868974e-06,
116
+ "loss": 0.0062,
117
+ "reward": 0.65,
118
+ "reward_std": 0.08838834529742598,
119
+ "rewards/accuracy_reward": 0.65,
120
+ "rewards/reflection_reward_pos": 0.0,
 
121
  "step": 35
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
+ "completion_length": 645.309375,
126
  "epoch": 0.6823027718550106,
127
+ "grad_norm": 0.08162426203489304,
128
+ "kl": 0.0015665054321289062,
129
  "learning_rate": 2.8551756519155732e-06,
130
+ "loss": 0.0098,
131
+ "reward": 0.6921875,
132
+ "reward_std": 0.10385630577802658,
133
+ "rewards/accuracy_reward": 0.6921875,
134
+ "rewards/reflection_reward_pos": 0.0,
 
135
  "step": 40
136
  },
137
  {
138
  "clip_ratio": 0.0,
139
+ "completion_length": 655.0703125,
140
  "epoch": 0.767590618336887,
141
+ "grad_norm": 0.08563440293073654,
142
+ "kl": 0.0018520355224609375,
143
  "learning_rate": 2.7836719084521715e-06,
144
+ "loss": 0.0064,
145
+ "reward": 0.65859375,
146
+ "reward_std": 0.12484853798523546,
147
+ "rewards/accuracy_reward": 0.65859375,
148
+ "rewards/reflection_reward_pos": 0.0,
 
149
  "step": 45
150
  },
151
  {
152
  "clip_ratio": 0.0,
153
+ "completion_length": 634.13515625,
154
  "epoch": 0.8528784648187633,
155
+ "grad_norm": 0.08154138922691345,
156
+ "kl": 0.0025421142578125,
157
  "learning_rate": 2.699164145105252e-06,
158
+ "loss": 0.0071,
159
+ "reward": 0.6671875,
160
+ "reward_std": 0.11490484932437539,
161
+ "rewards/accuracy_reward": 0.66484375,
162
+ "rewards/reflection_reward_pos": 0.00234375,
 
163
  "step": 50
164
  },
165
  {
166
  "clip_ratio": 0.0,
167
+ "completion_length": 648.434375,
168
  "epoch": 0.9381663113006397,
169
+ "grad_norm": 0.09137308597564697,
170
+ "kl": 0.0026458740234375,
171
  "learning_rate": 2.602508453394493e-06,
172
+ "loss": 0.0071,
173
+ "reward": 0.6796875,
174
+ "reward_std": 0.1303728088736534,
175
+ "rewards/accuracy_reward": 0.6796875,
176
+ "rewards/reflection_reward_pos": 0.0,
 
177
  "step": 55
178
  },
179
  {
180
  "clip_ratio": 0.0,
181
+ "completion_length": 622.5903686523437,
182
  "epoch": 1.0341151385927505,
183
+ "grad_norm": 0.10423822700977325,
184
+ "kl": 0.0027740478515625,
185
  "learning_rate": 2.4946839873611927e-06,
186
+ "loss": 0.0048,
187
+ "reward": 0.6671875,
188
+ "reward_std": 0.11932426644489169,
189
+ "rewards/accuracy_reward": 0.6671875,
190
+ "rewards/reflection_reward_pos": 0.0,
 
191
  "step": 60
192
  },
193
  {
194
  "clip_ratio": 0.0,
195
+ "completion_length": 608.39921875,
196
  "epoch": 1.1194029850746268,
197
+ "grad_norm": 0.10257695615291595,
198
+ "kl": 0.0032474517822265623,
199
  "learning_rate": 2.3767830444148337e-06,
200
+ "loss": 0.0086,
201
+ "reward": 0.68515625,
202
+ "reward_std": 0.10496116010472178,
203
+ "rewards/accuracy_reward": 0.68515625,
204
+ "rewards/reflection_reward_pos": 0.0,
 
205
  "step": 65
206
  },
207
  {
208
  "clip_ratio": 0.0,
209
+ "completion_length": 617.39140625,
210
  "epoch": 1.2046908315565032,
211
+ "grad_norm": 0.0765712708234787,
212
+ "kl": 0.0033966064453125,
213
  "learning_rate": 2.25e-06,
214
+ "loss": 0.005,
215
+ "reward": 0.68046875,
216
+ "reward_std": 0.10275145145133138,
217
+ "rewards/accuracy_reward": 0.68046875,
218
+ "rewards/reflection_reward_pos": 0.0,
 
219
  "step": 70
220
  },
221
  {
222
  "clip_ratio": 0.0,
223
+ "completion_length": 620.7875,
224
  "epoch": 1.2899786780383795,
225
+ "grad_norm": 0.1029704362154007,
226
+ "kl": 0.0029888153076171875,
227
  "learning_rate": 2.1156192081791355e-06,
228
+ "loss": 0.0023,
229
+ "reward": 0.6828125,
230
+ "reward_std": 0.12595339212566614,
231
+ "rewards/accuracy_reward": 0.68203125,
232
+ "rewards/reflection_reward_pos": 0.00078125,
 
233
  "step": 75
234
  },
235
  {
236
  "clip_ratio": 0.0,
237
+ "completion_length": 615.33828125,
238
  "epoch": 1.375266524520256,
239
+ "grad_norm": 0.08894416689872742,
240
+ "kl": 0.0030879974365234375,
241
  "learning_rate": 1.975001990702209e-06,
242
+ "loss": 0.0046,
243
+ "reward": 0.71484375,
244
+ "reward_std": 0.12042912067845464,
245
+ "rewards/accuracy_reward": 0.71484375,
246
+ "rewards/reflection_reward_pos": 0.0,
 
247
  "step": 80
248
  },
249
  {
250
  "clip_ratio": 0.0,
251
+ "completion_length": 629.1375,
252
  "epoch": 1.4605543710021323,
253
+ "grad_norm": 0.08637527376413345,
254
+ "kl": 0.0033039093017578126,
255
  "learning_rate": 1.829572846368326e-06,
256
+ "loss": 0.0055,
257
+ "reward": 0.690625,
258
+ "reward_std": 0.11490484941750764,
259
+ "rewards/accuracy_reward": 0.68984375,
 
260
  "rewards/reflection_reward_pos": 0.00078125,
261
  "step": 85
262
  },
263
  {
264
  "clip_ratio": 0.0,
265
+ "completion_length": 652.8375,
266
  "epoch": 1.5458422174840085,
267
+ "grad_norm": 0.0866626650094986,
268
+ "kl": 0.0031810760498046874,
269
  "learning_rate": 1.6808050203829845e-06,
270
+ "loss": 0.002,
271
+ "reward": 0.6484375,
272
+ "reward_std": 0.09501747125759721,
273
+ "rewards/accuracy_reward": 0.64765625,
274
+ "rewards/reflection_reward_pos": 0.00078125,
 
275
  "step": 90
276
  },
277
  {
278
  "clip_ratio": 0.0,
279
+ "completion_length": 646.7421875,
280
  "epoch": 1.6311300639658848,
281
+ "grad_norm": 0.09559585154056549,
282
+ "kl": 0.0030200958251953127,
283
  "learning_rate": 1.5302055798981605e-06,
284
+ "loss": 0.0044,
285
+ "reward": 0.65625,
286
+ "reward_std": 0.11048543220385909,
287
+ "rewards/accuracy_reward": 0.65625,
288
+ "rewards/reflection_reward_pos": 0.0,
 
289
  "step": 95
290
  },
291
  {
292
  "epoch": 1.716417910447761,
293
+ "grad_norm": 0.06905966252088547,
294
  "learning_rate": 1.3793001469249112e-06,
295
+ "loss": 0.0084,
296
  "step": 100
297
  },
298
  {
299
  "epoch": 1.716417910447761,
300
  "eval_clip_ratio": 0.0,
301
+ "eval_completion_length": 625.3352635782747,
302
+ "eval_kl": 0.0029512838052865417,
303
+ "eval_loss": 0.0050412570126354694,
304
+ "eval_reward": 0.6336861022364217,
305
+ "eval_reward_std": 0.12947621855872887,
306
+ "eval_rewards/accuracy_reward": 0.6328873801916933,
307
+ "eval_rewards/reflection_reward_pos": 0.0007987220447284345,
308
+ "eval_runtime": 6061.8369,
309
+ "eval_samples_per_second": 0.825,
310
+ "eval_steps_per_second": 0.026,
 
311
  "step": 100
312
  },
313
  {
314
  "clip_ratio": 0.0,
315
+ "completion_length": 643.7375,
316
  "epoch": 1.8017057569296375,
317
+ "grad_norm": 0.08424519002437592,
318
+ "kl": 0.0029918670654296873,
319
  "learning_rate": 1.2296174432791415e-06,
320
+ "loss": 0.0065,
321
+ "reward": 0.676171875,
322
+ "reward_std": 0.11103785866871477,
323
+ "rewards/accuracy_reward": 0.676171875,
324
+ "rewards/reflection_reward_pos": 0.0,
 
325
  "step": 105
326
  },
327
  {
328
  "clip_ratio": 0.0,
329
+ "completion_length": 644.7671875,
330
  "epoch": 1.886993603411514,
331
+ "grad_norm": 0.08362549543380737,
332
+ "kl": 0.00287628173828125,
333
  "learning_rate": 1.0826738041253211e-06,
334
+ "loss": 0.0102,
335
+ "reward": 0.6671875,
336
+ "reward_std": 0.12374368365854024,
337
+ "rewards/accuracy_reward": 0.6671875,
338
+ "rewards/reflection_reward_pos": 0.0,
 
339
  "step": 110
340
  },
341
  {
342
  "clip_ratio": 0.0,
343
+ "completion_length": 633.04375,
344
  "epoch": 1.9722814498933903,
345
+ "grad_norm": 0.0973709300160408,
346
+ "kl": 0.003045654296875,
347
  "learning_rate": 9.399578170010685e-07,
348
+ "loss": 0.0006,
349
+ "reward": 0.67265625,
350
+ "reward_std": 0.1182194116525352,
351
+ "rewards/accuracy_reward": 0.67265625,
352
+ "rewards/reflection_reward_pos": 0.0,
 
353
  "step": 115
354
  },
355
  {
356
  "clip_ratio": 0.0,
357
+ "completion_length": 651.9445343017578,
358
  "epoch": 2.068230277185501,
359
+ "grad_norm": 0.07164692878723145,
360
+ "kl": 0.0029529571533203126,
361
  "learning_rate": 8.029152419343472e-07,
362
+ "loss": 0.0074,
363
+ "reward": 0.67265625,
364
+ "reward_std": 0.10496116001158953,
365
+ "rewards/accuracy_reward": 0.67265625,
366
+ "rewards/reflection_reward_pos": 0.0,
 
367
  "step": 120
368
  },
369
  {
370
  "clip_ratio": 0.0,
371
+ "completion_length": 639.64375,
372
  "epoch": 2.1535181236673773,
373
+ "grad_norm": 0.07949467748403549,
374
+ "kl": 0.0030269622802734375,
375
  "learning_rate": 6.729343654174626e-07,
376
+ "loss": 0.0081,
377
+ "reward": 0.6953125,
378
+ "reward_std": 0.11490484857931733,
379
+ "rewards/accuracy_reward": 0.6953125,
380
+ "rewards/reflection_reward_pos": 0.0,
 
381
  "step": 125
382
  },
383
  {
384
  "clip_ratio": 0.0,
385
+ "completion_length": 656.5984375,
386
  "epoch": 2.2388059701492535,
387
+ "grad_norm": 0.08217156678438187,
388
+ "kl": 0.0028797149658203124,
389
  "learning_rate": 5.513319366069343e-07,
390
+ "loss": 0.0076,
391
+ "reward": 0.6890625,
392
+ "reward_std": 0.11711455835029483,
393
+ "rewards/accuracy_reward": 0.6890625,
394
+ "rewards/reflection_reward_pos": 0.0,
 
395
  "step": 130
396
  },
397
  {
398
  "clip_ratio": 0.0,
399
+ "completion_length": 657.54296875,
400
  "epoch": 2.3240938166311302,
401
+ "grad_norm": 0.08467955142259598,
402
+ "kl": 0.0028537750244140626,
403
  "learning_rate": 4.3933982822017883e-07,
404
+ "loss": 0.0046,
405
+ "reward": 0.67109375,
406
+ "reward_std": 0.10938057713210583,
407
+ "rewards/accuracy_reward": 0.6703125,
408
+ "rewards/reflection_reward_pos": 0.00078125,
 
409
  "step": 135
410
  },
411
  {
412
  "clip_ratio": 0.0,
413
+ "completion_length": 642.66484375,
414
  "epoch": 2.4093816631130065,
415
+ "grad_norm": 0.08187657594680786,
416
+ "kl": 0.0027801513671875,
417
  "learning_rate": 3.380925572585183e-07,
418
+ "loss": 0.008,
419
+ "reward": 0.6625,
420
+ "reward_std": 0.11711455713957548,
421
+ "rewards/accuracy_reward": 0.6625,
422
+ "rewards/reflection_reward_pos": 0.0,
 
423
  "step": 140
424
  },
425
  {
426
  "clip_ratio": 0.0,
427
+ "completion_length": 622.1828125,
428
  "epoch": 2.4946695095948828,
429
+ "grad_norm": 0.08926448225975037,
430
+ "kl": 0.00284576416015625,
431
  "learning_rate": 2.4861579197570804e-07,
432
+ "loss": 0.0096,
433
+ "reward": 0.678125,
434
+ "reward_std": 0.11711455807089806,
435
+ "rewards/accuracy_reward": 0.678125,
436
+ "rewards/reflection_reward_pos": 0.0,
 
437
  "step": 145
438
  },
439
  {
440
  "clip_ratio": 0.0,
441
+ "completion_length": 636.60546875,
442
  "epoch": 2.579957356076759,
443
+ "grad_norm": 0.09873559325933456,
444
+ "kl": 0.00290679931640625,
445
  "learning_rate": 1.718159615201853e-07,
446
+ "loss": 0.0088,
447
+ "reward": 0.66328125,
448
+ "reward_std": 0.12263882830739022,
449
+ "rewards/accuracy_reward": 0.6625,
450
+ "rewards/reflection_reward_pos": 0.00078125,
 
451
  "step": 150
452
  },
453
  {
454
  "clip_ratio": 0.0,
455
+ "completion_length": 641.1921875,
456
  "epoch": 2.6652452025586353,
457
+ "grad_norm": 0.08141663670539856,
458
+ "kl": 0.00279998779296875,
459
  "learning_rate": 1.0847107350878571e-07,
460
+ "loss": 0.0056,
461
+ "reward": 0.66875,
462
+ "reward_std": 0.11269514048472047,
463
+ "rewards/accuracy_reward": 0.66796875,
 
464
  "rewards/reflection_reward_pos": 0.00078125,
465
  "step": 155
466
  },
467
  {
468
  "clip_ratio": 0.0,
469
+ "completion_length": 635.07578125,
470
  "epoch": 2.750533049040512,
471
+ "grad_norm": 0.07290147989988327,
472
+ "kl": 0.002706146240234375,
473
  "learning_rate": 5.922283255294164e-08,
474
+ "loss": 0.0028,
475
+ "reward": 0.7171875,
476
+ "reward_std": 0.09722718009725213,
477
+ "rewards/accuracy_reward": 0.7171875,
478
+ "rewards/reflection_reward_pos": 0.0,
 
479
  "step": 160
480
  },
481
  {
482
  "clip_ratio": 0.0,
483
+ "completion_length": 633.3,
484
  "epoch": 2.835820895522388,
485
+ "grad_norm": 0.07411785423755646,
486
+ "kl": 0.0027303695678710938,
487
  "learning_rate": 2.4570139579284723e-08,
488
+ "loss": 0.0062,
489
+ "reward": 0.6953125,
490
+ "reward_std": 0.10385630559176207,
491
+ "rewards/accuracy_reward": 0.6953125,
492
+ "rewards/reflection_reward_pos": 0.0,
 
493
  "step": 165
494
  },
495
  {
496
  "clip_ratio": 0.0,
497
+ "completion_length": 639.90546875,
498
  "epoch": 2.9211087420042645,
499
+ "grad_norm": 0.09646094590425491,
500
+ "kl": 0.0026676177978515623,
501
  "learning_rate": 4.864037798685106e-09,
502
+ "loss": 0.0018,
503
+ "reward": 0.69296875,
504
+ "reward_std": 0.10717086931690574,
505
+ "rewards/accuracy_reward": 0.69296875,
506
+ "rewards/reflection_reward_pos": 0.0,
 
507
  "step": 170
508
  },
509
  {
510
  "clip_ratio": 0.0,
511
+ "completion_length": 644.8330116271973,
512
  "epoch": 2.9893390191897655,
513
+ "kl": 0.0027103424072265625,
514
+ "reward": 0.669921875,
515
+ "reward_std": 0.11877183895558119,
516
+ "rewards/accuracy_reward": 0.669921875,
517
+ "rewards/reflection_reward_pos": 0.0,
 
518
  "step": 174,
519
  "total_flos": 0.0,
520
+ "train_loss": 0.005782647762212089,
521
+ "train_runtime": 41644.7787,
522
+ "train_samples_per_second": 0.54,
523
  "train_steps_per_second": 0.004
524
  }
525
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:495f3baedf05f2b9cc79184cd9e8d5eedc0d5c8072d38bfd1f4d015f1254096b
3
- size 8184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4db7f6d6c128be56fa07ba7fc1b230400695276f9ea554cbfedcc1a931511184
3
+ size 7800