thuannh commited on
Commit
a8c6b42
·
verified ·
1 Parent(s): 3ceb462

Upload folder using huggingface_hub

Browse files
generation_config.json CHANGED
@@ -9,5 +9,6 @@
9
  "temperature": 0.6,
10
  "top_k": 20,
11
  "top_p": 0.95,
12
- "transformers_version": "4.55.0"
 
13
  }
 
9
  "temperature": 0.6,
10
  "top_k": 20,
11
  "top_p": 0.95,
12
+ "transformers_version": "4.55.0",
13
+ "presence_penalty": 1.5
14
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c51ce2cb6bf69f96633cce65d321f6e767bf121cecd033069e90a149c3f7a76
3
  size 3441185608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15024564b172a09f1deea887d9bb3fd42a3eb629435e2fb92c7d1a6698ef9fca
3
  size 3441185608
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.0019372336303758234,
6
  "eval_steps": 500,
7
- "global_step": 80,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -16,27 +16,27 @@
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 5021.0,
20
- "completions/max_terminated_length": 5021.0,
21
- "completions/mean_length": 3011.5625,
22
- "completions/mean_terminated_length": 3011.5625,
23
- "completions/min_length": 1324.0,
24
- "completions/min_terminated_length": 1324.0,
25
- "epoch": 2.4215420379697792e-05,
26
- "grad_norm": 0.0013862343573498733,
27
- "kl": 0.000823974609375,
28
  "learning_rate": 0.0,
29
- "loss": -0.0001,
30
- "num_tokens": 227146.0,
31
- "reward": 0.8549610376358032,
32
- "reward_std": 0.021876953542232513,
33
- "rewards/avg_thinking_length_func": 157.7991485595703,
34
- "rewards/correct_answer_reward_func": 0.75,
35
- "rewards/efficient_thinking_reward_func": 0.9122447304419112,
36
  "rewards/format_reward_func": 1.0,
37
- "rewards/num_xml_reward_func": 1.7957969903945923,
38
- "rewards/tool_execution_reward_func": 1.9947917461395264,
39
- "rewards/visit_tool_reward_func": 0.9724012017250061,
40
  "step": 1
41
  },
42
  {
@@ -45,11 +45,11 @@
45
  "clip_ratio/low_mean": 0.0,
46
  "clip_ratio/low_min": 0.0,
47
  "clip_ratio/region_mean": 0.0,
48
- "epoch": 4.8430840759395585e-05,
49
- "grad_norm": 0.0013858304532204059,
50
- "kl": 0.000823974609375,
51
  "learning_rate": 6.25e-08,
52
- "loss": -0.0001,
53
  "step": 2
54
  },
55
  {
@@ -58,11 +58,11 @@
58
  "clip_ratio/low_mean": 0.0,
59
  "clip_ratio/low_min": 0.0,
60
  "clip_ratio/region_mean": 0.0,
61
- "epoch": 7.264626113909338e-05,
62
- "grad_norm": 0.0013865629989697915,
63
- "kl": 0.000820159912109375,
64
  "learning_rate": 1.25e-07,
65
- "loss": -0.0001,
66
  "step": 3
67
  },
68
  {
@@ -71,11 +71,11 @@
71
  "clip_ratio/low_mean": 0.0,
72
  "clip_ratio/low_min": 0.0,
73
  "clip_ratio/region_mean": 0.0,
74
- "epoch": 9.686168151879117e-05,
75
- "grad_norm": 0.0013802536996446572,
76
- "kl": 0.0008420944213867188,
77
  "learning_rate": 1.875e-07,
78
- "loss": -0.0001,
79
  "step": 4
80
  },
81
  {
@@ -85,27 +85,27 @@
85
  "clip_ratio/low_min": 0.0,
86
  "clip_ratio/region_mean": 0.0,
87
  "completions/clipped_ratio": 0.0,
88
- "completions/max_length": 8221.0,
89
- "completions/max_terminated_length": 8221.0,
90
- "completions/mean_length": 5079.5625,
91
- "completions/mean_terminated_length": 5079.5625,
92
- "completions/min_length": 2306.0,
93
- "completions/min_terminated_length": 2306.0,
94
- "epoch": 0.00012107710189848896,
95
- "grad_norm": 0.03869369457145168,
96
- "kl": 0.0009288787841796875,
97
  "learning_rate": 2.5e-07,
98
- "loss": 0.0052,
99
- "num_tokens": 580872.0,
100
- "reward": 0.30476048588752747,
101
- "reward_std": 0.4756142497062683,
102
- "rewards/avg_thinking_length_func": 195.94981384277344,
103
- "rewards/correct_answer_reward_func": 0.3125,
104
- "rewards/efficient_thinking_reward_func": 0.8946171495693498,
105
  "rewards/format_reward_func": 1.0,
106
- "rewards/num_xml_reward_func": 1.3311374187469482,
107
- "rewards/tool_execution_reward_func": 2.0,
108
- "rewards/visit_tool_reward_func": 1.0090813636779785,
109
  "step": 5
110
  },
111
  {
@@ -114,11 +114,11 @@
114
  "clip_ratio/low_mean": 0.0,
115
  "clip_ratio/low_min": 0.0,
116
  "clip_ratio/region_mean": 0.0,
117
- "epoch": 0.00014529252227818675,
118
- "grad_norm": 0.03816851177241048,
119
- "kl": 0.0009202957153320312,
120
  "learning_rate": 3.1249999999999997e-07,
121
- "loss": 0.0052,
122
  "step": 6
123
  },
124
  {
@@ -127,11 +127,11 @@
127
  "clip_ratio/low_mean": 0.0,
128
  "clip_ratio/low_min": 0.0,
129
  "clip_ratio/region_mean": 0.0,
130
- "epoch": 0.00016950794265788453,
131
- "grad_norm": 0.03841593140613752,
132
- "kl": 0.0009307861328125,
133
  "learning_rate": 3.75e-07,
134
- "loss": 0.0052,
135
  "step": 7
136
  },
137
  {
@@ -140,11 +140,11 @@
140
  "clip_ratio/low_mean": 0.0,
141
  "clip_ratio/low_min": 0.0,
142
  "clip_ratio/region_mean": 0.0,
143
- "epoch": 0.00019372336303758234,
144
- "grad_norm": 0.03873458948340372,
145
- "kl": 0.000919342041015625,
146
  "learning_rate": 4.375e-07,
147
- "loss": 0.0052,
148
  "step": 8
149
  },
150
  {
@@ -154,27 +154,27 @@
154
  "clip_ratio/low_min": 0.0,
155
  "clip_ratio/region_mean": 0.0,
156
  "completions/clipped_ratio": 0.0,
157
- "completions/max_length": 8205.0,
158
- "completions/max_terminated_length": 8205.0,
159
- "completions/mean_length": 4930.875,
160
- "completions/mean_terminated_length": 4930.875,
161
- "completions/min_length": 2085.0,
162
- "completions/min_terminated_length": 2085.0,
163
- "epoch": 0.00021793878341728012,
164
- "grad_norm": 0.027339756517219562,
165
- "kl": 0.0009183883666992188,
166
  "learning_rate": 5e-07,
167
- "loss": 0.0039,
168
- "num_tokens": 926931.0,
169
- "reward": 0.41861122846603394,
170
- "reward_std": 0.399558424949646,
171
- "rewards/avg_thinking_length_func": 189.7052764892578,
172
- "rewards/correct_answer_reward_func": 0.40625,
173
- "rewards/efficient_thinking_reward_func": 0.9012974528882096,
174
  "rewards/format_reward_func": 1.0,
175
- "rewards/num_xml_reward_func": 1.572190284729004,
176
- "rewards/tool_execution_reward_func": 2.0,
177
- "rewards/visit_tool_reward_func": 1.0377196073532104,
178
  "step": 9
179
  },
180
  {
@@ -183,11 +183,11 @@
183
  "clip_ratio/low_mean": 0.0,
184
  "clip_ratio/low_min": 0.0,
185
  "clip_ratio/region_mean": 0.0,
186
- "epoch": 0.00024215420379697792,
187
- "grad_norm": 0.027389009217824802,
188
- "kl": 0.0009317398071289062,
189
  "learning_rate": 5.625e-07,
190
- "loss": 0.0039,
191
  "step": 10
192
  },
193
  {
@@ -196,11 +196,11 @@
196
  "clip_ratio/low_mean": 0.0,
197
  "clip_ratio/low_min": 0.0,
198
  "clip_ratio/region_mean": 0.0,
199
- "epoch": 0.0002663696241766757,
200
- "grad_norm": 0.027404975057647246,
201
- "kl": 0.0009403228759765625,
202
  "learning_rate": 6.249999999999999e-07,
203
- "loss": 0.0039,
204
  "step": 11
205
  },
206
  {
@@ -209,11 +209,11 @@
209
  "clip_ratio/low_mean": 0.0,
210
  "clip_ratio/low_min": 0.0,
211
  "clip_ratio/region_mean": 0.0,
212
- "epoch": 0.0002905850445563735,
213
- "grad_norm": 0.027663414412920105,
214
- "kl": 0.0010051727294921875,
215
  "learning_rate": 6.875e-07,
216
- "loss": 0.0039,
217
  "step": 12
218
  },
219
  {
@@ -223,27 +223,27 @@
223
  "clip_ratio/low_min": 0.0,
224
  "clip_ratio/region_mean": 0.0,
225
  "completions/clipped_ratio": 0.0,
226
- "completions/max_length": 8889.0,
227
- "completions/max_terminated_length": 8889.0,
228
- "completions/mean_length": 5363.5,
229
- "completions/mean_terminated_length": 5363.5,
230
- "completions/min_length": 1743.0,
231
- "completions/min_terminated_length": 1743.0,
232
- "epoch": 0.0003148004649360713,
233
- "grad_norm": 0.0407893748945086,
234
- "kl": 0.0009937286376953125,
235
  "learning_rate": 7.5e-07,
236
- "loss": -0.0002,
237
- "num_tokens": 1294834.0,
238
- "reward": 0.3248969614505768,
239
- "reward_std": 0.6025969982147217,
240
- "rewards/avg_thinking_length_func": 197.81080627441406,
241
- "rewards/correct_answer_reward_func": 0.375,
242
- "rewards/efficient_thinking_reward_func": 0.8764530262818557,
243
  "rewards/format_reward_func": 1.0,
244
- "rewards/num_xml_reward_func": 1.5131020545959473,
245
- "rewards/tool_execution_reward_func": 1.9973957538604736,
246
- "rewards/visit_tool_reward_func": 1.029296636581421,
247
  "step": 13
248
  },
249
  {
@@ -252,11 +252,11 @@
252
  "clip_ratio/low_mean": 0.0,
253
  "clip_ratio/low_min": 0.0,
254
  "clip_ratio/region_mean": 0.0,
255
- "epoch": 0.00033901588531576906,
256
- "grad_norm": 0.04111370827114344,
257
- "kl": 0.0009851455688476562,
258
  "learning_rate": 8.125e-07,
259
- "loss": -0.0002,
260
  "step": 14
261
  },
262
  {
@@ -265,11 +265,11 @@
265
  "clip_ratio/low_mean": 0.0,
266
  "clip_ratio/low_min": 0.0,
267
  "clip_ratio/region_mean": 0.0,
268
- "epoch": 0.00036323130569546687,
269
- "grad_norm": 0.040997833074323406,
270
- "kl": 0.0010662078857421875,
271
  "learning_rate": 8.75e-07,
272
- "loss": -0.0002,
273
  "step": 15
274
  },
275
  {
@@ -278,11 +278,11 @@
278
  "clip_ratio/low_mean": 0.0,
279
  "clip_ratio/low_min": 0.0,
280
  "clip_ratio/region_mean": 0.0,
281
- "epoch": 0.0003874467260751647,
282
- "grad_norm": 0.04130516332775726,
283
- "kl": 0.0011501312255859375,
284
  "learning_rate": 9.374999999999999e-07,
285
- "loss": -0.0002,
286
  "step": 16
287
  },
288
  {
@@ -292,27 +292,27 @@
292
  "clip_ratio/low_min": 0.0,
293
  "clip_ratio/region_mean": 0.0,
294
  "completions/clipped_ratio": 0.0,
295
- "completions/max_length": 8681.0,
296
- "completions/max_terminated_length": 8681.0,
297
- "completions/mean_length": 5228.8125,
298
- "completions/mean_terminated_length": 5228.8125,
299
- "completions/min_length": 2233.0,
300
- "completions/min_terminated_length": 2233.0,
301
- "epoch": 0.0004116621464548625,
302
- "grad_norm": 0.03978610475757746,
303
- "kl": 0.001308441162109375,
304
  "learning_rate": 1e-06,
305
- "loss": 0.006,
306
- "num_tokens": 1648672.0,
307
- "reward": 0.34718748927116394,
308
- "reward_std": 0.5881420969963074,
309
- "rewards/avg_thinking_length_func": 195.60931396484375,
310
- "rewards/correct_answer_reward_func": 0.34375,
311
- "rewards/efficient_thinking_reward_func": 0.8885279571058924,
312
  "rewards/format_reward_func": 1.0,
313
- "rewards/num_xml_reward_func": 1.63374924659729,
314
- "rewards/tool_execution_reward_func": 2.0,
315
- "rewards/visit_tool_reward_func": 1.035430908203125,
316
  "step": 17
317
  },
318
  {
@@ -321,11 +321,11 @@
321
  "clip_ratio/low_mean": 0.0,
322
  "clip_ratio/low_min": 0.0,
323
  "clip_ratio/region_mean": 0.0,
324
- "epoch": 0.00043587756683456023,
325
- "grad_norm": 0.04006457357588843,
326
- "kl": 0.001514434814453125,
327
  "learning_rate": 1.0625e-06,
328
- "loss": 0.006,
329
  "step": 18
330
  },
331
  {
@@ -334,11 +334,11 @@
334
  "clip_ratio/low_mean": 0.0,
335
  "clip_ratio/low_min": 0.0,
336
  "clip_ratio/region_mean": 0.0,
337
- "epoch": 0.00046009298721425804,
338
- "grad_norm": 0.04053645195569142,
339
- "kl": 0.0020732879638671875,
340
  "learning_rate": 1.125e-06,
341
- "loss": 0.006,
342
  "step": 19
343
  },
344
  {
@@ -347,11 +347,11 @@
347
  "clip_ratio/low_mean": 0.0,
348
  "clip_ratio/low_min": 0.0,
349
  "clip_ratio/region_mean": 0.0,
350
- "epoch": 0.00048430840759395585,
351
- "grad_norm": 0.040941267103466746,
352
- "kl": 0.002826690673828125,
353
  "learning_rate": 1.1874999999999999e-06,
354
- "loss": 0.006,
355
  "step": 20
356
  },
357
  {
@@ -361,27 +361,27 @@
361
  "clip_ratio/low_min": 0.0,
362
  "clip_ratio/region_mean": 0.0,
363
  "completions/clipped_ratio": 0.0,
364
- "completions/max_length": 8595.0,
365
- "completions/max_terminated_length": 8595.0,
366
- "completions/mean_length": 5146.78125,
367
- "completions/mean_terminated_length": 5146.78125,
368
- "completions/min_length": 2093.0,
369
- "completions/min_terminated_length": 2093.0,
370
- "epoch": 0.0005085238279736536,
371
- "grad_norm": 0.034201382636368564,
372
- "kl": 0.00348663330078125,
373
  "learning_rate": 1.2499999999999999e-06,
374
- "loss": 0.0012,
375
- "num_tokens": 2015057.0,
376
- "reward": 0.3354427218437195,
377
- "reward_std": 0.5395293831825256,
378
- "rewards/avg_thinking_length_func": 181.95034790039062,
379
- "rewards/correct_answer_reward_func": 0.4375,
380
- "rewards/efficient_thinking_reward_func": 0.8707138202597767,
381
- "rewards/format_reward_func": 0.9997023940086365,
382
- "rewards/num_xml_reward_func": 1.4494500160217285,
383
- "rewards/tool_execution_reward_func": 2.0,
384
- "rewards/visit_tool_reward_func": 1.0431455373764038,
385
  "step": 21
386
  },
387
  {
@@ -390,11 +390,11 @@
390
  "clip_ratio/low_mean": 0.0,
391
  "clip_ratio/low_min": 0.0,
392
  "clip_ratio/region_mean": 0.0,
393
- "epoch": 0.0005327392483533514,
394
- "grad_norm": 0.03384948845768777,
395
- "kl": 0.00420379638671875,
396
  "learning_rate": 1.3125e-06,
397
- "loss": 0.0012,
398
  "step": 22
399
  },
400
  {
@@ -403,11 +403,11 @@
403
  "clip_ratio/low_mean": 0.0,
404
  "clip_ratio/low_min": 0.0,
405
  "clip_ratio/region_mean": 0.0,
406
- "epoch": 0.0005569546687330492,
407
- "grad_norm": 0.03413362282042912,
408
- "kl": 0.0050048828125,
409
  "learning_rate": 1.375e-06,
410
- "loss": 0.0012,
411
  "step": 23
412
  },
413
  {
@@ -416,11 +416,11 @@
416
  "clip_ratio/low_mean": 0.0,
417
  "clip_ratio/low_min": 0.0,
418
  "clip_ratio/region_mean": 0.0,
419
- "epoch": 0.000581170089112747,
420
- "grad_norm": 0.03410120905325483,
421
- "kl": 0.0059661865234375,
422
  "learning_rate": 1.4375e-06,
423
- "loss": 0.0012,
424
  "step": 24
425
  },
426
  {
@@ -430,27 +430,27 @@
430
  "clip_ratio/low_min": 0.0,
431
  "clip_ratio/region_mean": 0.0,
432
  "completions/clipped_ratio": 0.0,
433
- "completions/max_length": 8285.0,
434
- "completions/max_terminated_length": 8285.0,
435
- "completions/mean_length": 5087.59375,
436
- "completions/mean_terminated_length": 5087.59375,
437
- "completions/min_length": 2272.0,
438
- "completions/min_terminated_length": 2272.0,
439
- "epoch": 0.0006053855094924448,
440
- "grad_norm": 0.044445551609988844,
441
- "kl": 0.00791168212890625,
442
  "learning_rate": 1.5e-06,
443
- "loss": 0.0078,
444
- "num_tokens": 2383540.0,
445
- "reward": 0.3267378509044647,
446
- "reward_std": 0.4416780471801758,
447
- "rewards/avg_thinking_length_func": 174.10765075683594,
448
- "rewards/correct_answer_reward_func": 0.28125,
449
- "rewards/efficient_thinking_reward_func": 0.9263086220559518,
450
  "rewards/format_reward_func": 1.0,
451
- "rewards/num_xml_reward_func": 1.755671501159668,
452
- "rewards/tool_execution_reward_func": 1.9970238208770752,
453
- "rewards/visit_tool_reward_func": 1.0175025463104248,
454
  "step": 25
455
  },
456
  {
@@ -459,11 +459,11 @@
459
  "clip_ratio/low_mean": 0.0,
460
  "clip_ratio/low_min": 0.0,
461
  "clip_ratio/region_mean": 0.0,
462
- "epoch": 0.0006296009298721426,
463
- "grad_norm": 0.04825803348555062,
464
- "kl": 0.012359619140625,
465
  "learning_rate": 1.5624999999999999e-06,
466
- "loss": 0.0078,
467
  "step": 26
468
  },
469
  {
@@ -472,11 +472,11 @@
472
  "clip_ratio/low_mean": 0.0,
473
  "clip_ratio/low_min": 0.0,
474
  "clip_ratio/region_mean": 0.0,
475
- "epoch": 0.0006538163502518403,
476
- "grad_norm": 0.05116686338837774,
477
- "kl": 0.021484375,
478
  "learning_rate": 1.625e-06,
479
- "loss": 0.0078,
480
  "step": 27
481
  },
482
  {
@@ -485,11 +485,11 @@
485
  "clip_ratio/low_mean": 0.0,
486
  "clip_ratio/low_min": 0.0,
487
  "clip_ratio/region_mean": 0.0,
488
- "epoch": 0.0006780317706315381,
489
- "grad_norm": 0.05316346328608609,
490
- "kl": 0.03985595703125,
491
  "learning_rate": 1.6875e-06,
492
- "loss": 0.0078,
493
  "step": 28
494
  },
495
  {
@@ -499,27 +499,27 @@
499
  "clip_ratio/low_min": 0.0,
500
  "clip_ratio/region_mean": 0.0,
501
  "completions/clipped_ratio": 0.0,
502
- "completions/max_length": 7189.0,
503
- "completions/max_terminated_length": 7189.0,
504
- "completions/mean_length": 4996.46875,
505
- "completions/mean_terminated_length": 4996.46875,
506
- "completions/min_length": 2314.0,
507
- "completions/min_terminated_length": 2314.0,
508
- "epoch": 0.0007022471910112359,
509
- "grad_norm": 0.04891502789141398,
510
- "kl": 0.04736328125,
511
  "learning_rate": 1.75e-06,
512
- "loss": 0.0067,
513
- "num_tokens": 2761999.0,
514
- "reward": 0.41611552238464355,
515
- "reward_std": 0.456102192401886,
516
- "rewards/avg_thinking_length_func": 165.63626098632812,
517
- "rewards/correct_answer_reward_func": 0.40625,
518
- "rewards/efficient_thinking_reward_func": 0.9453047237018808,
519
- "rewards/format_reward_func": 1.0,
520
- "rewards/num_xml_reward_func": 1.5178555250167847,
521
- "rewards/tool_execution_reward_func": 1.9925000667572021,
522
- "rewards/visit_tool_reward_func": 1.0254861116409302,
523
  "step": 29
524
  },
525
  {
@@ -528,11 +528,11 @@
528
  "clip_ratio/low_mean": 0.0,
529
  "clip_ratio/low_min": 0.0,
530
  "clip_ratio/region_mean": 0.0,
531
- "epoch": 0.0007264626113909337,
532
- "grad_norm": 0.05075936350489512,
533
- "kl": 0.07891845703125,
534
  "learning_rate": 1.8125e-06,
535
- "loss": 0.0067,
536
  "step": 30
537
  },
538
  {
@@ -541,11 +541,11 @@
541
  "clip_ratio/low_mean": 0.0,
542
  "clip_ratio/low_min": 0.0,
543
  "clip_ratio/region_mean": 0.0,
544
- "epoch": 0.0007506780317706315,
545
- "grad_norm": 0.05378691343356215,
546
- "kl": 0.1292724609375,
547
  "learning_rate": 1.8749999999999998e-06,
548
- "loss": 0.0068,
549
  "step": 31
550
  },
551
  {
@@ -554,11 +554,11 @@
554
  "clip_ratio/low_mean": 0.0,
555
  "clip_ratio/low_min": 0.0,
556
  "clip_ratio/region_mean": 0.0,
557
- "epoch": 0.0007748934521503294,
558
- "grad_norm": 0.06810533074784274,
559
- "kl": 0.19677734375,
560
  "learning_rate": 1.9375e-06,
561
- "loss": 0.0069,
562
  "step": 32
563
  },
564
  {
@@ -568,27 +568,27 @@
568
  "clip_ratio/low_min": 0.0,
569
  "clip_ratio/region_mean": 0.0,
570
  "completions/clipped_ratio": 0.0,
571
- "completions/max_length": 6880.0,
572
- "completions/max_terminated_length": 6880.0,
573
- "completions/mean_length": 5091.3125,
574
- "completions/mean_terminated_length": 5091.3125,
575
- "completions/min_length": 2317.0,
576
- "completions/min_terminated_length": 2317.0,
577
- "epoch": 0.0007991088725300272,
578
- "grad_norm": 0.1762564569620267,
579
- "kl": 0.204345703125,
580
  "learning_rate": 2e-06,
581
- "loss": 0.0021,
582
- "num_tokens": 3193245.0,
583
- "reward": 0.606549859046936,
584
- "reward_std": 0.42073318362236023,
585
- "rewards/avg_thinking_length_func": 136.30906677246094,
586
- "rewards/correct_answer_reward_func": 0.5625,
587
- "rewards/efficient_thinking_reward_func": 0.9376242936192475,
588
  "rewards/format_reward_func": 1.0,
589
- "rewards/num_xml_reward_func": 1.400514841079712,
590
- "rewards/tool_execution_reward_func": 2.0,
591
- "rewards/visit_tool_reward_func": 1.0149691104888916,
592
  "step": 33
593
  },
594
  {
@@ -597,11 +597,11 @@
597
  "clip_ratio/low_mean": 0.0,
598
  "clip_ratio/low_min": 0.0,
599
  "clip_ratio/region_mean": 0.0,
600
- "epoch": 0.000823324292909725,
601
- "grad_norm": 0.09601002970053912,
602
- "kl": 0.177001953125,
603
  "learning_rate": 2e-06,
604
- "loss": 0.0021,
605
  "step": 34
606
  },
607
  {
@@ -610,11 +610,11 @@
610
  "clip_ratio/low_mean": 0.0,
611
  "clip_ratio/low_min": 0.0,
612
  "clip_ratio/region_mean": 0.0,
613
- "epoch": 0.0008475397132894227,
614
- "grad_norm": 0.059153354466426514,
615
- "kl": 0.1494140625,
616
  "learning_rate": 2e-06,
617
- "loss": 0.002,
618
  "step": 35
619
  },
620
  {
@@ -623,11 +623,11 @@
623
  "clip_ratio/low_mean": 0.0,
624
  "clip_ratio/low_min": 0.0,
625
  "clip_ratio/region_mean": 0.0,
626
- "epoch": 0.0008717551336691205,
627
- "grad_norm": 0.04692926604339145,
628
- "kl": 0.134033203125,
629
  "learning_rate": 2e-06,
630
- "loss": 0.002,
631
  "step": 36
632
  },
633
  {
@@ -637,27 +637,27 @@
637
  "clip_ratio/low_min": 0.0,
638
  "clip_ratio/region_mean": 0.0,
639
  "completions/clipped_ratio": 0.0,
640
- "completions/max_length": 6702.0,
641
- "completions/max_terminated_length": 6702.0,
642
- "completions/mean_length": 4464.4375,
643
- "completions/mean_terminated_length": 4464.4375,
644
- "completions/min_length": 2517.0,
645
- "completions/min_terminated_length": 2517.0,
646
- "epoch": 0.0008959705540488183,
647
- "grad_norm": 0.04156023447496563,
648
- "kl": 0.089111328125,
649
  "learning_rate": 2e-06,
650
- "loss": 0.0009,
651
- "num_tokens": 3614725.0,
652
- "reward": 0.5230777859687805,
653
- "reward_std": 0.4564175009727478,
654
- "rewards/avg_thinking_length_func": 113.61294555664062,
655
- "rewards/correct_answer_reward_func": 0.5,
656
- "rewards/efficient_thinking_reward_func": 0.8956072636293169,
657
- "rewards/format_reward_func": 1.0,
658
- "rewards/num_xml_reward_func": 1.4630262851715088,
659
- "rewards/tool_execution_reward_func": 1.9854960441589355,
660
- "rewards/visit_tool_reward_func": 0.9511741399765015,
661
  "step": 37
662
  },
663
  {
@@ -666,11 +666,11 @@
666
  "clip_ratio/low_mean": 0.0,
667
  "clip_ratio/low_min": 0.0,
668
  "clip_ratio/region_mean": 0.0,
669
- "epoch": 0.0009201859744285161,
670
- "grad_norm": 0.030847266935293525,
671
- "kl": 0.0772705078125,
672
  "learning_rate": 2e-06,
673
- "loss": 0.0009,
674
  "step": 38
675
  },
676
  {
@@ -679,11 +679,11 @@
679
  "clip_ratio/low_mean": 0.0,
680
  "clip_ratio/low_min": 0.0,
681
  "clip_ratio/region_mean": 0.0,
682
- "epoch": 0.0009444013948082139,
683
- "grad_norm": 0.029857330293238966,
684
- "kl": 0.0762939453125,
685
  "learning_rate": 2e-06,
686
- "loss": 0.0009,
687
  "step": 39
688
  },
689
  {
@@ -692,11 +692,11 @@
692
  "clip_ratio/low_mean": 0.0,
693
  "clip_ratio/low_min": 0.0,
694
  "clip_ratio/region_mean": 0.0,
695
- "epoch": 0.0009686168151879117,
696
- "grad_norm": 0.029616934835158485,
697
- "kl": 0.0780029296875,
698
  "learning_rate": 2e-06,
699
- "loss": 0.0009,
700
  "step": 40
701
  },
702
  {
@@ -706,27 +706,27 @@
706
  "clip_ratio/low_min": 0.0,
707
  "clip_ratio/region_mean": 0.0,
708
  "completions/clipped_ratio": 0.0,
709
- "completions/max_length": 5952.0,
710
- "completions/max_terminated_length": 5952.0,
711
- "completions/mean_length": 4219.25,
712
- "completions/mean_terminated_length": 4219.25,
713
- "completions/min_length": 2282.0,
714
- "completions/min_terminated_length": 2282.0,
715
- "epoch": 0.0009928322355676094,
716
- "grad_norm": 0.027887879707077786,
717
- "kl": 0.0771484375,
718
  "learning_rate": 2e-06,
719
- "loss": 0.0014,
720
- "num_tokens": 4009367.0,
721
- "reward": 0.6270047426223755,
722
- "reward_std": 0.432157427072525,
723
- "rewards/avg_thinking_length_func": 115.17754364013672,
724
- "rewards/correct_answer_reward_func": 0.59375,
725
- "rewards/efficient_thinking_reward_func": 0.9136282100129471,
726
  "rewards/format_reward_func": 1.0,
727
- "rewards/num_xml_reward_func": 1.6435627937316895,
728
- "rewards/tool_execution_reward_func": 1.9891107082366943,
729
- "rewards/visit_tool_reward_func": 0.9315186738967896,
730
  "step": 41
731
  },
732
  {
@@ -735,11 +735,11 @@
735
  "clip_ratio/low_mean": 0.0,
736
  "clip_ratio/low_min": 0.0,
737
  "clip_ratio/region_mean": 0.0,
738
- "epoch": 0.0010170476559473072,
739
- "grad_norm": 0.02835635253506866,
740
- "kl": 0.0794677734375,
741
  "learning_rate": 2e-06,
742
- "loss": 0.0014,
743
  "step": 42
744
  },
745
  {
@@ -748,11 +748,11 @@
748
  "clip_ratio/low_mean": 0.0,
749
  "clip_ratio/low_min": 0.0,
750
  "clip_ratio/region_mean": 0.0,
751
- "epoch": 0.001041263076327005,
752
- "grad_norm": 0.028940878534019835,
753
- "kl": 0.083740234375,
754
  "learning_rate": 2e-06,
755
- "loss": 0.0014,
756
  "step": 43
757
  },
758
  {
@@ -761,11 +761,11 @@
761
  "clip_ratio/low_mean": 0.0,
762
  "clip_ratio/low_min": 0.0,
763
  "clip_ratio/region_mean": 0.0,
764
- "epoch": 0.0010654784967067028,
765
- "grad_norm": 0.029849752671864262,
766
- "kl": 0.08837890625,
767
  "learning_rate": 2e-06,
768
- "loss": 0.0014,
769
  "step": 44
770
  },
771
  {
@@ -775,27 +775,27 @@
775
  "clip_ratio/low_min": 0.0,
776
  "clip_ratio/region_mean": 0.0,
777
  "completions/clipped_ratio": 0.0,
778
- "completions/max_length": 6019.0,
779
- "completions/max_terminated_length": 6019.0,
780
- "completions/mean_length": 4090.75,
781
- "completions/mean_terminated_length": 4090.75,
782
- "completions/min_length": 2240.0,
783
- "completions/min_terminated_length": 2240.0,
784
- "epoch": 0.0010896939170864006,
785
- "grad_norm": 0.02660440382201049,
786
- "kl": 0.09228515625,
787
  "learning_rate": 2e-06,
788
- "loss": -0.0002,
789
- "num_tokens": 4366159.0,
790
- "reward": 0.6755548715591431,
791
- "reward_std": 0.47086238861083984,
792
- "rewards/avg_thinking_length_func": 125.40975189208984,
793
- "rewards/correct_answer_reward_func": 0.6875,
794
- "rewards/efficient_thinking_reward_func": 0.9314815674427694,
795
- "rewards/format_reward_func": 1.0,
796
- "rewards/num_xml_reward_func": 1.7004401683807373,
797
- "rewards/tool_execution_reward_func": 2.0,
798
- "rewards/visit_tool_reward_func": 0.9642323851585388,
799
  "step": 45
800
  },
801
  {
@@ -804,11 +804,11 @@
804
  "clip_ratio/low_mean": 0.0,
805
  "clip_ratio/low_min": 0.0,
806
  "clip_ratio/region_mean": 0.0,
807
- "epoch": 0.0011139093374660984,
808
- "grad_norm": 0.026841083756153995,
809
- "kl": 0.09619140625,
810
  "learning_rate": 2e-06,
811
- "loss": -0.0002,
812
  "step": 46
813
  },
814
  {
@@ -817,11 +817,11 @@
817
  "clip_ratio/low_mean": 0.0,
818
  "clip_ratio/low_min": 0.0,
819
  "clip_ratio/region_mean": 0.0,
820
- "epoch": 0.0011381247578457962,
821
- "grad_norm": 0.026782109601823232,
822
- "kl": 0.0999755859375,
823
  "learning_rate": 2e-06,
824
- "loss": -0.0002,
825
  "step": 47
826
  },
827
  {
@@ -830,11 +830,11 @@
830
  "clip_ratio/low_mean": 0.0,
831
  "clip_ratio/low_min": 0.0,
832
  "clip_ratio/region_mean": 0.0,
833
- "epoch": 0.001162340178225494,
834
- "grad_norm": 0.02659970469430567,
835
- "kl": 0.103515625,
836
  "learning_rate": 2e-06,
837
- "loss": -0.0002,
838
  "step": 48
839
  },
840
  {
@@ -844,27 +844,27 @@
844
  "clip_ratio/low_min": 0.0,
845
  "clip_ratio/region_mean": 0.0,
846
  "completions/clipped_ratio": 0.0,
847
- "completions/max_length": 5239.0,
848
- "completions/max_terminated_length": 5239.0,
849
- "completions/mean_length": 3787.1875,
850
- "completions/mean_terminated_length": 3787.1875,
851
- "completions/min_length": 2194.0,
852
- "completions/min_terminated_length": 2194.0,
853
- "epoch": 0.0011865555986051918,
854
- "grad_norm": 0.02076190210123295,
855
- "kl": 0.0938720703125,
856
  "learning_rate": 2e-06,
857
- "loss": 0.0001,
858
- "num_tokens": 4663125.0,
859
- "reward": 0.7249662280082703,
860
- "reward_std": 0.16501030325889587,
861
- "rewards/avg_thinking_length_func": 140.40789794921875,
862
- "rewards/correct_answer_reward_func": 0.625,
863
- "rewards/efficient_thinking_reward_func": 0.9519768288322725,
864
  "rewards/format_reward_func": 1.0,
865
- "rewards/num_xml_reward_func": 1.8708701133728027,
866
- "rewards/tool_execution_reward_func": 2.0,
867
- "rewards/visit_tool_reward_func": 0.9880074858665466,
868
  "step": 49
869
  },
870
  {
@@ -873,11 +873,11 @@
873
  "clip_ratio/low_mean": 0.0,
874
  "clip_ratio/low_min": 0.0,
875
  "clip_ratio/region_mean": 0.0,
876
- "epoch": 0.0012107710189848896,
877
- "grad_norm": 0.020723077749825757,
878
- "kl": 0.0958251953125,
879
  "learning_rate": 2e-06,
880
- "loss": 0.0001,
881
  "step": 50
882
  },
883
  {
@@ -886,11 +886,11 @@
886
  "clip_ratio/low_mean": 0.0,
887
  "clip_ratio/low_min": 0.0,
888
  "clip_ratio/region_mean": 0.0,
889
- "epoch": 0.0012349864393645874,
890
- "grad_norm": 0.02054164461884488,
891
- "kl": 0.09716796875,
892
  "learning_rate": 2e-06,
893
- "loss": 0.0001,
894
  "step": 51
895
  },
896
  {
@@ -899,11 +899,11 @@
899
  "clip_ratio/low_mean": 0.0,
900
  "clip_ratio/low_min": 0.0,
901
  "clip_ratio/region_mean": 0.0,
902
- "epoch": 0.0012592018597442853,
903
- "grad_norm": 0.020335770375883248,
904
- "kl": 0.0972900390625,
905
  "learning_rate": 2e-06,
906
- "loss": 0.0001,
907
  "step": 52
908
  },
909
  {
@@ -913,27 +913,27 @@
913
  "clip_ratio/low_min": 0.0,
914
  "clip_ratio/region_mean": 0.0,
915
  "completions/clipped_ratio": 0.0,
916
- "completions/max_length": 5500.0,
917
- "completions/max_terminated_length": 5500.0,
918
- "completions/mean_length": 3872.6875,
919
- "completions/mean_terminated_length": 3872.6875,
920
- "completions/min_length": 1995.0,
921
- "completions/min_terminated_length": 1995.0,
922
- "epoch": 0.001283417280123983,
923
- "grad_norm": 0.019885369914639652,
924
- "kl": 0.1060791015625,
925
  "learning_rate": 2e-06,
926
- "loss": -0.0001,
927
- "num_tokens": 4965048.0,
928
- "reward": 0.8397980332374573,
929
- "reward_std": 0.24992188811302185,
930
- "rewards/avg_thinking_length_func": 148.27633666992188,
931
- "rewards/correct_answer_reward_func": 0.71875,
932
- "rewards/efficient_thinking_reward_func": 0.9891416230502088,
933
  "rewards/format_reward_func": 1.0,
934
- "rewards/num_xml_reward_func": 1.8712050914764404,
935
  "rewards/tool_execution_reward_func": 2.0,
936
- "rewards/visit_tool_reward_func": 1.0259138345718384,
937
  "step": 53
938
  },
939
  {
@@ -942,11 +942,11 @@
942
  "clip_ratio/low_mean": 0.0,
943
  "clip_ratio/low_min": 0.0,
944
  "clip_ratio/region_mean": 0.0,
945
- "epoch": 0.0013076327005036806,
946
- "grad_norm": 0.019647566668717146,
947
- "kl": 0.1060791015625,
948
  "learning_rate": 2e-06,
949
- "loss": -0.0001,
950
  "step": 54
951
  },
952
  {
@@ -955,11 +955,11 @@
955
  "clip_ratio/low_mean": 0.0,
956
  "clip_ratio/low_min": 0.0,
957
  "clip_ratio/region_mean": 0.0,
958
- "epoch": 0.0013318481208833785,
959
- "grad_norm": 0.01955130786377932,
960
- "kl": 0.10595703125,
961
  "learning_rate": 2e-06,
962
- "loss": -0.0001,
963
  "step": 55
964
  },
965
  {
@@ -968,11 +968,11 @@
968
  "clip_ratio/low_mean": 0.0,
969
  "clip_ratio/low_min": 0.0,
970
  "clip_ratio/region_mean": 0.0,
971
- "epoch": 0.0013560635412630763,
972
- "grad_norm": 0.01954445177850349,
973
- "kl": 0.1064453125,
974
  "learning_rate": 2e-06,
975
- "loss": -0.0001,
976
  "step": 56
977
  },
978
  {
@@ -982,27 +982,27 @@
982
  "clip_ratio/low_min": 0.0,
983
  "clip_ratio/region_mean": 0.0,
984
  "completions/clipped_ratio": 0.0,
985
- "completions/max_length": 5363.0,
986
- "completions/max_terminated_length": 5363.0,
987
- "completions/mean_length": 3860.25,
988
- "completions/mean_terminated_length": 3860.25,
989
- "completions/min_length": 2462.0,
990
- "completions/min_terminated_length": 2462.0,
991
- "epoch": 0.001380278961642774,
992
- "grad_norm": 0.024884617159235588,
993
- "kl": 0.093994140625,
994
  "learning_rate": 2e-06,
995
- "loss": 0.0007,
996
- "num_tokens": 5246738.0,
997
- "reward": 0.6381564140319824,
998
- "reward_std": 0.36967217922210693,
999
- "rewards/avg_thinking_length_func": 145.39739990234375,
1000
- "rewards/correct_answer_reward_func": 0.59375,
1001
- "rewards/efficient_thinking_reward_func": 0.9513852113551942,
1002
  "rewards/format_reward_func": 1.0,
1003
- "rewards/num_xml_reward_func": 1.8717927932739258,
1004
- "rewards/tool_execution_reward_func": 2.0,
1005
- "rewards/visit_tool_reward_func": 1.0618340969085693,
1006
  "step": 57
1007
  },
1008
  {
@@ -1011,11 +1011,11 @@
1011
  "clip_ratio/low_mean": 0.0,
1012
  "clip_ratio/low_min": 0.0,
1013
  "clip_ratio/region_mean": 0.0,
1014
- "epoch": 0.0014044943820224719,
1015
- "grad_norm": 0.02492436560745493,
1016
- "kl": 0.09375,
1017
  "learning_rate": 2e-06,
1018
- "loss": 0.0007,
1019
  "step": 58
1020
  },
1021
  {
@@ -1024,11 +1024,11 @@
1024
  "clip_ratio/low_mean": 0.0,
1025
  "clip_ratio/low_min": 0.0,
1026
  "clip_ratio/region_mean": 0.0,
1027
- "epoch": 0.0014287098024021697,
1028
- "grad_norm": 0.02501212297767101,
1029
- "kl": 0.09326171875,
1030
  "learning_rate": 2e-06,
1031
- "loss": 0.0007,
1032
  "step": 59
1033
  },
1034
  {
@@ -1037,362 +1037,17 @@
1037
  "clip_ratio/low_mean": 0.0,
1038
  "clip_ratio/low_min": 0.0,
1039
  "clip_ratio/region_mean": 0.0,
1040
- "epoch": 0.0014529252227818675,
1041
- "grad_norm": 0.02474493078785747,
1042
- "kl": 0.093017578125,
1043
  "learning_rate": 2e-06,
1044
- "loss": 0.0007,
1045
  "step": 60
1046
- },
1047
- {
1048
- "clip_ratio/high_max": 0.0,
1049
- "clip_ratio/high_mean": 0.0,
1050
- "clip_ratio/low_mean": 0.0,
1051
- "clip_ratio/low_min": 0.0,
1052
- "clip_ratio/region_mean": 0.0,
1053
- "completions/clipped_ratio": 0.0,
1054
- "completions/max_length": 6190.0,
1055
- "completions/max_terminated_length": 6190.0,
1056
- "completions/mean_length": 4112.90625,
1057
- "completions/mean_terminated_length": 4112.90625,
1058
- "completions/min_length": 2503.0,
1059
- "completions/min_terminated_length": 2503.0,
1060
- "epoch": 0.0014771406431615653,
1061
- "grad_norm": 0.02178738104725311,
1062
- "kl": 0.0994873046875,
1063
- "learning_rate": 2e-06,
1064
- "loss": 0.0011,
1065
- "num_tokens": 5531673.0,
1066
- "reward": 0.7260021567344666,
1067
- "reward_std": 0.4103294909000397,
1068
- "rewards/avg_thinking_length_func": 154.5753936767578,
1069
- "rewards/correct_answer_reward_func": 0.65625,
1070
- "rewards/efficient_thinking_reward_func": 0.9408740694270286,
1071
- "rewards/format_reward_func": 1.0,
1072
- "rewards/num_xml_reward_func": 1.8727319240570068,
1073
- "rewards/tool_execution_reward_func": 2.0,
1074
- "rewards/visit_tool_reward_func": 1.1154723167419434,
1075
- "step": 61
1076
- },
1077
- {
1078
- "clip_ratio/high_max": 0.0,
1079
- "clip_ratio/high_mean": 0.0,
1080
- "clip_ratio/low_mean": 0.0,
1081
- "clip_ratio/low_min": 0.0,
1082
- "clip_ratio/region_mean": 0.0,
1083
- "epoch": 0.001501356063541263,
1084
- "grad_norm": 0.022400760235574114,
1085
- "kl": 0.09912109375,
1086
- "learning_rate": 2e-06,
1087
- "loss": 0.0011,
1088
- "step": 62
1089
- },
1090
- {
1091
- "clip_ratio/high_max": 0.0,
1092
- "clip_ratio/high_mean": 0.0,
1093
- "clip_ratio/low_mean": 0.0,
1094
- "clip_ratio/low_min": 0.0,
1095
- "clip_ratio/region_mean": 0.0,
1096
- "epoch": 0.001525571483920961,
1097
- "grad_norm": 0.02241024326849516,
1098
- "kl": 0.0986328125,
1099
- "learning_rate": 2e-06,
1100
- "loss": 0.0011,
1101
- "step": 63
1102
- },
1103
- {
1104
- "clip_ratio/high_max": 0.0,
1105
- "clip_ratio/high_mean": 0.0,
1106
- "clip_ratio/low_mean": 0.0,
1107
- "clip_ratio/low_min": 0.0,
1108
- "clip_ratio/region_mean": 0.0,
1109
- "epoch": 0.0015497869043006587,
1110
- "grad_norm": 0.02284296428778258,
1111
- "kl": 0.098388671875,
1112
- "learning_rate": 2e-06,
1113
- "loss": 0.0011,
1114
- "step": 64
1115
- },
1116
- {
1117
- "clip_ratio/high_max": 0.0,
1118
- "clip_ratio/high_mean": 0.0,
1119
- "clip_ratio/low_mean": 0.0,
1120
- "clip_ratio/low_min": 0.0,
1121
- "clip_ratio/region_mean": 0.0,
1122
- "completions/clipped_ratio": 0.0,
1123
- "completions/max_length": 6287.0,
1124
- "completions/max_terminated_length": 6287.0,
1125
- "completions/mean_length": 4386.625,
1126
- "completions/mean_terminated_length": 4386.625,
1127
- "completions/min_length": 3015.0,
1128
- "completions/min_terminated_length": 3015.0,
1129
- "epoch": 0.0015740023246803565,
1130
- "grad_norm": 0.024442904812157877,
1131
- "kl": 0.096923828125,
1132
- "learning_rate": 2e-06,
1133
- "loss": -0.0003,
1134
- "num_tokens": 5833487.0,
1135
- "reward": 0.8125966787338257,
1136
- "reward_std": 0.40657174587249756,
1137
- "rewards/avg_thinking_length_func": 169.9393310546875,
1138
- "rewards/correct_answer_reward_func": 0.6875,
1139
- "rewards/efficient_thinking_reward_func": 0.9483068783516069,
1140
- "rewards/format_reward_func": 1.0,
1141
- "rewards/num_xml_reward_func": 1.875028371810913,
1142
- "rewards/tool_execution_reward_func": 2.0,
1143
- "rewards/visit_tool_reward_func": 1.0933895111083984,
1144
- "step": 65
1145
- },
1146
- {
1147
- "clip_ratio/high_max": 0.0,
1148
- "clip_ratio/high_mean": 0.0,
1149
- "clip_ratio/low_mean": 0.0,
1150
- "clip_ratio/low_min": 0.0,
1151
- "clip_ratio/region_mean": 0.0,
1152
- "epoch": 0.0015982177450600543,
1153
- "grad_norm": 0.024525067733314074,
1154
- "kl": 0.096435546875,
1155
- "learning_rate": 2e-06,
1156
- "loss": -0.0003,
1157
- "step": 66
1158
- },
1159
- {
1160
- "clip_ratio/high_max": 0.0,
1161
- "clip_ratio/high_mean": 0.0,
1162
- "clip_ratio/low_mean": 0.0,
1163
- "clip_ratio/low_min": 0.0,
1164
- "clip_ratio/region_mean": 0.0,
1165
- "epoch": 0.0016224331654397521,
1166
- "grad_norm": 0.024873815112161177,
1167
- "kl": 0.0960693359375,
1168
- "learning_rate": 2e-06,
1169
- "loss": -0.0003,
1170
- "step": 67
1171
- },
1172
- {
1173
- "clip_ratio/high_max": 0.0,
1174
- "clip_ratio/high_mean": 0.0,
1175
- "clip_ratio/low_mean": 0.0,
1176
- "clip_ratio/low_min": 0.0,
1177
- "clip_ratio/region_mean": 0.0,
1178
- "epoch": 0.00164664858581945,
1179
- "grad_norm": 0.025222911405096846,
1180
- "kl": 0.0955810546875,
1181
- "learning_rate": 2e-06,
1182
- "loss": -0.0003,
1183
- "step": 68
1184
- },
1185
- {
1186
- "clip_ratio/high_max": 0.0,
1187
- "clip_ratio/high_mean": 0.0,
1188
- "clip_ratio/low_mean": 0.0,
1189
- "clip_ratio/low_min": 0.0,
1190
- "clip_ratio/region_mean": 0.0,
1191
- "completions/clipped_ratio": 0.0,
1192
- "completions/max_length": 7582.0,
1193
- "completions/max_terminated_length": 7582.0,
1194
- "completions/mean_length": 4336.375,
1195
- "completions/mean_terminated_length": 4336.375,
1196
- "completions/min_length": 2926.0,
1197
- "completions/min_terminated_length": 2926.0,
1198
- "epoch": 0.0016708640061991475,
1199
- "grad_norm": 0.017478901741330728,
1200
- "kl": 0.0911865234375,
1201
- "learning_rate": 2e-06,
1202
- "loss": 0.0019,
1203
- "num_tokens": 6132990.0,
1204
- "reward": 0.7490283846855164,
1205
- "reward_std": 0.15198753774166107,
1206
- "rewards/avg_thinking_length_func": 179.77679443359375,
1207
- "rewards/correct_answer_reward_func": 0.65625,
1208
- "rewards/efficient_thinking_reward_func": 0.938588221110238,
1209
- "rewards/format_reward_func": 1.0,
1210
- "rewards/num_xml_reward_func": 1.9357295036315918,
1211
- "rewards/tool_execution_reward_func": 2.0,
1212
- "rewards/visit_tool_reward_func": 0.9721688032150269,
1213
- "step": 69
1214
- },
1215
- {
1216
- "clip_ratio/high_max": 0.0,
1217
- "clip_ratio/high_mean": 0.0,
1218
- "clip_ratio/low_mean": 0.0,
1219
- "clip_ratio/low_min": 0.0,
1220
- "clip_ratio/region_mean": 0.0,
1221
- "epoch": 0.0016950794265788453,
1222
- "grad_norm": 0.018875622948253925,
1223
- "kl": 0.0909423828125,
1224
- "learning_rate": 2e-06,
1225
- "loss": 0.0019,
1226
- "step": 70
1227
- },
1228
- {
1229
- "clip_ratio/high_max": 0.0,
1230
- "clip_ratio/high_mean": 0.0,
1231
- "clip_ratio/low_mean": 0.0,
1232
- "clip_ratio/low_min": 0.0,
1233
- "clip_ratio/region_mean": 0.0,
1234
- "epoch": 0.0017192948469585431,
1235
- "grad_norm": 0.020484617082338733,
1236
- "kl": 0.0911865234375,
1237
- "learning_rate": 2e-06,
1238
- "loss": 0.0019,
1239
- "step": 71
1240
- },
1241
- {
1242
- "clip_ratio/high_max": 0.0,
1243
- "clip_ratio/high_mean": 0.0,
1244
- "clip_ratio/low_mean": 0.0,
1245
- "clip_ratio/low_min": 0.0,
1246
- "clip_ratio/region_mean": 0.0,
1247
- "epoch": 0.001743510267338241,
1248
- "grad_norm": 0.021532205957360162,
1249
- "kl": 0.0933837890625,
1250
- "learning_rate": 2e-06,
1251
- "loss": 0.0019,
1252
- "step": 72
1253
- },
1254
- {
1255
- "clip_ratio/high_max": 0.0,
1256
- "clip_ratio/high_mean": 0.0,
1257
- "clip_ratio/low_mean": 0.0,
1258
- "clip_ratio/low_min": 0.0,
1259
- "clip_ratio/region_mean": 0.0,
1260
- "completions/clipped_ratio": 0.0,
1261
- "completions/max_length": 6065.0,
1262
- "completions/max_terminated_length": 6065.0,
1263
- "completions/mean_length": 4207.40625,
1264
- "completions/mean_terminated_length": 4207.40625,
1265
- "completions/min_length": 2648.0,
1266
- "completions/min_terminated_length": 2648.0,
1267
- "epoch": 0.0017677256877179387,
1268
- "grad_norm": 0.02249842465908536,
1269
- "kl": 0.1011962890625,
1270
- "learning_rate": 2e-06,
1271
- "loss": 0.0006,
1272
- "num_tokens": 6415549.0,
1273
- "reward": 0.7511920928955078,
1274
- "reward_std": 0.23255480825901031,
1275
- "rewards/avg_thinking_length_func": 180.5469207763672,
1276
- "rewards/correct_answer_reward_func": 0.65625,
1277
- "rewards/efficient_thinking_reward_func": 0.9317484663097895,
1278
- "rewards/format_reward_func": 1.0,
1279
- "rewards/num_xml_reward_func": 1.9344795942306519,
1280
- "rewards/tool_execution_reward_func": 1.990625023841858,
1281
- "rewards/visit_tool_reward_func": 0.9795504808425903,
1282
- "step": 73
1283
- },
1284
- {
1285
- "clip_ratio/high_max": 0.0,
1286
- "clip_ratio/high_mean": 0.0,
1287
- "clip_ratio/low_mean": 0.0,
1288
- "clip_ratio/low_min": 0.0,
1289
- "clip_ratio/region_mean": 0.0,
1290
- "epoch": 0.0017919411080976365,
1291
- "grad_norm": 0.05340931146356945,
1292
- "kl": 0.13232421875,
1293
- "learning_rate": 2e-06,
1294
- "loss": 0.0007,
1295
- "step": 74
1296
- },
1297
- {
1298
- "clip_ratio/high_max": 0.0,
1299
- "clip_ratio/high_mean": 0.0,
1300
- "clip_ratio/low_mean": 0.0,
1301
- "clip_ratio/low_min": 0.0,
1302
- "clip_ratio/region_mean": 0.0,
1303
- "epoch": 0.0018161565284773344,
1304
- "grad_norm": 0.02675181315227704,
1305
- "kl": 0.1058349609375,
1306
- "learning_rate": 2e-06,
1307
- "loss": 0.0006,
1308
- "step": 75
1309
- },
1310
- {
1311
- "clip_ratio/high_max": 0.0,
1312
- "clip_ratio/high_mean": 0.0,
1313
- "clip_ratio/low_mean": 0.0,
1314
- "clip_ratio/low_min": 0.0,
1315
- "clip_ratio/region_mean": 0.0,
1316
- "epoch": 0.0018403719488570322,
1317
- "grad_norm": 0.02697259723804417,
1318
- "kl": 0.091064453125,
1319
- "learning_rate": 2e-06,
1320
- "loss": 0.0006,
1321
- "step": 76
1322
- },
1323
- {
1324
- "clip_ratio/high_max": 0.0,
1325
- "clip_ratio/high_mean": 0.0,
1326
- "clip_ratio/low_mean": 0.0,
1327
- "clip_ratio/low_min": 0.0,
1328
- "clip_ratio/region_mean": 0.0,
1329
- "completions/clipped_ratio": 0.0,
1330
- "completions/max_length": 6482.0,
1331
- "completions/max_terminated_length": 6482.0,
1332
- "completions/mean_length": 4148.40625,
1333
- "completions/mean_terminated_length": 4148.40625,
1334
- "completions/min_length": 2778.0,
1335
- "completions/min_terminated_length": 2778.0,
1336
- "epoch": 0.00186458736923673,
1337
- "grad_norm": 0.03330572791868595,
1338
- "kl": 0.096923828125,
1339
- "learning_rate": 2e-06,
1340
- "loss": 0.0014,
1341
- "num_tokens": 6703449.0,
1342
- "reward": 0.6695541739463806,
1343
- "reward_std": 0.3436514735221863,
1344
- "rewards/avg_thinking_length_func": 184.87484741210938,
1345
- "rewards/correct_answer_reward_func": 0.59375,
1346
- "rewards/efficient_thinking_reward_func": 0.9242631827661938,
1347
- "rewards/format_reward_func": 1.0,
1348
- "rewards/num_xml_reward_func": 1.9313738346099854,
1349
- "rewards/tool_execution_reward_func": 2.0,
1350
- "rewards/visit_tool_reward_func": 0.9495425224304199,
1351
- "step": 77
1352
- },
1353
- {
1354
- "clip_ratio/high_max": 0.0,
1355
- "clip_ratio/high_mean": 0.0,
1356
- "clip_ratio/low_mean": 0.0,
1357
- "clip_ratio/low_min": 0.0,
1358
- "clip_ratio/region_mean": 0.0,
1359
- "epoch": 0.0018888027896164278,
1360
- "grad_norm": 0.03702086074093763,
1361
- "kl": 0.0970458984375,
1362
- "learning_rate": 2e-06,
1363
- "loss": 0.0014,
1364
- "step": 78
1365
- },
1366
- {
1367
- "clip_ratio/high_max": 0.0,
1368
- "clip_ratio/high_mean": 0.0,
1369
- "clip_ratio/low_mean": 0.0,
1370
- "clip_ratio/low_min": 0.0,
1371
- "clip_ratio/region_mean": 0.0,
1372
- "epoch": 0.0019130182099961256,
1373
- "grad_norm": 0.039763093996501166,
1374
- "kl": 0.1046142578125,
1375
- "learning_rate": 2e-06,
1376
- "loss": 0.0015,
1377
- "step": 79
1378
- },
1379
- {
1380
- "clip_ratio/high_max": 0.0,
1381
- "clip_ratio/high_mean": 0.0,
1382
- "clip_ratio/low_mean": 0.0,
1383
- "clip_ratio/low_min": 0.0,
1384
- "clip_ratio/region_mean": 0.0,
1385
- "epoch": 0.0019372336303758234,
1386
- "grad_norm": 0.04490759555701776,
1387
- "kl": 0.1263427734375,
1388
- "learning_rate": 2e-06,
1389
- "loss": 0.0015,
1390
- "step": 80
1391
  }
1392
  ],
1393
  "logging_steps": 1,
1394
  "max_steps": 640,
1395
- "num_input_tokens_seen": 6703449,
1396
  "num_train_epochs": 1,
1397
  "save_steps": 20,
1398
  "stateful_callbacks": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.0014534883720930232,
6
  "eval_steps": 500,
7
+ "global_step": 60,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 10086.0,
20
+ "completions/max_terminated_length": 10086.0,
21
+ "completions/mean_length": 4296.546875,
22
+ "completions/mean_terminated_length": 4296.546875,
23
+ "completions/min_length": 1720.0,
24
+ "completions/min_terminated_length": 1720.0,
25
+ "epoch": 2.4224806201550387e-05,
26
+ "grad_norm": 0.016954593260394005,
27
+ "kl": 0.0009393692016601562,
28
  "learning_rate": 0.0,
29
+ "loss": -0.0003,
30
+ "num_tokens": 601834.0,
31
+ "reward": 0.4602593183517456,
32
+ "reward_std": 0.24803586304187775,
33
+ "rewards/avg_thinking_length_func": 185.02471923828125,
34
+ "rewards/correct_answer_reward_func": 0.453125,
35
+ "rewards/efficient_thinking_reward_func": 0.8889554441999474,
36
  "rewards/format_reward_func": 1.0,
37
+ "rewards/num_xml_reward_func": 1.7176268100738525,
38
+ "rewards/tool_execution_reward_func": 1.9936248064041138,
39
+ "rewards/visit_tool_reward_func": 0.9308543801307678,
40
  "step": 1
41
  },
42
  {
 
45
  "clip_ratio/low_mean": 0.0,
46
  "clip_ratio/low_min": 0.0,
47
  "clip_ratio/region_mean": 0.0,
48
+ "epoch": 4.8449612403100775e-05,
49
+ "grad_norm": 0.016953615886545852,
50
+ "kl": 0.0009393692016601562,
51
  "learning_rate": 6.25e-08,
52
+ "loss": -0.0003,
53
  "step": 2
54
  },
55
  {
 
58
  "clip_ratio/low_mean": 0.0,
59
  "clip_ratio/low_min": 0.0,
60
  "clip_ratio/region_mean": 0.0,
61
+ "epoch": 7.267441860465116e-05,
62
+ "grad_norm": 0.016864690676516626,
63
+ "kl": 0.0009565353393554688,
64
  "learning_rate": 1.25e-07,
65
+ "loss": -0.0003,
66
  "step": 3
67
  },
68
  {
 
71
  "clip_ratio/low_mean": 0.0,
72
  "clip_ratio/low_min": 0.0,
73
  "clip_ratio/region_mean": 0.0,
74
+ "epoch": 9.689922480620155e-05,
75
+ "grad_norm": 0.016822420848305722,
76
+ "kl": 0.0009622573852539062,
77
  "learning_rate": 1.875e-07,
78
+ "loss": -0.0003,
79
  "step": 4
80
  },
81
  {
 
85
  "clip_ratio/low_min": 0.0,
86
  "clip_ratio/region_mean": 0.0,
87
  "completions/clipped_ratio": 0.0,
88
+ "completions/max_length": 9385.0,
89
+ "completions/max_terminated_length": 9385.0,
90
+ "completions/mean_length": 4270.703125,
91
+ "completions/mean_terminated_length": 4270.703125,
92
+ "completions/min_length": 1390.0,
93
+ "completions/min_terminated_length": 1390.0,
94
+ "epoch": 0.00012112403100775194,
95
+ "grad_norm": 0.025862550499858347,
96
+ "kl": 0.000957489013671875,
97
  "learning_rate": 2.5e-07,
98
+ "loss": 0.0031,
99
+ "num_tokens": 1199795.0,
100
+ "reward": 0.566771388053894,
101
+ "reward_std": 0.48137491941452026,
102
+ "rewards/avg_thinking_length_func": 182.33303833007812,
103
+ "rewards/correct_answer_reward_func": 0.578125,
104
+ "rewards/efficient_thinking_reward_func": 0.8707049785861538,
105
  "rewards/format_reward_func": 1.0,
106
+ "rewards/num_xml_reward_func": 1.7195165157318115,
107
+ "rewards/tool_execution_reward_func": 1.9965277910232544,
108
+ "rewards/visit_tool_reward_func": 0.9274243116378784,
109
  "step": 5
110
  },
111
  {
 
114
  "clip_ratio/low_mean": 0.0,
115
  "clip_ratio/low_min": 0.0,
116
  "clip_ratio/region_mean": 0.0,
117
+ "epoch": 0.00014534883720930232,
118
+ "grad_norm": 0.025877236026611388,
119
+ "kl": 0.0009489059448242188,
120
  "learning_rate": 3.1249999999999997e-07,
121
+ "loss": 0.0031,
122
  "step": 6
123
  },
124
  {
 
127
  "clip_ratio/low_mean": 0.0,
128
  "clip_ratio/low_min": 0.0,
129
  "clip_ratio/region_mean": 0.0,
130
+ "epoch": 0.0001695736434108527,
131
+ "grad_norm": 0.025817236127475232,
132
+ "kl": 0.0009660720825195312,
133
  "learning_rate": 3.75e-07,
134
+ "loss": 0.0031,
135
  "step": 7
136
  },
137
  {
 
140
  "clip_ratio/low_mean": 0.0,
141
  "clip_ratio/low_min": 0.0,
142
  "clip_ratio/region_mean": 0.0,
143
+ "epoch": 0.0001937984496124031,
144
+ "grad_norm": 0.02584169829863559,
145
+ "kl": 0.0009441375732421875,
146
  "learning_rate": 4.375e-07,
147
+ "loss": 0.0031,
148
  "step": 8
149
  },
150
  {
 
154
  "clip_ratio/low_min": 0.0,
155
  "clip_ratio/region_mean": 0.0,
156
  "completions/clipped_ratio": 0.0,
157
+ "completions/max_length": 7008.0,
158
+ "completions/max_terminated_length": 7008.0,
159
+ "completions/mean_length": 4088.546875,
160
+ "completions/mean_terminated_length": 4088.546875,
161
+ "completions/min_length": 1705.0,
162
+ "completions/min_terminated_length": 1705.0,
163
+ "epoch": 0.00021802325581395349,
164
+ "grad_norm": 0.01625597308376849,
165
+ "kl": 0.0009918212890625,
166
  "learning_rate": 5e-07,
167
+ "loss": 0.0013,
168
+ "num_tokens": 1783761.0,
169
+ "reward": 0.3732198178768158,
170
+ "reward_std": 0.2907864451408386,
171
+ "rewards/avg_thinking_length_func": 177.95510864257812,
172
+ "rewards/correct_answer_reward_func": 0.390625,
173
+ "rewards/efficient_thinking_reward_func": 0.8993925619789238,
174
  "rewards/format_reward_func": 1.0,
175
+ "rewards/num_xml_reward_func": 1.6866124868392944,
176
+ "rewards/tool_execution_reward_func": 1.950781226158142,
177
+ "rewards/visit_tool_reward_func": 0.8574961423873901,
178
  "step": 9
179
  },
180
  {
 
183
  "clip_ratio/low_mean": 0.0,
184
  "clip_ratio/low_min": 0.0,
185
  "clip_ratio/region_mean": 0.0,
186
+ "epoch": 0.00024224806201550387,
187
+ "grad_norm": 0.016618535814852814,
188
+ "kl": 0.0009899139404296875,
189
  "learning_rate": 5.625e-07,
190
+ "loss": 0.0013,
191
  "step": 10
192
  },
193
  {
 
196
  "clip_ratio/low_mean": 0.0,
197
  "clip_ratio/low_min": 0.0,
198
  "clip_ratio/region_mean": 0.0,
199
+ "epoch": 0.00026647286821705426,
200
+ "grad_norm": 0.016248156263205492,
201
+ "kl": 0.0009660720825195312,
202
  "learning_rate": 6.249999999999999e-07,
203
+ "loss": 0.0013,
204
  "step": 11
205
  },
206
  {
 
209
  "clip_ratio/low_mean": 0.0,
210
  "clip_ratio/low_min": 0.0,
211
  "clip_ratio/region_mean": 0.0,
212
+ "epoch": 0.00029069767441860465,
213
+ "grad_norm": 0.016111032400620007,
214
+ "kl": 0.0009870529174804688,
215
  "learning_rate": 6.875e-07,
216
+ "loss": 0.0013,
217
  "step": 12
218
  },
219
  {
 
223
  "clip_ratio/low_min": 0.0,
224
  "clip_ratio/region_mean": 0.0,
225
  "completions/clipped_ratio": 0.0,
226
+ "completions/max_length": 6572.0,
227
+ "completions/max_terminated_length": 6572.0,
228
+ "completions/mean_length": 4119.703125,
229
+ "completions/mean_terminated_length": 4119.703125,
230
+ "completions/min_length": 1356.0,
231
+ "completions/min_terminated_length": 1356.0,
232
+ "epoch": 0.00031492248062015503,
233
+ "grad_norm": 0.019643777904198217,
234
+ "kl": 0.0009822845458984375,
235
  "learning_rate": 7.5e-07,
236
+ "loss": -0.0008,
237
+ "num_tokens": 2367034.0,
238
+ "reward": 0.6774564981460571,
239
+ "reward_std": 0.3563808798789978,
240
+ "rewards/avg_thinking_length_func": 176.69476318359375,
241
+ "rewards/correct_answer_reward_func": 0.625,
242
+ "rewards/efficient_thinking_reward_func": 0.8704519537344548,
243
  "rewards/format_reward_func": 1.0,
244
+ "rewards/num_xml_reward_func": 1.6578426361083984,
245
+ "rewards/tool_execution_reward_func": 2.0,
246
+ "rewards/visit_tool_reward_func": 0.9361900091171265,
247
  "step": 13
248
  },
249
  {
 
252
  "clip_ratio/low_mean": 0.0,
253
  "clip_ratio/low_min": 0.0,
254
  "clip_ratio/region_mean": 0.0,
255
+ "epoch": 0.0003391472868217054,
256
+ "grad_norm": 0.0194815826710202,
257
+ "kl": 0.0010242462158203125,
258
  "learning_rate": 8.125e-07,
259
+ "loss": -0.0008,
260
  "step": 14
261
  },
262
  {
 
265
  "clip_ratio/low_mean": 0.0,
266
  "clip_ratio/low_min": 0.0,
267
  "clip_ratio/region_mean": 0.0,
268
+ "epoch": 0.0003633720930232558,
269
+ "grad_norm": 0.019402854833833996,
270
+ "kl": 0.0010585784912109375,
271
  "learning_rate": 8.75e-07,
272
+ "loss": -0.0008,
273
  "step": 15
274
  },
275
  {
 
278
  "clip_ratio/low_mean": 0.0,
279
  "clip_ratio/low_min": 0.0,
280
  "clip_ratio/region_mean": 0.0,
281
+ "epoch": 0.0003875968992248062,
282
+ "grad_norm": 0.019438299719581362,
283
+ "kl": 0.0011272430419921875,
284
  "learning_rate": 9.374999999999999e-07,
285
+ "loss": -0.0008,
286
  "step": 16
287
  },
288
  {
 
292
  "clip_ratio/low_min": 0.0,
293
  "clip_ratio/region_mean": 0.0,
294
  "completions/clipped_ratio": 0.0,
295
+ "completions/max_length": 7597.0,
296
+ "completions/max_terminated_length": 7597.0,
297
+ "completions/mean_length": 4205.671875,
298
+ "completions/mean_terminated_length": 4205.671875,
299
+ "completions/min_length": 1507.0,
300
+ "completions/min_terminated_length": 1507.0,
301
+ "epoch": 0.0004118217054263566,
302
+ "grad_norm": 0.014823687168402296,
303
+ "kl": 0.0011005401611328125,
304
  "learning_rate": 1e-06,
305
+ "loss": 0.0009,
306
+ "num_tokens": 2985545.0,
307
+ "reward": 0.3260263204574585,
308
+ "reward_std": 0.2300996333360672,
309
+ "rewards/avg_thinking_length_func": 177.14329528808594,
310
+ "rewards/correct_answer_reward_func": 0.375,
311
+ "rewards/efficient_thinking_reward_func": 0.8988714947132084,
312
  "rewards/format_reward_func": 1.0,
313
+ "rewards/num_xml_reward_func": 1.8095711469650269,
314
+ "rewards/tool_execution_reward_func": 1.99609375,
315
+ "rewards/visit_tool_reward_func": 0.852025032043457,
316
  "step": 17
317
  },
318
  {
 
321
  "clip_ratio/low_mean": 0.0,
322
  "clip_ratio/low_min": 0.0,
323
  "clip_ratio/region_mean": 0.0,
324
+ "epoch": 0.00043604651162790697,
325
+ "grad_norm": 0.014727006858324664,
326
+ "kl": 0.0011577606201171875,
327
  "learning_rate": 1.0625e-06,
328
+ "loss": 0.0009,
329
  "step": 18
330
  },
331
  {
 
334
  "clip_ratio/low_mean": 0.0,
335
  "clip_ratio/low_min": 0.0,
336
  "clip_ratio/region_mean": 0.0,
337
+ "epoch": 0.00046027131782945736,
338
+ "grad_norm": 0.014837711956269274,
339
+ "kl": 0.0012722015380859375,
340
  "learning_rate": 1.125e-06,
341
+ "loss": 0.0009,
342
  "step": 19
343
  },
344
  {
 
347
  "clip_ratio/low_mean": 0.0,
348
  "clip_ratio/low_min": 0.0,
349
  "clip_ratio/region_mean": 0.0,
350
+ "epoch": 0.00048449612403100775,
351
+ "grad_norm": 0.014894430575329584,
352
+ "kl": 0.00146484375,
353
  "learning_rate": 1.1874999999999999e-06,
354
+ "loss": 0.0009,
355
  "step": 20
356
  },
357
  {
 
361
  "clip_ratio/low_min": 0.0,
362
  "clip_ratio/region_mean": 0.0,
363
  "completions/clipped_ratio": 0.0,
364
+ "completions/max_length": 7476.0,
365
+ "completions/max_terminated_length": 7476.0,
366
+ "completions/mean_length": 4097.921875,
367
+ "completions/mean_terminated_length": 4097.921875,
368
+ "completions/min_length": 1514.0,
369
+ "completions/min_terminated_length": 1514.0,
370
+ "epoch": 0.0005087209302325581,
371
+ "grad_norm": 0.0189498267274778,
372
+ "kl": 0.0019931793212890625,
373
  "learning_rate": 1.2499999999999999e-06,
374
+ "loss": 0.0003,
375
+ "num_tokens": 3561495.0,
376
+ "reward": 0.5717383623123169,
377
+ "reward_std": 0.33007949590682983,
378
+ "rewards/avg_thinking_length_func": 177.5142822265625,
379
+ "rewards/correct_answer_reward_func": 0.53125,
380
+ "rewards/efficient_thinking_reward_func": 0.8662384906971484,
381
+ "rewards/format_reward_func": 0.9937499761581421,
382
+ "rewards/num_xml_reward_func": 1.779766321182251,
383
+ "rewards/tool_execution_reward_func": 1.979819416999817,
384
+ "rewards/visit_tool_reward_func": 0.9004297256469727,
385
  "step": 21
386
  },
387
  {
 
390
  "clip_ratio/low_mean": 0.0,
391
  "clip_ratio/low_min": 0.0,
392
  "clip_ratio/region_mean": 0.0,
393
+ "epoch": 0.0005329457364341085,
394
+ "grad_norm": 0.019010527717988047,
395
+ "kl": 0.00229644775390625,
396
  "learning_rate": 1.3125e-06,
397
+ "loss": 0.0003,
398
  "step": 22
399
  },
400
  {
 
403
  "clip_ratio/low_mean": 0.0,
404
  "clip_ratio/low_min": 0.0,
405
  "clip_ratio/region_mean": 0.0,
406
+ "epoch": 0.0005571705426356589,
407
+ "grad_norm": 0.01910688815244073,
408
+ "kl": 0.00276947021484375,
409
  "learning_rate": 1.375e-06,
410
+ "loss": 0.0003,
411
  "step": 23
412
  },
413
  {
 
416
  "clip_ratio/low_mean": 0.0,
417
  "clip_ratio/low_min": 0.0,
418
  "clip_ratio/region_mean": 0.0,
419
+ "epoch": 0.0005813953488372093,
420
+ "grad_norm": 0.019047374161024387,
421
+ "kl": 0.00327301025390625,
422
  "learning_rate": 1.4375e-06,
423
+ "loss": 0.0003,
424
  "step": 24
425
  },
426
  {
 
430
  "clip_ratio/low_min": 0.0,
431
  "clip_ratio/region_mean": 0.0,
432
  "completions/clipped_ratio": 0.0,
433
+ "completions/max_length": 7779.0,
434
+ "completions/max_terminated_length": 7779.0,
435
+ "completions/mean_length": 4011.9375,
436
+ "completions/mean_terminated_length": 4011.9375,
437
+ "completions/min_length": 1884.0,
438
+ "completions/min_terminated_length": 1884.0,
439
+ "epoch": 0.0006056201550387597,
440
+ "grad_norm": 0.01969391991938911,
441
+ "kl": 0.0029449462890625,
442
  "learning_rate": 1.5e-06,
443
+ "loss": 0.0003,
444
+ "num_tokens": 4148002.0,
445
+ "reward": 0.4466557502746582,
446
+ "reward_std": 0.2478387951850891,
447
+ "rewards/avg_thinking_length_func": 174.6974639892578,
448
+ "rewards/correct_answer_reward_func": 0.40625,
449
+ "rewards/efficient_thinking_reward_func": 0.9054659197504085,
450
  "rewards/format_reward_func": 1.0,
451
+ "rewards/num_xml_reward_func": 1.806973934173584,
452
+ "rewards/tool_execution_reward_func": 1.9922122955322266,
453
+ "rewards/visit_tool_reward_func": 0.871803879737854,
454
  "step": 25
455
  },
456
  {
 
459
  "clip_ratio/low_mean": 0.0,
460
  "clip_ratio/low_min": 0.0,
461
  "clip_ratio/region_mean": 0.0,
462
+ "epoch": 0.0006298449612403101,
463
+ "grad_norm": 0.01979038843755439,
464
+ "kl": 0.003414154052734375,
465
  "learning_rate": 1.5624999999999999e-06,
466
+ "loss": 0.0003,
467
  "step": 26
468
  },
469
  {
 
472
  "clip_ratio/low_mean": 0.0,
473
  "clip_ratio/low_min": 0.0,
474
  "clip_ratio/region_mean": 0.0,
475
+ "epoch": 0.0006540697674418605,
476
+ "grad_norm": 0.019676702255338734,
477
+ "kl": 0.004245758056640625,
478
  "learning_rate": 1.625e-06,
479
+ "loss": 0.0003,
480
  "step": 27
481
  },
482
  {
 
485
  "clip_ratio/low_mean": 0.0,
486
  "clip_ratio/low_min": 0.0,
487
  "clip_ratio/region_mean": 0.0,
488
+ "epoch": 0.0006782945736434108,
489
+ "grad_norm": 0.0198896583655868,
490
+ "kl": 0.00508880615234375,
491
  "learning_rate": 1.6875e-06,
492
+ "loss": 0.0003,
493
  "step": 28
494
  },
495
  {
 
499
  "clip_ratio/low_min": 0.0,
500
  "clip_ratio/region_mean": 0.0,
501
  "completions/clipped_ratio": 0.0,
502
+ "completions/max_length": 7881.0,
503
+ "completions/max_terminated_length": 7881.0,
504
+ "completions/mean_length": 4278.0,
505
+ "completions/mean_terminated_length": 4278.0,
506
+ "completions/min_length": 1269.0,
507
+ "completions/min_terminated_length": 1269.0,
508
+ "epoch": 0.0007025193798449612,
509
+ "grad_norm": 0.02473412222614823,
510
+ "kl": 0.00722503662109375,
511
  "learning_rate": 1.75e-06,
512
+ "loss": 0.0005,
513
+ "num_tokens": 4732732.0,
514
+ "reward": 0.639769971370697,
515
+ "reward_std": 0.3489268720149994,
516
+ "rewards/avg_thinking_length_func": 183.79090881347656,
517
+ "rewards/correct_answer_reward_func": 0.640625,
518
+ "rewards/efficient_thinking_reward_func": 0.8433743364598003,
519
+ "rewards/format_reward_func": 0.9991071224212646,
520
+ "rewards/num_xml_reward_func": 1.686936616897583,
521
+ "rewards/tool_execution_reward_func": 1.9818710088729858,
522
+ "rewards/visit_tool_reward_func": 0.923589289188385,
523
  "step": 29
524
  },
525
  {
 
528
  "clip_ratio/low_mean": 0.0,
529
  "clip_ratio/low_min": 0.0,
530
  "clip_ratio/region_mean": 0.0,
531
+ "epoch": 0.0007267441860465116,
532
+ "grad_norm": 0.024757116664213524,
533
+ "kl": 0.0076904296875,
534
  "learning_rate": 1.8125e-06,
535
+ "loss": 0.0005,
536
  "step": 30
537
  },
538
  {
 
541
  "clip_ratio/low_mean": 0.0,
542
  "clip_ratio/low_min": 0.0,
543
  "clip_ratio/region_mean": 0.0,
544
+ "epoch": 0.000750968992248062,
545
+ "grad_norm": 0.02444644630643307,
546
+ "kl": 0.0073394775390625,
547
  "learning_rate": 1.8749999999999998e-06,
548
+ "loss": 0.0005,
549
  "step": 31
550
  },
551
  {
 
554
  "clip_ratio/low_mean": 0.0,
555
  "clip_ratio/low_min": 0.0,
556
  "clip_ratio/region_mean": 0.0,
557
+ "epoch": 0.0007751937984496124,
558
+ "grad_norm": 0.024210451469423133,
559
+ "kl": 0.007171630859375,
560
  "learning_rate": 1.9375e-06,
561
+ "loss": 0.0005,
562
  "step": 32
563
  },
564
  {
 
568
  "clip_ratio/low_min": 0.0,
569
  "clip_ratio/region_mean": 0.0,
570
  "completions/clipped_ratio": 0.0,
571
+ "completions/max_length": 7912.0,
572
+ "completions/max_terminated_length": 7912.0,
573
+ "completions/mean_length": 4317.890625,
574
+ "completions/mean_terminated_length": 4317.890625,
575
+ "completions/min_length": 1736.0,
576
+ "completions/min_terminated_length": 1736.0,
577
+ "epoch": 0.0007994186046511628,
578
+ "grad_norm": 0.020658762871057952,
579
+ "kl": 0.007049560546875,
580
  "learning_rate": 2e-06,
581
+ "loss": -0.0,
582
+ "num_tokens": 5347783.0,
583
+ "reward": 0.33683592081069946,
584
+ "reward_std": 0.32624948024749756,
585
+ "rewards/avg_thinking_length_func": 177.01129150390625,
586
+ "rewards/correct_answer_reward_func": 0.375,
587
+ "rewards/efficient_thinking_reward_func": 0.8817601664392056,
588
  "rewards/format_reward_func": 1.0,
589
+ "rewards/num_xml_reward_func": 1.5408036708831787,
590
+ "rewards/tool_execution_reward_func": 1.9917367696762085,
591
+ "rewards/visit_tool_reward_func": 0.9276807308197021,
592
  "step": 33
593
  },
594
  {
 
597
  "clip_ratio/low_mean": 0.0,
598
  "clip_ratio/low_min": 0.0,
599
  "clip_ratio/region_mean": 0.0,
600
+ "epoch": 0.0008236434108527132,
601
+ "grad_norm": 0.02072632567074888,
602
+ "kl": 0.0077972412109375,
603
  "learning_rate": 2e-06,
604
+ "loss": -0.0,
605
  "step": 34
606
  },
607
  {
 
610
  "clip_ratio/low_mean": 0.0,
611
  "clip_ratio/low_min": 0.0,
612
  "clip_ratio/region_mean": 0.0,
613
+ "epoch": 0.0008478682170542636,
614
+ "grad_norm": 0.020770020029080613,
615
+ "kl": 0.0087432861328125,
616
  "learning_rate": 2e-06,
617
+ "loss": -0.0,
618
  "step": 35
619
  },
620
  {
 
623
  "clip_ratio/low_mean": 0.0,
624
  "clip_ratio/low_min": 0.0,
625
  "clip_ratio/region_mean": 0.0,
626
+ "epoch": 0.0008720930232558139,
627
+ "grad_norm": 0.020487067102301602,
628
+ "kl": 0.0097198486328125,
629
  "learning_rate": 2e-06,
630
+ "loss": -0.0,
631
  "step": 36
632
  },
633
  {
 
637
  "clip_ratio/low_min": 0.0,
638
  "clip_ratio/region_mean": 0.0,
639
  "completions/clipped_ratio": 0.0,
640
+ "completions/max_length": 7378.0,
641
+ "completions/max_terminated_length": 7378.0,
642
+ "completions/mean_length": 4152.5,
643
+ "completions/mean_terminated_length": 4152.5,
644
+ "completions/min_length": 1423.0,
645
+ "completions/min_terminated_length": 1423.0,
646
+ "epoch": 0.0008963178294573643,
647
+ "grad_norm": 0.022364107178309313,
648
+ "kl": 0.0112152099609375,
649
  "learning_rate": 2e-06,
650
+ "loss": -0.0001,
651
+ "num_tokens": 5921090.0,
652
+ "reward": 0.6556656360626221,
653
+ "reward_std": 0.5008378028869629,
654
+ "rewards/avg_thinking_length_func": 170.4791259765625,
655
+ "rewards/correct_answer_reward_func": 0.625,
656
+ "rewards/efficient_thinking_reward_func": 0.8892575272805912,
657
+ "rewards/format_reward_func": 0.987500011920929,
658
+ "rewards/num_xml_reward_func": 1.5408031940460205,
659
+ "rewards/tool_execution_reward_func": 1.96875,
660
+ "rewards/visit_tool_reward_func": 0.9249746799468994,
661
  "step": 37
662
  },
663
  {
 
666
  "clip_ratio/low_mean": 0.0,
667
  "clip_ratio/low_min": 0.0,
668
  "clip_ratio/region_mean": 0.0,
669
+ "epoch": 0.0009205426356589147,
670
+ "grad_norm": 0.022597206540891295,
671
+ "kl": 0.0123443603515625,
672
  "learning_rate": 2e-06,
673
+ "loss": -0.0001,
674
  "step": 38
675
  },
676
  {
 
679
  "clip_ratio/low_mean": 0.0,
680
  "clip_ratio/low_min": 0.0,
681
  "clip_ratio/region_mean": 0.0,
682
+ "epoch": 0.0009447674418604651,
683
+ "grad_norm": 0.02246679376217943,
684
+ "kl": 0.013580322265625,
685
  "learning_rate": 2e-06,
686
+ "loss": -0.0001,
687
  "step": 39
688
  },
689
  {
 
692
  "clip_ratio/low_mean": 0.0,
693
  "clip_ratio/low_min": 0.0,
694
  "clip_ratio/region_mean": 0.0,
695
+ "epoch": 0.0009689922480620155,
696
+ "grad_norm": 0.022296105800735398,
697
+ "kl": 0.015106201171875,
698
  "learning_rate": 2e-06,
699
+ "loss": -0.0001,
700
  "step": 40
701
  },
702
  {
 
706
  "clip_ratio/low_min": 0.0,
707
  "clip_ratio/region_mean": 0.0,
708
  "completions/clipped_ratio": 0.0,
709
+ "completions/max_length": 7494.0,
710
+ "completions/max_terminated_length": 7494.0,
711
+ "completions/mean_length": 4562.296875,
712
+ "completions/mean_terminated_length": 4562.296875,
713
+ "completions/min_length": 2143.0,
714
+ "completions/min_terminated_length": 2143.0,
715
+ "epoch": 0.0009932170542635659,
716
+ "grad_norm": 0.021503135345542313,
717
+ "kl": 0.015594482421875,
718
  "learning_rate": 2e-06,
719
+ "loss": 0.0007,
720
+ "num_tokens": 6556719.0,
721
+ "reward": 0.47225743532180786,
722
+ "reward_std": 0.3904932141304016,
723
+ "rewards/avg_thinking_length_func": 169.57839965820312,
724
+ "rewards/correct_answer_reward_func": 0.4375,
725
+ "rewards/efficient_thinking_reward_func": 0.917264621947748,
726
  "rewards/format_reward_func": 1.0,
727
+ "rewards/num_xml_reward_func": 1.817958116531372,
728
+ "rewards/tool_execution_reward_func": 1.9884111881256104,
729
+ "rewards/visit_tool_reward_func": 0.9651369452476501,
730
  "step": 41
731
  },
732
  {
 
735
  "clip_ratio/low_mean": 0.0,
736
  "clip_ratio/low_min": 0.0,
737
  "clip_ratio/region_mean": 0.0,
738
+ "epoch": 0.0010174418604651163,
739
+ "grad_norm": 0.02149252867250571,
740
+ "kl": 0.01715087890625,
741
  "learning_rate": 2e-06,
742
+ "loss": 0.0007,
743
  "step": 42
744
  },
745
  {
 
748
  "clip_ratio/low_mean": 0.0,
749
  "clip_ratio/low_min": 0.0,
750
  "clip_ratio/region_mean": 0.0,
751
+ "epoch": 0.0010416666666666667,
752
+ "grad_norm": 0.02173596902997293,
753
+ "kl": 0.018798828125,
754
  "learning_rate": 2e-06,
755
+ "loss": 0.0007,
756
  "step": 43
757
  },
758
  {
 
761
  "clip_ratio/low_mean": 0.0,
762
  "clip_ratio/low_min": 0.0,
763
  "clip_ratio/region_mean": 0.0,
764
+ "epoch": 0.001065891472868217,
765
+ "grad_norm": 0.02188237517399594,
766
+ "kl": 0.020751953125,
767
  "learning_rate": 2e-06,
768
+ "loss": 0.0007,
769
  "step": 44
770
  },
771
  {
 
775
  "clip_ratio/low_min": 0.0,
776
  "clip_ratio/region_mean": 0.0,
777
  "completions/clipped_ratio": 0.0,
778
+ "completions/max_length": 9017.0,
779
+ "completions/max_terminated_length": 9017.0,
780
+ "completions/mean_length": 4664.796875,
781
+ "completions/mean_terminated_length": 4664.796875,
782
+ "completions/min_length": 1910.0,
783
+ "completions/min_terminated_length": 1910.0,
784
+ "epoch": 0.0010901162790697674,
785
+ "grad_norm": 0.02354857583102173,
786
+ "kl": 0.020477294921875,
787
  "learning_rate": 2e-06,
788
+ "loss": -0.0014,
789
+ "num_tokens": 7181732.0,
790
+ "reward": 0.7991669178009033,
791
+ "reward_std": 0.36247026920318604,
792
+ "rewards/avg_thinking_length_func": 171.8461151123047,
793
+ "rewards/correct_answer_reward_func": 0.703125,
794
+ "rewards/efficient_thinking_reward_func": 0.8915984372821139,
795
+ "rewards/format_reward_func": 0.9998437166213989,
796
+ "rewards/num_xml_reward_func": 1.8501074314117432,
797
+ "rewards/tool_execution_reward_func": 1.9971354007720947,
798
+ "rewards/visit_tool_reward_func": 1.071668028831482,
799
  "step": 45
800
  },
801
  {
 
804
  "clip_ratio/low_mean": 0.0,
805
  "clip_ratio/low_min": 0.0,
806
  "clip_ratio/region_mean": 0.0,
807
+ "epoch": 0.0011143410852713178,
808
+ "grad_norm": 0.023994471938115103,
809
+ "kl": 0.0224609375,
810
  "learning_rate": 2e-06,
811
+ "loss": -0.0014,
812
  "step": 46
813
  },
814
  {
 
817
  "clip_ratio/low_mean": 0.0,
818
  "clip_ratio/low_min": 0.0,
819
  "clip_ratio/region_mean": 0.0,
820
+ "epoch": 0.0011385658914728682,
821
+ "grad_norm": 0.026516939220345738,
822
+ "kl": 0.02508544921875,
823
  "learning_rate": 2e-06,
824
+ "loss": -0.0014,
825
  "step": 47
826
  },
827
  {
 
830
  "clip_ratio/low_mean": 0.0,
831
  "clip_ratio/low_min": 0.0,
832
  "clip_ratio/region_mean": 0.0,
833
+ "epoch": 0.0011627906976744186,
834
+ "grad_norm": 0.024485287814160223,
835
+ "kl": 0.0262451171875,
836
  "learning_rate": 2e-06,
837
+ "loss": -0.0014,
838
  "step": 48
839
  },
840
  {
 
844
  "clip_ratio/low_min": 0.0,
845
  "clip_ratio/region_mean": 0.0,
846
  "completions/clipped_ratio": 0.0,
847
+ "completions/max_length": 8522.0,
848
+ "completions/max_terminated_length": 8522.0,
849
+ "completions/mean_length": 4866.125,
850
+ "completions/mean_terminated_length": 4866.125,
851
+ "completions/min_length": 1959.0,
852
+ "completions/min_terminated_length": 1959.0,
853
+ "epoch": 0.001187015503875969,
854
+ "grad_norm": 0.02407332594201,
855
+ "kl": 0.032012939453125,
856
  "learning_rate": 2e-06,
857
+ "loss": 0.0014,
858
+ "num_tokens": 7868034.0,
859
+ "reward": 0.39128515124320984,
860
+ "reward_std": 0.3533371090888977,
861
+ "rewards/avg_thinking_length_func": 164.74734497070312,
862
+ "rewards/correct_answer_reward_func": 0.359375,
863
+ "rewards/efficient_thinking_reward_func": 0.9209367558816545,
864
  "rewards/format_reward_func": 1.0,
865
+ "rewards/num_xml_reward_func": 1.6406757831573486,
866
+ "rewards/tool_execution_reward_func": 1.98927903175354,
867
+ "rewards/visit_tool_reward_func": 1.0120830535888672,
868
  "step": 49
869
  },
870
  {
 
873
  "clip_ratio/low_mean": 0.0,
874
  "clip_ratio/low_min": 0.0,
875
  "clip_ratio/region_mean": 0.0,
876
+ "epoch": 0.0012112403100775194,
877
+ "grad_norm": 0.02479690454991753,
878
+ "kl": 0.035888671875,
879
  "learning_rate": 2e-06,
880
+ "loss": 0.0014,
881
  "step": 50
882
  },
883
  {
 
886
  "clip_ratio/low_mean": 0.0,
887
  "clip_ratio/low_min": 0.0,
888
  "clip_ratio/region_mean": 0.0,
889
+ "epoch": 0.0012354651162790698,
890
+ "grad_norm": 0.027012142633289393,
891
+ "kl": 0.04046630859375,
892
  "learning_rate": 2e-06,
893
+ "loss": 0.0014,
894
  "step": 51
895
  },
896
  {
 
899
  "clip_ratio/low_mean": 0.0,
900
  "clip_ratio/low_min": 0.0,
901
  "clip_ratio/region_mean": 0.0,
902
+ "epoch": 0.0012596899224806201,
903
+ "grad_norm": 0.026499465739179152,
904
+ "kl": 0.04803466796875,
905
  "learning_rate": 2e-06,
906
+ "loss": 0.0014,
907
  "step": 52
908
  },
909
  {
 
913
  "clip_ratio/low_min": 0.0,
914
  "clip_ratio/region_mean": 0.0,
915
  "completions/clipped_ratio": 0.0,
916
+ "completions/max_length": 7622.0,
917
+ "completions/max_terminated_length": 7622.0,
918
+ "completions/mean_length": 4509.75,
919
+ "completions/mean_terminated_length": 4509.75,
920
+ "completions/min_length": 1816.0,
921
+ "completions/min_terminated_length": 1816.0,
922
+ "epoch": 0.0012839147286821705,
923
+ "grad_norm": 0.019741394516818018,
924
+ "kl": 0.04510498046875,
925
  "learning_rate": 2e-06,
926
+ "loss": 0.0,
927
+ "num_tokens": 8481102.0,
928
+ "reward": 0.7655854225158691,
929
+ "reward_std": 0.27847254276275635,
930
+ "rewards/avg_thinking_length_func": 158.9434051513672,
931
+ "rewards/correct_answer_reward_func": 0.671875,
932
+ "rewards/efficient_thinking_reward_func": 0.884494477975468,
933
  "rewards/format_reward_func": 1.0,
934
+ "rewards/num_xml_reward_func": 1.8834664821624756,
935
  "rewards/tool_execution_reward_func": 2.0,
936
+ "rewards/visit_tool_reward_func": 1.1049017906188965,
937
  "step": 53
938
  },
939
  {
 
942
  "clip_ratio/low_mean": 0.0,
943
  "clip_ratio/low_min": 0.0,
944
  "clip_ratio/region_mean": 0.0,
945
+ "epoch": 0.001308139534883721,
946
+ "grad_norm": 0.028517188784132036,
947
+ "kl": 0.06060791015625,
948
  "learning_rate": 2e-06,
949
+ "loss": 0.0001,
950
  "step": 54
951
  },
952
  {
 
955
  "clip_ratio/low_mean": 0.0,
956
  "clip_ratio/low_min": 0.0,
957
  "clip_ratio/region_mean": 0.0,
958
+ "epoch": 0.0013323643410852713,
959
+ "grad_norm": 0.02643367822401968,
960
+ "kl": 0.06280517578125,
961
  "learning_rate": 2e-06,
962
+ "loss": 0.0001,
963
  "step": 55
964
  },
965
  {
 
968
  "clip_ratio/low_mean": 0.0,
969
  "clip_ratio/low_min": 0.0,
970
  "clip_ratio/region_mean": 0.0,
971
+ "epoch": 0.0013565891472868217,
972
+ "grad_norm": 0.020594752118506976,
973
+ "kl": 0.056884765625,
974
  "learning_rate": 2e-06,
975
+ "loss": 0.0001,
976
  "step": 56
977
  },
978
  {
 
982
  "clip_ratio/low_min": 0.0,
983
  "clip_ratio/region_mean": 0.0,
984
  "completions/clipped_ratio": 0.0,
985
+ "completions/max_length": 7375.0,
986
+ "completions/max_terminated_length": 7375.0,
987
+ "completions/mean_length": 4285.046875,
988
+ "completions/mean_terminated_length": 4285.046875,
989
+ "completions/min_length": 2418.0,
990
+ "completions/min_terminated_length": 2418.0,
991
+ "epoch": 0.001380813953488372,
992
+ "grad_norm": 0.019100627823517295,
993
+ "kl": 0.06195068359375,
994
  "learning_rate": 2e-06,
995
+ "loss": 0.0005,
996
+ "num_tokens": 9112297.0,
997
+ "reward": 0.5274717807769775,
998
+ "reward_std": 0.2380232810974121,
999
+ "rewards/avg_thinking_length_func": 145.75924682617188,
1000
+ "rewards/correct_answer_reward_func": 0.453125,
1001
+ "rewards/efficient_thinking_reward_func": 0.9274070198828231,
1002
  "rewards/format_reward_func": 1.0,
1003
+ "rewards/num_xml_reward_func": 1.7929463386535645,
1004
+ "rewards/tool_execution_reward_func": 1.9959805011749268,
1005
+ "rewards/visit_tool_reward_func": 1.0335674285888672,
1006
  "step": 57
1007
  },
1008
  {
 
1011
  "clip_ratio/low_mean": 0.0,
1012
  "clip_ratio/low_min": 0.0,
1013
  "clip_ratio/region_mean": 0.0,
1014
+ "epoch": 0.0014050387596899225,
1015
+ "grad_norm": 0.019834849658967178,
1016
+ "kl": 0.06695556640625,
1017
  "learning_rate": 2e-06,
1018
+ "loss": 0.0005,
1019
  "step": 58
1020
  },
1021
  {
 
1024
  "clip_ratio/low_mean": 0.0,
1025
  "clip_ratio/low_min": 0.0,
1026
  "clip_ratio/region_mean": 0.0,
1027
+ "epoch": 0.0014292635658914728,
1028
+ "grad_norm": 0.020359737753586633,
1029
+ "kl": 0.0740966796875,
1030
  "learning_rate": 2e-06,
1031
+ "loss": 0.0005,
1032
  "step": 59
1033
  },
1034
  {
 
1037
  "clip_ratio/low_mean": 0.0,
1038
  "clip_ratio/low_min": 0.0,
1039
  "clip_ratio/region_mean": 0.0,
1040
+ "epoch": 0.0014534883720930232,
1041
+ "grad_norm": 0.020904893352951728,
1042
+ "kl": 0.085693359375,
1043
  "learning_rate": 2e-06,
1044
+ "loss": 0.0005,
1045
  "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1046
  }
1047
  ],
1048
  "logging_steps": 1,
1049
  "max_steps": 640,
1050
+ "num_input_tokens_seen": 9112297,
1051
  "num_train_epochs": 1,
1052
  "save_steps": 20,
1053
  "stateful_callbacks": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11bcd76e4cac994c7f4a4f0f72f38922a2e0592872e99e4bfd16fb340f40dfb4
3
  size 8465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ae74d09b5b242d5ca59c2266e1297852a0f23aabeea82e2a73b716a08ef1d73
3
  size 8465