zhiyang1 commited on
Commit
4a951c6
·
verified ·
1 Parent(s): 842eca1

Upload folder using huggingface_hub

Browse files
checkpoint-100/model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab08303da8a7de9502f945c203be3c58766f663fc9882b998fa91d3bbe450776
3
  size 4950516240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5ab56583d88a45107747efede4e2c57b6ead41b457b31501fa0ebc2c82f1953
3
  size 4950516240
checkpoint-100/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c454328ae5fc93c49bcbce71fcf1eeb8a273d1de3a9a50fbcb1f8b495cd3430
3
  size 7549751118
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e713f730f3fc554b5951ef09e0709025ef3b3fe2b8778ad8f77eddc07f4aedee
3
  size 7549751118
checkpoint-100/trainer_state.json CHANGED
@@ -10,702 +10,702 @@
10
  "log_history": [
11
  {
12
  "epoch": 4.9952545082171936e-05,
13
- "grad_norm": 0.87890625,
14
  "learning_rate": 0.0002,
15
- "loss": 3.1785,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 9.990509016434387e-05,
20
- "grad_norm": 0.92578125,
21
  "learning_rate": 0.0002,
22
- "loss": 2.9941,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.0001498576352465158,
27
- "grad_norm": 1.0859375,
28
  "learning_rate": 0.0002,
29
- "loss": 2.7646,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.00019981018032868775,
34
- "grad_norm": 1.0,
35
  "learning_rate": 0.0002,
36
- "loss": 2.6073,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.0002497627254108597,
41
- "grad_norm": 0.984375,
42
  "learning_rate": 0.0002,
43
- "loss": 2.3667,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.0002997152704930316,
48
- "grad_norm": 0.734375,
49
  "learning_rate": 0.0002,
50
- "loss": 2.0751,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.0003496678155752036,
55
- "grad_norm": 0.54296875,
56
  "learning_rate": 0.0002,
57
- "loss": 2.0617,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.0003996203606573755,
62
- "grad_norm": 0.423828125,
63
  "learning_rate": 0.0002,
64
- "loss": 1.9628,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.0004495729057395474,
69
- "grad_norm": 0.39453125,
70
  "learning_rate": 0.0002,
71
- "loss": 2.0122,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.0004995254508217194,
76
- "grad_norm": 0.3203125,
77
  "learning_rate": 0.0002,
78
- "loss": 1.9424,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.0005494779959038913,
83
- "grad_norm": 0.30859375,
84
  "learning_rate": 0.0002,
85
- "loss": 1.9736,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.0005994305409860632,
90
- "grad_norm": 0.25390625,
91
  "learning_rate": 0.0002,
92
- "loss": 1.9844,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.0006493830860682351,
97
  "grad_norm": 0.25390625,
98
  "learning_rate": 0.0002,
99
- "loss": 1.9095,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.0006993356311504072,
104
- "grad_norm": 0.21875,
105
  "learning_rate": 0.0002,
106
- "loss": 1.9302,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.0007492881762325791,
111
- "grad_norm": 0.2119140625,
112
  "learning_rate": 0.0002,
113
- "loss": 1.9764,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.000799240721314751,
118
- "grad_norm": 0.1923828125,
119
  "learning_rate": 0.0002,
120
- "loss": 1.9615,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.0008491932663969229,
125
- "grad_norm": 0.19140625,
126
  "learning_rate": 0.0002,
127
- "loss": 1.9642,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.0008991458114790948,
132
- "grad_norm": 0.18359375,
133
  "learning_rate": 0.0002,
134
- "loss": 1.9293,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.0009490983565612668,
139
- "grad_norm": 0.2041015625,
140
  "learning_rate": 0.0002,
141
- "loss": 1.9398,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.0009990509016434388,
146
- "grad_norm": 0.16015625,
147
  "learning_rate": 0.0002,
148
- "loss": 1.9432,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.0010490034467256107,
153
- "grad_norm": 0.1962890625,
154
  "learning_rate": 0.0002,
155
- "loss": 1.959,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.0010989559918077826,
160
- "grad_norm": 0.1611328125,
161
  "learning_rate": 0.0002,
162
- "loss": 1.9206,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.0011489085368899545,
167
- "grad_norm": 0.25390625,
168
  "learning_rate": 0.0002,
169
- "loss": 1.9494,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.0011988610819721264,
174
- "grad_norm": 0.158203125,
175
  "learning_rate": 0.0002,
176
- "loss": 1.9419,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.0012488136270542983,
181
- "grad_norm": 0.208984375,
182
  "learning_rate": 0.0002,
183
- "loss": 1.924,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.0012987661721364703,
188
- "grad_norm": 0.17578125,
189
  "learning_rate": 0.0002,
190
- "loss": 1.9857,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.0013487187172186424,
195
- "grad_norm": 0.1533203125,
196
  "learning_rate": 0.0002,
197
- "loss": 1.91,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.0013986712623008143,
202
- "grad_norm": 0.19140625,
203
  "learning_rate": 0.0002,
204
- "loss": 1.9722,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.0014486238073829862,
209
- "grad_norm": 0.1591796875,
210
  "learning_rate": 0.0002,
211
- "loss": 1.9358,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.0014985763524651581,
216
- "grad_norm": 0.138671875,
217
  "learning_rate": 0.0002,
218
- "loss": 1.927,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.00154852889754733,
223
- "grad_norm": 0.1298828125,
224
  "learning_rate": 0.0002,
225
- "loss": 1.9234,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.001598481442629502,
230
- "grad_norm": 0.1689453125,
231
  "learning_rate": 0.0002,
232
- "loss": 1.9374,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.0016484339877116739,
237
- "grad_norm": 0.134765625,
238
  "learning_rate": 0.0002,
239
- "loss": 1.9535,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.0016983865327938458,
244
- "grad_norm": 0.130859375,
245
  "learning_rate": 0.0002,
246
- "loss": 1.9122,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.0017483390778760177,
251
- "grad_norm": 0.1455078125,
252
  "learning_rate": 0.0002,
253
- "loss": 1.9185,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.0017982916229581896,
258
- "grad_norm": 0.1474609375,
259
  "learning_rate": 0.0002,
260
- "loss": 1.9548,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.0018482441680403618,
265
- "grad_norm": 0.1484375,
266
  "learning_rate": 0.0002,
267
- "loss": 1.9392,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.0018981967131225337,
272
- "grad_norm": 0.1533203125,
273
  "learning_rate": 0.0002,
274
- "loss": 1.9088,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.0019481492582047056,
279
- "grad_norm": 0.1416015625,
280
  "learning_rate": 0.0002,
281
- "loss": 1.9547,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.0019981018032868775,
286
- "grad_norm": 0.1630859375,
287
  "learning_rate": 0.0002,
288
- "loss": 1.9569,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.0020480543483690494,
293
- "grad_norm": 0.171875,
294
  "learning_rate": 0.0002,
295
- "loss": 1.9443,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.0020980068934512213,
300
- "grad_norm": 0.1376953125,
301
  "learning_rate": 0.0002,
302
- "loss": 1.9093,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.0021479594385333933,
307
- "grad_norm": 0.1748046875,
308
  "learning_rate": 0.0002,
309
- "loss": 1.9484,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.002197911983615565,
314
- "grad_norm": 0.140625,
315
  "learning_rate": 0.0002,
316
- "loss": 1.9196,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.002247864528697737,
321
- "grad_norm": 0.154296875,
322
  "learning_rate": 0.0002,
323
- "loss": 1.9028,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.002297817073779909,
328
- "grad_norm": 0.1591796875,
329
  "learning_rate": 0.0002,
330
- "loss": 1.9247,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.002347769618862081,
335
- "grad_norm": 0.1591796875,
336
  "learning_rate": 0.0002,
337
- "loss": 1.9197,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.002397722163944253,
342
- "grad_norm": 0.1669921875,
343
  "learning_rate": 0.0002,
344
- "loss": 1.929,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.0024476747090264248,
349
- "grad_norm": 0.169921875,
350
  "learning_rate": 0.0002,
351
- "loss": 1.9066,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.0024976272541085967,
356
- "grad_norm": 0.1708984375,
357
  "learning_rate": 0.0002,
358
- "loss": 1.954,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.0025475797991907686,
363
- "grad_norm": 0.1669921875,
364
  "learning_rate": 0.0002,
365
- "loss": 1.9443,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.0025975323442729405,
370
  "grad_norm": 0.1552734375,
371
  "learning_rate": 0.0002,
372
- "loss": 1.9144,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.002647484889355113,
377
- "grad_norm": 0.1748046875,
378
  "learning_rate": 0.0002,
379
- "loss": 1.8864,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.0026974374344372848,
384
- "grad_norm": 0.162109375,
385
  "learning_rate": 0.0002,
386
- "loss": 1.9331,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.0027473899795194567,
391
- "grad_norm": 0.15234375,
392
  "learning_rate": 0.0002,
393
- "loss": 1.9029,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.0027973425246016286,
398
- "grad_norm": 0.1513671875,
399
  "learning_rate": 0.0002,
400
- "loss": 1.9292,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.0028472950696838005,
405
- "grad_norm": 0.154296875,
406
  "learning_rate": 0.0002,
407
- "loss": 1.9331,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.0028972476147659724,
412
- "grad_norm": 0.1572265625,
413
  "learning_rate": 0.0002,
414
- "loss": 1.9132,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.0029472001598481443,
419
- "grad_norm": 0.181640625,
420
  "learning_rate": 0.0002,
421
- "loss": 1.8793,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.0029971527049303163,
426
- "grad_norm": 0.2490234375,
427
  "learning_rate": 0.0002,
428
- "loss": 1.8864,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.003047105250012488,
433
- "grad_norm": 0.18359375,
434
  "learning_rate": 0.0002,
435
- "loss": 1.8755,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.00309705779509466,
440
- "grad_norm": 0.171875,
441
  "learning_rate": 0.0002,
442
- "loss": 1.9099,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.003147010340176832,
447
- "grad_norm": 0.1982421875,
448
  "learning_rate": 0.0002,
449
- "loss": 1.9434,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.003196962885259004,
454
- "grad_norm": 0.171875,
455
  "learning_rate": 0.0002,
456
- "loss": 1.9597,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.003246915430341176,
461
- "grad_norm": 0.169921875,
462
  "learning_rate": 0.0002,
463
- "loss": 1.8813,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.0032968679754233478,
468
- "grad_norm": 0.1806640625,
469
  "learning_rate": 0.0002,
470
- "loss": 1.9121,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.0033468205205055197,
475
- "grad_norm": 0.1845703125,
476
  "learning_rate": 0.0002,
477
- "loss": 1.8903,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.0033967730655876916,
482
- "grad_norm": 0.208984375,
483
  "learning_rate": 0.0002,
484
- "loss": 1.892,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.0034467256106698635,
489
- "grad_norm": 0.16796875,
490
  "learning_rate": 0.0002,
491
- "loss": 1.9102,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.0034966781557520354,
496
- "grad_norm": 0.25390625,
497
  "learning_rate": 0.0002,
498
- "loss": 1.9265,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.0035466307008342073,
503
- "grad_norm": 0.2060546875,
504
  "learning_rate": 0.0002,
505
- "loss": 1.9059,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.0035965832459163793,
510
- "grad_norm": 0.2451171875,
511
  "learning_rate": 0.0002,
512
- "loss": 1.8916,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.003646535790998551,
517
- "grad_norm": 0.2041015625,
518
  "learning_rate": 0.0002,
519
- "loss": 1.9525,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.0036964883360807235,
524
- "grad_norm": 0.2197265625,
525
  "learning_rate": 0.0002,
526
- "loss": 1.912,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.0037464408811628954,
531
- "grad_norm": 0.1669921875,
532
  "learning_rate": 0.0002,
533
- "loss": 1.8936,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.0037963934262450674,
538
- "grad_norm": 0.2001953125,
539
  "learning_rate": 0.0002,
540
- "loss": 1.8205,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.0038463459713272393,
545
- "grad_norm": 0.2275390625,
546
  "learning_rate": 0.0002,
547
- "loss": 1.8902,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.003896298516409411,
552
- "grad_norm": 0.1884765625,
553
  "learning_rate": 0.0002,
554
- "loss": 1.8934,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.003946251061491583,
559
- "grad_norm": 0.2138671875,
560
  "learning_rate": 0.0002,
561
- "loss": 1.8697,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.003996203606573755,
566
- "grad_norm": 0.240234375,
567
  "learning_rate": 0.0002,
568
- "loss": 1.8818,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.004046156151655927,
573
- "grad_norm": 0.2041015625,
574
  "learning_rate": 0.0002,
575
- "loss": 1.8684,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.004096108696738099,
580
- "grad_norm": 0.2001953125,
581
  "learning_rate": 0.0002,
582
- "loss": 1.8652,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.004146061241820271,
587
- "grad_norm": 0.1826171875,
588
  "learning_rate": 0.0002,
589
- "loss": 1.9427,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.004196013786902443,
594
- "grad_norm": 0.197265625,
595
  "learning_rate": 0.0002,
596
- "loss": 1.8898,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.004245966331984615,
601
- "grad_norm": 0.1953125,
602
  "learning_rate": 0.0002,
603
- "loss": 1.8865,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.0042959188770667865,
608
- "grad_norm": 0.208984375,
609
  "learning_rate": 0.0002,
610
- "loss": 1.9,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.004345871422148958,
615
- "grad_norm": 0.224609375,
616
  "learning_rate": 0.0002,
617
- "loss": 1.8938,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.00439582396723113,
622
- "grad_norm": 0.1962890625,
623
  "learning_rate": 0.0002,
624
- "loss": 1.8586,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.004445776512313302,
629
- "grad_norm": 0.2041015625,
630
  "learning_rate": 0.0002,
631
- "loss": 1.9017,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.004495729057395474,
636
- "grad_norm": 0.216796875,
637
  "learning_rate": 0.0002,
638
- "loss": 1.8907,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.004545681602477646,
643
- "grad_norm": 0.2021484375,
644
  "learning_rate": 0.0002,
645
- "loss": 1.9174,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.004595634147559818,
650
- "grad_norm": 0.2021484375,
651
  "learning_rate": 0.0002,
652
- "loss": 1.8972,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.00464558669264199,
657
- "grad_norm": 0.236328125,
658
  "learning_rate": 0.0002,
659
- "loss": 1.8409,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.004695539237724162,
664
- "grad_norm": 0.19921875,
665
  "learning_rate": 0.0002,
666
- "loss": 1.9078,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.004745491782806334,
671
- "grad_norm": 0.197265625,
672
  "learning_rate": 0.0002,
673
- "loss": 1.8524,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.004795444327888506,
678
- "grad_norm": 0.205078125,
679
  "learning_rate": 0.0002,
680
- "loss": 1.8636,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.004845396872970678,
685
- "grad_norm": 0.189453125,
686
  "learning_rate": 0.0002,
687
- "loss": 1.86,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.0048953494180528495,
692
- "grad_norm": 0.193359375,
693
  "learning_rate": 0.0002,
694
- "loss": 1.8506,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.004945301963135021,
699
- "grad_norm": 0.205078125,
700
  "learning_rate": 0.0002,
701
- "loss": 1.8516,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.004995254508217193,
706
- "grad_norm": 0.1953125,
707
  "learning_rate": 0.0002,
708
- "loss": 1.8619,
709
  "step": 100
710
  }
711
  ],
@@ -713,7 +713,7 @@
713
  "max_steps": 12011400,
714
  "num_input_tokens_seen": 0,
715
  "num_train_epochs": 600,
716
- "save_steps": 50,
717
  "stateful_callbacks": {
718
  "TrainerControl": {
719
  "args": {
 
10
  "log_history": [
11
  {
12
  "epoch": 4.9952545082171936e-05,
13
+ "grad_norm": 0.86328125,
14
  "learning_rate": 0.0002,
15
+ "loss": 3.1772,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 9.990509016434387e-05,
20
+ "grad_norm": 0.8515625,
21
  "learning_rate": 0.0002,
22
+ "loss": 2.9924,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.0001498576352465158,
27
+ "grad_norm": 1.078125,
28
  "learning_rate": 0.0002,
29
+ "loss": 2.7756,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.00019981018032868775,
34
+ "grad_norm": 0.92578125,
35
  "learning_rate": 0.0002,
36
+ "loss": 2.6268,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.0002497627254108597,
41
+ "grad_norm": 0.97265625,
42
  "learning_rate": 0.0002,
43
+ "loss": 2.3838,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.0002997152704930316,
48
+ "grad_norm": 0.80078125,
49
  "learning_rate": 0.0002,
50
+ "loss": 2.1048,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.0003496678155752036,
55
+ "grad_norm": 0.625,
56
  "learning_rate": 0.0002,
57
+ "loss": 2.0735,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.0003996203606573755,
62
+ "grad_norm": 0.46875,
63
  "learning_rate": 0.0002,
64
+ "loss": 1.969,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.0004495729057395474,
69
+ "grad_norm": 0.427734375,
70
  "learning_rate": 0.0002,
71
+ "loss": 2.0166,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.0004995254508217194,
76
+ "grad_norm": 0.33203125,
77
  "learning_rate": 0.0002,
78
+ "loss": 1.9446,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.0005494779959038913,
83
+ "grad_norm": 0.31640625,
84
  "learning_rate": 0.0002,
85
+ "loss": 1.975,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.0005994305409860632,
90
+ "grad_norm": 0.267578125,
91
  "learning_rate": 0.0002,
92
+ "loss": 1.9867,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.0006493830860682351,
97
  "grad_norm": 0.25390625,
98
  "learning_rate": 0.0002,
99
+ "loss": 1.9101,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.0006993356311504072,
104
+ "grad_norm": 0.2109375,
105
  "learning_rate": 0.0002,
106
+ "loss": 1.9296,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.0007492881762325791,
111
+ "grad_norm": 0.2158203125,
112
  "learning_rate": 0.0002,
113
+ "loss": 1.9783,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.000799240721314751,
118
+ "grad_norm": 0.1982421875,
119
  "learning_rate": 0.0002,
120
+ "loss": 1.9612,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.0008491932663969229,
125
+ "grad_norm": 0.2138671875,
126
  "learning_rate": 0.0002,
127
+ "loss": 1.9648,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.0008991458114790948,
132
+ "grad_norm": 0.197265625,
133
  "learning_rate": 0.0002,
134
+ "loss": 1.9303,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.0009490983565612668,
139
+ "grad_norm": 0.201171875,
140
  "learning_rate": 0.0002,
141
+ "loss": 1.9416,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.0009990509016434388,
146
+ "grad_norm": 0.1904296875,
147
  "learning_rate": 0.0002,
148
+ "loss": 1.9454,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.0010490034467256107,
153
+ "grad_norm": 0.2001953125,
154
  "learning_rate": 0.0002,
155
+ "loss": 1.9603,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.0010989559918077826,
160
+ "grad_norm": 0.171875,
161
  "learning_rate": 0.0002,
162
+ "loss": 1.9216,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.0011489085368899545,
167
+ "grad_norm": 0.2255859375,
168
  "learning_rate": 0.0002,
169
+ "loss": 1.9503,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.0011988610819721264,
174
+ "grad_norm": 0.1572265625,
175
  "learning_rate": 0.0002,
176
+ "loss": 1.9421,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.0012488136270542983,
181
+ "grad_norm": 0.1591796875,
182
  "learning_rate": 0.0002,
183
+ "loss": 1.9211,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.0012987661721364703,
188
+ "grad_norm": 0.1611328125,
189
  "learning_rate": 0.0002,
190
+ "loss": 1.9854,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.0013487187172186424,
195
+ "grad_norm": 0.1455078125,
196
  "learning_rate": 0.0002,
197
+ "loss": 1.9081,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.0013986712623008143,
202
+ "grad_norm": 0.15625,
203
  "learning_rate": 0.0002,
204
+ "loss": 1.9687,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.0014486238073829862,
209
+ "grad_norm": 0.1494140625,
210
  "learning_rate": 0.0002,
211
+ "loss": 1.9339,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.0014985763524651581,
216
+ "grad_norm": 0.1484375,
217
  "learning_rate": 0.0002,
218
+ "loss": 1.9282,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.00154852889754733,
223
+ "grad_norm": 0.138671875,
224
  "learning_rate": 0.0002,
225
+ "loss": 1.9222,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.001598481442629502,
230
+ "grad_norm": 0.158203125,
231
  "learning_rate": 0.0002,
232
+ "loss": 1.937,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.0016484339877116739,
237
+ "grad_norm": 0.1455078125,
238
  "learning_rate": 0.0002,
239
+ "loss": 1.9536,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.0016983865327938458,
244
+ "grad_norm": 0.13671875,
245
  "learning_rate": 0.0002,
246
+ "loss": 1.9108,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.0017483390778760177,
251
+ "grad_norm": 0.1376953125,
252
  "learning_rate": 0.0002,
253
+ "loss": 1.9168,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.0017982916229581896,
258
+ "grad_norm": 0.1435546875,
259
  "learning_rate": 0.0002,
260
+ "loss": 1.9554,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.0018482441680403618,
265
+ "grad_norm": 0.1396484375,
266
  "learning_rate": 0.0002,
267
+ "loss": 1.9365,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.0018981967131225337,
272
+ "grad_norm": 0.130859375,
273
  "learning_rate": 0.0002,
274
+ "loss": 1.9047,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.0019481492582047056,
279
+ "grad_norm": 0.138671875,
280
  "learning_rate": 0.0002,
281
+ "loss": 1.953,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.0019981018032868775,
286
+ "grad_norm": 0.146484375,
287
  "learning_rate": 0.0002,
288
+ "loss": 1.9552,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.0020480543483690494,
293
+ "grad_norm": 0.140625,
294
  "learning_rate": 0.0002,
295
+ "loss": 1.9421,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.0020980068934512213,
300
+ "grad_norm": 0.134765625,
301
  "learning_rate": 0.0002,
302
+ "loss": 1.9078,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.0021479594385333933,
307
+ "grad_norm": 0.1669921875,
308
  "learning_rate": 0.0002,
309
+ "loss": 1.9458,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.002197911983615565,
314
+ "grad_norm": 0.1357421875,
315
  "learning_rate": 0.0002,
316
+ "loss": 1.9172,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.002247864528697737,
321
+ "grad_norm": 0.1396484375,
322
  "learning_rate": 0.0002,
323
+ "loss": 1.8999,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.002297817073779909,
328
+ "grad_norm": 0.1435546875,
329
  "learning_rate": 0.0002,
330
+ "loss": 1.9224,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.002347769618862081,
335
+ "grad_norm": 0.18359375,
336
  "learning_rate": 0.0002,
337
+ "loss": 1.9183,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.002397722163944253,
342
+ "grad_norm": 0.1630859375,
343
  "learning_rate": 0.0002,
344
+ "loss": 1.9258,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.0024476747090264248,
349
+ "grad_norm": 0.1689453125,
350
  "learning_rate": 0.0002,
351
+ "loss": 1.9046,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.0024976272541085967,
356
+ "grad_norm": 0.16015625,
357
  "learning_rate": 0.0002,
358
+ "loss": 1.9506,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.0025475797991907686,
363
+ "grad_norm": 0.171875,
364
  "learning_rate": 0.0002,
365
+ "loss": 1.9396,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.0025975323442729405,
370
  "grad_norm": 0.1552734375,
371
  "learning_rate": 0.0002,
372
+ "loss": 1.9131,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.002647484889355113,
377
+ "grad_norm": 0.1591796875,
378
  "learning_rate": 0.0002,
379
+ "loss": 1.8852,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.0026974374344372848,
384
+ "grad_norm": 0.154296875,
385
  "learning_rate": 0.0002,
386
+ "loss": 1.9306,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.0027473899795194567,
391
+ "grad_norm": 0.14453125,
392
  "learning_rate": 0.0002,
393
+ "loss": 1.8989,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.0027973425246016286,
398
+ "grad_norm": 0.154296875,
399
  "learning_rate": 0.0002,
400
+ "loss": 1.9273,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.0028472950696838005,
405
+ "grad_norm": 0.16015625,
406
  "learning_rate": 0.0002,
407
+ "loss": 1.9314,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.0028972476147659724,
412
+ "grad_norm": 0.1474609375,
413
  "learning_rate": 0.0002,
414
+ "loss": 1.9098,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.0029472001598481443,
419
+ "grad_norm": 0.1884765625,
420
  "learning_rate": 0.0002,
421
+ "loss": 1.8778,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.0029971527049303163,
426
+ "grad_norm": 0.259765625,
427
  "learning_rate": 0.0002,
428
+ "loss": 1.8856,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.003047105250012488,
433
+ "grad_norm": 0.21484375,
434
  "learning_rate": 0.0002,
435
+ "loss": 1.8735,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.00309705779509466,
440
+ "grad_norm": 0.154296875,
441
  "learning_rate": 0.0002,
442
+ "loss": 1.9047,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.003147010340176832,
447
+ "grad_norm": 0.1806640625,
448
  "learning_rate": 0.0002,
449
+ "loss": 1.9401,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.003196962885259004,
454
+ "grad_norm": 0.1826171875,
455
  "learning_rate": 0.0002,
456
+ "loss": 1.9593,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.003246915430341176,
461
+ "grad_norm": 0.1494140625,
462
  "learning_rate": 0.0002,
463
+ "loss": 1.8762,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.0032968679754233478,
468
+ "grad_norm": 0.1943359375,
469
  "learning_rate": 0.0002,
470
+ "loss": 1.9102,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.0033468205205055197,
475
+ "grad_norm": 0.203125,
476
  "learning_rate": 0.0002,
477
+ "loss": 1.8891,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.0033967730655876916,
482
+ "grad_norm": 0.1748046875,
483
  "learning_rate": 0.0002,
484
+ "loss": 1.89,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.0034467256106698635,
489
+ "grad_norm": 0.1591796875,
490
  "learning_rate": 0.0002,
491
+ "loss": 1.9058,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.0034966781557520354,
496
+ "grad_norm": 0.1875,
497
  "learning_rate": 0.0002,
498
+ "loss": 1.921,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.0035466307008342073,
503
+ "grad_norm": 0.169921875,
504
  "learning_rate": 0.0002,
505
+ "loss": 1.9024,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.0035965832459163793,
510
+ "grad_norm": 0.201171875,
511
  "learning_rate": 0.0002,
512
+ "loss": 1.888,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.003646535790998551,
517
+ "grad_norm": 0.1923828125,
518
  "learning_rate": 0.0002,
519
+ "loss": 1.9482,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.0036964883360807235,
524
+ "grad_norm": 0.19140625,
525
  "learning_rate": 0.0002,
526
+ "loss": 1.9083,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.0037464408811628954,
531
+ "grad_norm": 0.177734375,
532
  "learning_rate": 0.0002,
533
+ "loss": 1.8941,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.0037963934262450674,
538
+ "grad_norm": 0.1875,
539
  "learning_rate": 0.0002,
540
+ "loss": 1.8182,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.0038463459713272393,
545
+ "grad_norm": 0.2001953125,
546
  "learning_rate": 0.0002,
547
+ "loss": 1.8851,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.003896298516409411,
552
+ "grad_norm": 0.1806640625,
553
  "learning_rate": 0.0002,
554
+ "loss": 1.8908,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.003946251061491583,
559
+ "grad_norm": 0.17578125,
560
  "learning_rate": 0.0002,
561
+ "loss": 1.865,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.003996203606573755,
566
+ "grad_norm": 0.1884765625,
567
  "learning_rate": 0.0002,
568
+ "loss": 1.8739,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.004046156151655927,
573
+ "grad_norm": 0.2001953125,
574
  "learning_rate": 0.0002,
575
+ "loss": 1.8663,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.004096108696738099,
580
+ "grad_norm": 0.1708984375,
581
  "learning_rate": 0.0002,
582
+ "loss": 1.8611,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.004146061241820271,
587
+ "grad_norm": 0.19921875,
588
  "learning_rate": 0.0002,
589
+ "loss": 1.9407,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.004196013786902443,
594
+ "grad_norm": 0.177734375,
595
  "learning_rate": 0.0002,
596
+ "loss": 1.8856,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.004245966331984615,
601
+ "grad_norm": 0.193359375,
602
  "learning_rate": 0.0002,
603
+ "loss": 1.8824,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.0042959188770667865,
608
+ "grad_norm": 0.1787109375,
609
  "learning_rate": 0.0002,
610
+ "loss": 1.8966,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.004345871422148958,
615
+ "grad_norm": 0.19140625,
616
  "learning_rate": 0.0002,
617
+ "loss": 1.889,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.00439582396723113,
622
+ "grad_norm": 0.1884765625,
623
  "learning_rate": 0.0002,
624
+ "loss": 1.8558,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.004445776512313302,
629
+ "grad_norm": 0.189453125,
630
  "learning_rate": 0.0002,
631
+ "loss": 1.8981,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.004495729057395474,
636
+ "grad_norm": 0.189453125,
637
  "learning_rate": 0.0002,
638
+ "loss": 1.8875,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.004545681602477646,
643
+ "grad_norm": 0.248046875,
644
  "learning_rate": 0.0002,
645
+ "loss": 1.9172,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.004595634147559818,
650
+ "grad_norm": 0.189453125,
651
  "learning_rate": 0.0002,
652
+ "loss": 1.8927,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.00464558669264199,
657
+ "grad_norm": 0.35546875,
658
  "learning_rate": 0.0002,
659
+ "loss": 1.8491,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.004695539237724162,
664
+ "grad_norm": 0.255859375,
665
  "learning_rate": 0.0002,
666
+ "loss": 1.9081,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.004745491782806334,
671
+ "grad_norm": 0.255859375,
672
  "learning_rate": 0.0002,
673
+ "loss": 1.8536,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.004795444327888506,
678
+ "grad_norm": 0.2080078125,
679
  "learning_rate": 0.0002,
680
+ "loss": 1.8619,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.004845396872970678,
685
+ "grad_norm": 0.216796875,
686
  "learning_rate": 0.0002,
687
+ "loss": 1.8592,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.0048953494180528495,
692
+ "grad_norm": 0.2119140625,
693
  "learning_rate": 0.0002,
694
+ "loss": 1.8504,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.004945301963135021,
699
+ "grad_norm": 0.2177734375,
700
  "learning_rate": 0.0002,
701
+ "loss": 1.851,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.004995254508217193,
706
+ "grad_norm": 0.1982421875,
707
  "learning_rate": 0.0002,
708
+ "loss": 1.8595,
709
  "step": 100
710
  }
711
  ],
 
713
  "max_steps": 12011400,
714
  "num_input_tokens_seen": 0,
715
  "num_train_epochs": 600,
716
+ "save_steps": 20,
717
  "stateful_callbacks": {
718
  "TrainerControl": {
719
  "args": {
checkpoint-100/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27cb4278e5fc9b69e214c7d2e16bea1fda610c0ac9e44a2316a81663cff89438
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:044c3e4091c3ce376fc056b76f8ea0cb40bb5636f7e81bb50c8b132c3cacb030
3
  size 5944