DmitryYarov committed on
Commit e19a10b · verified · 1 Parent(s): 3aadaa4

Upload folder using huggingface_hub

checkpoint-2436/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_name_or_path": "ai-forever/rugpt3small_based_on_gpt2",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 1,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 2,
+ "gradient_checkpointing": false,
+ "id2label": {
+ "0": "LABEL_0"
+ },
+ "initializer_range": 0.02,
+ "label2id": {
+ "LABEL_0": 0
+ },
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 2048,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 2048,
+ "pad_token_id": 0,
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.48.3",
+ "use_cache": true,
+ "vocab_size": 50264
+ }
checkpoint-2436/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "transformers_version": "4.48.3"
+ }
checkpoint-2436/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa4f68ef82234dcec67d2093eb642f04208c0f17a3ee5478daf2e30df8eb83d2
+ size 500941440
checkpoint-2436/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a8ae197c56357756193df201987ddfb97fabcae78a99b060c2df40efe1909af
+ size 1389114
checkpoint-2436/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:817951fee2eb40d47fa8cb26992bc5cf4ccc62d0f032d75e3aa49b3142f2184a
+ size 14244
checkpoint-2436/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be1ac34f0f3f8a970a6b9152cdcffd1d7a6eceb6bc68f995045893573d52ca85
+ size 1064
checkpoint-2436/trainer_state.json ADDED
@@ -0,0 +1,474 @@
+ {
+ "best_metric": 5.091330528259277,
+ "best_model_checkpoint": "aristotle_new_layer_plain/checkpoint-2436",
+ "epoch": 12.0,
+ "eval_steps": 500,
+ "global_step": 2436,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.24721878862793573,
+ "grad_norm": 4.709590435028076,
+ "learning_rate": 5e-06,
+ "loss": 10.2189,
+ "step": 50
+ },
+ {
+ "epoch": 0.49443757725587145,
+ "grad_norm": 3.6981208324432373,
+ "learning_rate": 1e-05,
+ "loss": 9.1727,
+ "step": 100
+ },
+ {
+ "epoch": 0.7416563658838071,
+ "grad_norm": 3.9341259002685547,
+ "learning_rate": 1.5e-05,
+ "loss": 8.6877,
+ "step": 150
+ },
+ {
+ "epoch": 0.9888751545117429,
+ "grad_norm": 3.341215133666992,
+ "learning_rate": 2e-05,
+ "loss": 8.1544,
+ "step": 200
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 7.7336225509643555,
+ "eval_runtime": 14.2669,
+ "eval_samples_per_second": 50.396,
+ "eval_steps_per_second": 6.308,
+ "step": 203
+ },
+ {
+ "epoch": 1.2323856613102595,
+ "grad_norm": 4.8560404777526855,
+ "learning_rate": 2.5e-05,
+ "loss": 7.5653,
+ "step": 250
+ },
+ {
+ "epoch": 1.4796044499381953,
+ "grad_norm": 3.1774024963378906,
+ "learning_rate": 3e-05,
+ "loss": 7.1093,
+ "step": 300
+ },
+ {
+ "epoch": 1.726823238566131,
+ "grad_norm": 4.33836030960083,
+ "learning_rate": 3.5e-05,
+ "loss": 6.7529,
+ "step": 350
+ },
+ {
+ "epoch": 1.9740420271940669,
+ "grad_norm": 2.5972180366516113,
+ "learning_rate": 4e-05,
+ "loss": 6.536,
+ "step": 400
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 6.426527500152588,
+ "eval_runtime": 14.2681,
+ "eval_samples_per_second": 50.392,
+ "eval_steps_per_second": 6.308,
+ "step": 406
+ },
+ {
+ "epoch": 2.2175525339925835,
+ "grad_norm": 3.43729567527771,
+ "learning_rate": 4.5e-05,
+ "loss": 6.3559,
+ "step": 450
+ },
+ {
+ "epoch": 2.464771322620519,
+ "grad_norm": 3.318251848220825,
+ "learning_rate": 5e-05,
+ "loss": 6.3251,
+ "step": 500
+ },
+ {
+ "epoch": 2.711990111248455,
+ "grad_norm": 3.502115488052368,
+ "learning_rate": 4.9550359712230215e-05,
+ "loss": 6.1368,
+ "step": 550
+ },
+ {
+ "epoch": 2.9592088998763906,
+ "grad_norm": 3.497938394546509,
+ "learning_rate": 4.9100719424460435e-05,
+ "loss": 6.0775,
+ "step": 600
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 6.036961555480957,
+ "eval_runtime": 14.2475,
+ "eval_samples_per_second": 50.465,
+ "eval_steps_per_second": 6.317,
+ "step": 609
+ },
+ {
+ "epoch": 3.202719406674907,
+ "grad_norm": 3.4355390071868896,
+ "learning_rate": 4.865107913669065e-05,
+ "loss": 5.8684,
+ "step": 650
+ },
+ {
+ "epoch": 3.449938195302843,
+ "grad_norm": 3.9220526218414307,
+ "learning_rate": 4.820143884892087e-05,
+ "loss": 5.8101,
+ "step": 700
+ },
+ {
+ "epoch": 3.6971569839307787,
+ "grad_norm": 3.782421827316284,
+ "learning_rate": 4.775179856115108e-05,
+ "loss": 5.7784,
+ "step": 750
+ },
+ {
+ "epoch": 3.9443757725587143,
+ "grad_norm": 3.5181567668914795,
+ "learning_rate": 4.7302158273381294e-05,
+ "loss": 5.7181,
+ "step": 800
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 5.772126197814941,
+ "eval_runtime": 14.2949,
+ "eval_samples_per_second": 50.298,
+ "eval_steps_per_second": 6.296,
+ "step": 812
+ },
+ {
+ "epoch": 4.187886279357231,
+ "grad_norm": 3.6087594032287598,
+ "learning_rate": 4.685251798561151e-05,
+ "loss": 5.5154,
+ "step": 850
+ },
+ {
+ "epoch": 4.435105067985167,
+ "grad_norm": 3.8448667526245117,
+ "learning_rate": 4.640287769784173e-05,
+ "loss": 5.4664,
+ "step": 900
+ },
+ {
+ "epoch": 4.6823238566131025,
+ "grad_norm": 3.594693660736084,
+ "learning_rate": 4.595323741007194e-05,
+ "loss": 5.4121,
+ "step": 950
+ },
+ {
+ "epoch": 4.929542645241038,
+ "grad_norm": 3.6225693225860596,
+ "learning_rate": 4.550359712230216e-05,
+ "loss": 5.3158,
+ "step": 1000
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 5.521895885467529,
+ "eval_runtime": 14.2777,
+ "eval_samples_per_second": 50.358,
+ "eval_steps_per_second": 6.304,
+ "step": 1015
+ },
+ {
+ "epoch": 5.173053152039555,
+ "grad_norm": 4.245815277099609,
+ "learning_rate": 4.505395683453237e-05,
+ "loss": 5.1744,
+ "step": 1050
+ },
+ {
+ "epoch": 5.420271940667491,
+ "grad_norm": 4.306251525878906,
+ "learning_rate": 4.460431654676259e-05,
+ "loss": 5.1245,
+ "step": 1100
+ },
+ {
+ "epoch": 5.667490729295427,
+ "grad_norm": 3.7834959030151367,
+ "learning_rate": 4.4154676258992806e-05,
+ "loss": 5.0729,
+ "step": 1150
+ },
+ {
+ "epoch": 5.914709517923362,
+ "grad_norm": 4.298359394073486,
+ "learning_rate": 4.3705035971223026e-05,
+ "loss": 5.0558,
+ "step": 1200
+ },
+ {
+ "epoch": 6.0,
+ "eval_loss": 5.389599323272705,
+ "eval_runtime": 14.2956,
+ "eval_samples_per_second": 50.295,
+ "eval_steps_per_second": 6.296,
+ "step": 1218
+ },
+ {
+ "epoch": 6.158220024721879,
+ "grad_norm": 4.416134357452393,
+ "learning_rate": 4.325539568345324e-05,
+ "loss": 4.839,
+ "step": 1250
+ },
+ {
+ "epoch": 6.405438813349814,
+ "grad_norm": 4.565963268280029,
+ "learning_rate": 4.280575539568346e-05,
+ "loss": 4.8297,
+ "step": 1300
+ },
+ {
+ "epoch": 6.652657601977751,
+ "grad_norm": 4.854921817779541,
+ "learning_rate": 4.235611510791367e-05,
+ "loss": 4.8175,
+ "step": 1350
+ },
+ {
+ "epoch": 6.899876390605686,
+ "grad_norm": 4.982056617736816,
+ "learning_rate": 4.1906474820143885e-05,
+ "loss": 4.8081,
+ "step": 1400
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 5.254246711730957,
+ "eval_runtime": 14.2665,
+ "eval_samples_per_second": 50.398,
+ "eval_steps_per_second": 6.308,
+ "step": 1421
+ },
+ {
+ "epoch": 7.143386897404203,
+ "grad_norm": 4.195478439331055,
+ "learning_rate": 4.14568345323741e-05,
+ "loss": 4.6322,
+ "step": 1450
+ },
+ {
+ "epoch": 7.3906056860321385,
+ "grad_norm": 4.963181972503662,
+ "learning_rate": 4.100719424460432e-05,
+ "loss": 4.547,
+ "step": 1500
+ },
+ {
+ "epoch": 7.637824474660074,
+ "grad_norm": 5.290962219238281,
+ "learning_rate": 4.055755395683453e-05,
+ "loss": 4.5553,
+ "step": 1550
+ },
+ {
+ "epoch": 7.88504326328801,
+ "grad_norm": 5.0038838386535645,
+ "learning_rate": 4.010791366906475e-05,
+ "loss": 4.5651,
+ "step": 1600
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 5.183382987976074,
+ "eval_runtime": 14.2977,
+ "eval_samples_per_second": 50.288,
+ "eval_steps_per_second": 6.295,
+ "step": 1624
+ },
+ {
+ "epoch": 8.128553770086526,
+ "grad_norm": 5.3380446434021,
+ "learning_rate": 3.965827338129496e-05,
+ "loss": 4.3966,
+ "step": 1650
+ },
+ {
+ "epoch": 8.375772558714463,
+ "grad_norm": 5.339470863342285,
+ "learning_rate": 3.920863309352518e-05,
+ "loss": 4.3068,
+ "step": 1700
+ },
+ {
+ "epoch": 8.622991347342397,
+ "grad_norm": 4.9476189613342285,
+ "learning_rate": 3.8758992805755396e-05,
+ "loss": 4.3249,
+ "step": 1750
+ },
+ {
+ "epoch": 8.870210135970334,
+ "grad_norm": 5.430028915405273,
+ "learning_rate": 3.8309352517985616e-05,
+ "loss": 4.3407,
+ "step": 1800
+ },
+ {
+ "epoch": 9.0,
+ "eval_loss": 5.13620138168335,
+ "eval_runtime": 14.2616,
+ "eval_samples_per_second": 50.415,
+ "eval_steps_per_second": 6.311,
+ "step": 1827
+ },
+ {
+ "epoch": 9.11372064276885,
+ "grad_norm": 5.2561259269714355,
+ "learning_rate": 3.785971223021583e-05,
+ "loss": 4.1746,
+ "step": 1850
+ },
+ {
+ "epoch": 9.360939431396787,
+ "grad_norm": 5.811314105987549,
+ "learning_rate": 3.741007194244605e-05,
+ "loss": 4.1324,
+ "step": 1900
+ },
+ {
+ "epoch": 9.608158220024722,
+ "grad_norm": 5.552155017852783,
+ "learning_rate": 3.696043165467626e-05,
+ "loss": 4.1058,
+ "step": 1950
+ },
+ {
+ "epoch": 9.855377008652658,
+ "grad_norm": 6.073920726776123,
+ "learning_rate": 3.6510791366906475e-05,
+ "loss": 4.0436,
+ "step": 2000
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 5.104895114898682,
+ "eval_runtime": 14.2556,
+ "eval_samples_per_second": 50.436,
+ "eval_steps_per_second": 6.313,
+ "step": 2030
+ },
+ {
+ "epoch": 10.098887515451175,
+ "grad_norm": 5.994938373565674,
+ "learning_rate": 3.606115107913669e-05,
+ "loss": 3.99,
+ "step": 2050
+ },
+ {
+ "epoch": 10.34610630407911,
+ "grad_norm": 6.414961814880371,
+ "learning_rate": 3.561151079136691e-05,
+ "loss": 3.9013,
+ "step": 2100
+ },
+ {
+ "epoch": 10.593325092707046,
+ "grad_norm": 6.1248459815979,
+ "learning_rate": 3.516187050359712e-05,
+ "loss": 3.8884,
+ "step": 2150
+ },
+ {
+ "epoch": 10.840543881334982,
+ "grad_norm": 5.360867500305176,
+ "learning_rate": 3.471223021582734e-05,
+ "loss": 3.877,
+ "step": 2200
+ },
+ {
+ "epoch": 11.0,
+ "eval_loss": 5.103781700134277,
+ "eval_runtime": 14.2705,
+ "eval_samples_per_second": 50.384,
+ "eval_steps_per_second": 6.307,
+ "step": 2233
+ },
+ {
+ "epoch": 11.084054388133499,
+ "grad_norm": 5.840531349182129,
+ "learning_rate": 3.4262589928057554e-05,
+ "loss": 3.8216,
+ "step": 2250
+ },
+ {
+ "epoch": 11.331273176761433,
+ "grad_norm": 7.407821178436279,
+ "learning_rate": 3.3812949640287773e-05,
+ "loss": 3.6379,
+ "step": 2300
+ },
+ {
+ "epoch": 11.57849196538937,
+ "grad_norm": 7.770689487457275,
+ "learning_rate": 3.3363309352517986e-05,
+ "loss": 3.7063,
+ "step": 2350
+ },
+ {
+ "epoch": 11.825710754017305,
+ "grad_norm": 6.17808198928833,
+ "learning_rate": 3.2913669064748206e-05,
+ "loss": 3.7008,
+ "step": 2400
+ },
+ {
+ "epoch": 12.0,
+ "eval_loss": 5.091330528259277,
+ "eval_runtime": 14.3396,
+ "eval_samples_per_second": 50.141,
+ "eval_steps_per_second": 6.276,
+ "step": 2436
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 6060,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 30,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "EarlyStoppingCallback": {
+ "args": {
+ "early_stopping_patience": 3,
+ "early_stopping_threshold": 0.0
+ },
+ "attributes": {
+ "early_stopping_patience_counter": 0
+ }
+ },
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.0280442355712e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-2436/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28ec087a0e6786ef2e0bcf02310b767379b6de459d7cb254c0c6ae4e881ba0e1
+ size 5304
checkpoint-2842/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_name_or_path": "ai-forever/rugpt3small_based_on_gpt2",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 1,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 2,
+ "gradient_checkpointing": false,
+ "id2label": {
+ "0": "LABEL_0"
+ },
+ "initializer_range": 0.02,
+ "label2id": {
+ "LABEL_0": 0
+ },
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 2048,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 2048,
+ "pad_token_id": 0,
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.48.3",
+ "use_cache": true,
+ "vocab_size": 50264
+ }
checkpoint-2842/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "transformers_version": "4.48.3"
+ }
checkpoint-2842/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:706dd31f608d02938920be68d84bb0433225705f634457946541a30c69fb4908
+ size 500941440
checkpoint-2842/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:509b4f4bc35322abd963fd780f5a4f07543d3a44ad818d5f1eadac5fbcaa80cd
+ size 1389114
checkpoint-2842/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a0e136fffb6d352ed6840c10d527267eee09f0ce8460de2ac56fb9d95167217
+ size 14244
checkpoint-2842/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88a2612a056def0df3ae3c082fb47fd5b4944d59eecf3efb96824e8304ff100e
+ size 1064
checkpoint-2842/trainer_state.json ADDED
@@ -0,0 +1,546 @@
+ {
+ "best_metric": 5.091330528259277,
+ "best_model_checkpoint": "aristotle_new_layer_plain/checkpoint-2436",
+ "epoch": 14.0,
+ "eval_steps": 500,
+ "global_step": 2842,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.24721878862793573,
+ "grad_norm": 4.709590435028076,
+ "learning_rate": 5e-06,
+ "loss": 10.2189,
+ "step": 50
+ },
+ {
+ "epoch": 0.49443757725587145,
+ "grad_norm": 3.6981208324432373,
+ "learning_rate": 1e-05,
+ "loss": 9.1727,
+ "step": 100
+ },
+ {
+ "epoch": 0.7416563658838071,
+ "grad_norm": 3.9341259002685547,
+ "learning_rate": 1.5e-05,
+ "loss": 8.6877,
+ "step": 150
+ },
+ {
+ "epoch": 0.9888751545117429,
+ "grad_norm": 3.341215133666992,
+ "learning_rate": 2e-05,
+ "loss": 8.1544,
+ "step": 200
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 7.7336225509643555,
+ "eval_runtime": 14.2669,
+ "eval_samples_per_second": 50.396,
+ "eval_steps_per_second": 6.308,
+ "step": 203
+ },
+ {
+ "epoch": 1.2323856613102595,
+ "grad_norm": 4.8560404777526855,
+ "learning_rate": 2.5e-05,
+ "loss": 7.5653,
+ "step": 250
+ },
+ {
+ "epoch": 1.4796044499381953,
+ "grad_norm": 3.1774024963378906,
+ "learning_rate": 3e-05,
+ "loss": 7.1093,
+ "step": 300
+ },
+ {
+ "epoch": 1.726823238566131,
+ "grad_norm": 4.33836030960083,
+ "learning_rate": 3.5e-05,
+ "loss": 6.7529,
+ "step": 350
+ },
+ {
+ "epoch": 1.9740420271940669,
+ "grad_norm": 2.5972180366516113,
+ "learning_rate": 4e-05,
+ "loss": 6.536,
+ "step": 400
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 6.426527500152588,
+ "eval_runtime": 14.2681,
+ "eval_samples_per_second": 50.392,
+ "eval_steps_per_second": 6.308,
+ "step": 406
+ },
+ {
+ "epoch": 2.2175525339925835,
+ "grad_norm": 3.43729567527771,
+ "learning_rate": 4.5e-05,
+ "loss": 6.3559,
+ "step": 450
+ },
+ {
+ "epoch": 2.464771322620519,
+ "grad_norm": 3.318251848220825,
+ "learning_rate": 5e-05,
+ "loss": 6.3251,
+ "step": 500
+ },
+ {
+ "epoch": 2.711990111248455,
+ "grad_norm": 3.502115488052368,
+ "learning_rate": 4.9550359712230215e-05,
+ "loss": 6.1368,
+ "step": 550
+ },
+ {
+ "epoch": 2.9592088998763906,
+ "grad_norm": 3.497938394546509,
+ "learning_rate": 4.9100719424460435e-05,
+ "loss": 6.0775,
+ "step": 600
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 6.036961555480957,
+ "eval_runtime": 14.2475,
+ "eval_samples_per_second": 50.465,
+ "eval_steps_per_second": 6.317,
+ "step": 609
+ },
+ {
+ "epoch": 3.202719406674907,
+ "grad_norm": 3.4355390071868896,
+ "learning_rate": 4.865107913669065e-05,
+ "loss": 5.8684,
+ "step": 650
+ },
+ {
+ "epoch": 3.449938195302843,
+ "grad_norm": 3.9220526218414307,
+ "learning_rate": 4.820143884892087e-05,
+ "loss": 5.8101,
+ "step": 700
+ },
+ {
+ "epoch": 3.6971569839307787,
+ "grad_norm": 3.782421827316284,
+ "learning_rate": 4.775179856115108e-05,
+ "loss": 5.7784,
+ "step": 750
+ },
+ {
+ "epoch": 3.9443757725587143,
+ "grad_norm": 3.5181567668914795,
+ "learning_rate": 4.7302158273381294e-05,
+ "loss": 5.7181,
+ "step": 800
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 5.772126197814941,
+ "eval_runtime": 14.2949,
+ "eval_samples_per_second": 50.298,
+ "eval_steps_per_second": 6.296,
+ "step": 812
+ },
+ {
+ "epoch": 4.187886279357231,
+ "grad_norm": 3.6087594032287598,
+ "learning_rate": 4.685251798561151e-05,
+ "loss": 5.5154,
+ "step": 850
+ },
+ {
+ "epoch": 4.435105067985167,
+ "grad_norm": 3.8448667526245117,
+ "learning_rate": 4.640287769784173e-05,
+ "loss": 5.4664,
+ "step": 900
+ },
+ {
+ "epoch": 4.6823238566131025,
+ "grad_norm": 3.594693660736084,
+ "learning_rate": 4.595323741007194e-05,
+ "loss": 5.4121,
+ "step": 950
+ },
+ {
+ "epoch": 4.929542645241038,
+ "grad_norm": 3.6225693225860596,
+ "learning_rate": 4.550359712230216e-05,
+ "loss": 5.3158,
+ "step": 1000
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 5.521895885467529,
+ "eval_runtime": 14.2777,
+ "eval_samples_per_second": 50.358,
+ "eval_steps_per_second": 6.304,
+ "step": 1015
+ },
+ {
+ "epoch": 5.173053152039555,
+ "grad_norm": 4.245815277099609,
+ "learning_rate": 4.505395683453237e-05,
+ "loss": 5.1744,
+ "step": 1050
+ },
+ {
+ "epoch": 5.420271940667491,
+ "grad_norm": 4.306251525878906,
+ "learning_rate": 4.460431654676259e-05,
+ "loss": 5.1245,
+ "step": 1100
+ },
+ {
+ "epoch": 5.667490729295427,
+ "grad_norm": 3.7834959030151367,
+ "learning_rate": 4.4154676258992806e-05,
+ "loss": 5.0729,
+ "step": 1150
+ },
+ {
+ "epoch": 5.914709517923362,
+ "grad_norm": 4.298359394073486,
+ "learning_rate": 4.3705035971223026e-05,
+ "loss": 5.0558,
+ "step": 1200
+ },
+ {
+ "epoch": 6.0,
+ "eval_loss": 5.389599323272705,
+ "eval_runtime": 14.2956,
+ "eval_samples_per_second": 50.295,
+ "eval_steps_per_second": 6.296,
+ "step": 1218
+ },
+ {
+ "epoch": 6.158220024721879,
+ "grad_norm": 4.416134357452393,
+ "learning_rate": 4.325539568345324e-05,
+ "loss": 4.839,
+ "step": 1250
+ },
+ {
+ "epoch": 6.405438813349814,
+ "grad_norm": 4.565963268280029,
+ "learning_rate": 4.280575539568346e-05,
+ "loss": 4.8297,
+ "step": 1300
+ },
+ {
+ "epoch": 6.652657601977751,
+ "grad_norm": 4.854921817779541,
+ "learning_rate": 4.235611510791367e-05,
+ "loss": 4.8175,
+ "step": 1350
+ },
+ {
+ "epoch": 6.899876390605686,
+ "grad_norm": 4.982056617736816,
+ "learning_rate": 4.1906474820143885e-05,
+ "loss": 4.8081,
+ "step": 1400
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 5.254246711730957,
+ "eval_runtime": 14.2665,
+ "eval_samples_per_second": 50.398,
+ "eval_steps_per_second": 6.308,
+ "step": 1421
+ },
+ {
+ "epoch": 7.143386897404203,
+ "grad_norm": 4.195478439331055,
+ "learning_rate": 4.14568345323741e-05,
+ "loss": 4.6322,
+ "step": 1450
+ },
+ {
+ "epoch": 7.3906056860321385,
+ "grad_norm": 4.963181972503662,
+ "learning_rate": 4.100719424460432e-05,
+ "loss": 4.547,
+ "step": 1500
+ },
+ {
+ "epoch": 7.637824474660074,
+ "grad_norm": 5.290962219238281,
+ "learning_rate": 4.055755395683453e-05,
+ "loss": 4.5553,
+ "step": 1550
+ },
+ {
+ "epoch": 7.88504326328801,
+ "grad_norm": 5.0038838386535645,
+ "learning_rate": 4.010791366906475e-05,
+ "loss": 4.5651,
+ "step": 1600
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 5.183382987976074,
+ "eval_runtime": 14.2977,
+ "eval_samples_per_second": 50.288,
+ "eval_steps_per_second": 6.295,
+ "step": 1624
+ },
+ {
+ "epoch": 8.128553770086526,
+ "grad_norm": 5.3380446434021,
+ "learning_rate": 3.965827338129496e-05,
+ "loss": 4.3966,
+ "step": 1650
+ },
+ {
+ "epoch": 8.375772558714463,
+ "grad_norm": 5.339470863342285,
+ "learning_rate": 3.920863309352518e-05,
+ "loss": 4.3068,
+ "step": 1700
+ },
+ {
+ "epoch": 8.622991347342397,
+ "grad_norm": 4.9476189613342285,
+ "learning_rate": 3.8758992805755396e-05,
+ "loss": 4.3249,
+ "step": 1750
+ },
+ {
+ "epoch": 8.870210135970334,
+ "grad_norm": 5.430028915405273,
+ "learning_rate": 3.8309352517985616e-05,
+ "loss": 4.3407,
+ "step": 1800
+ },
+ {
+ "epoch": 9.0,
+ "eval_loss": 5.13620138168335,
+ "eval_runtime": 14.2616,
+ "eval_samples_per_second": 50.415,
+ "eval_steps_per_second": 6.311,
+ "step": 1827
+ },
+ {
+ "epoch": 9.11372064276885,
+ "grad_norm": 5.2561259269714355,
+ "learning_rate": 3.785971223021583e-05,
+ "loss": 4.1746,
+ "step": 1850
+ },
+ {
+ "epoch": 9.360939431396787,
+ "grad_norm": 5.811314105987549,
+ "learning_rate": 3.741007194244605e-05,
+ "loss": 4.1324,
+ "step": 1900
+ },
+ {
+ "epoch": 9.608158220024722,
+ "grad_norm": 5.552155017852783,
+ "learning_rate": 3.696043165467626e-05,
+ "loss": 4.1058,
+ "step": 1950
+ },
+ {
+ "epoch": 9.855377008652658,
+ "grad_norm": 6.073920726776123,
+ "learning_rate": 3.6510791366906475e-05,
+ "loss": 4.0436,
+ "step": 2000
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 5.104895114898682,
+ "eval_runtime": 14.2556,
+ "eval_samples_per_second": 50.436,
+ "eval_steps_per_second": 6.313,
+ "step": 2030
+ },
+ {
+ "epoch": 10.098887515451175,
+ "grad_norm": 5.994938373565674,
+ "learning_rate": 3.606115107913669e-05,
+ "loss": 3.99,
+ "step": 2050
+ },
+ {
+ "epoch": 10.34610630407911,
+ "grad_norm": 6.414961814880371,
+ "learning_rate": 3.561151079136691e-05,
+ "loss": 3.9013,
+ "step": 2100
+ },
+ {
+ "epoch": 10.593325092707046,
+ "grad_norm": 6.1248459815979,
+ "learning_rate": 3.516187050359712e-05,
+ "loss": 3.8884,
+ "step": 2150
+ },
+ {
+ "epoch": 10.840543881334982,
+ "grad_norm": 5.360867500305176,
+ "learning_rate": 3.471223021582734e-05,
+ "loss": 3.877,
+ "step": 2200
+ },
+ {
+ "epoch": 11.0,
+ "eval_loss": 5.103781700134277,
+ "eval_runtime": 14.2705,
+ "eval_samples_per_second": 50.384,
+ "eval_steps_per_second": 6.307,
+ "step": 2233
+ },
+ {
+ "epoch": 11.084054388133499,
+ "grad_norm": 5.840531349182129,
+ "learning_rate": 3.4262589928057554e-05,
+ "loss": 3.8216,
+ "step": 2250
+ },
+ {
+ "epoch": 11.331273176761433,
+ "grad_norm": 7.407821178436279,
+ "learning_rate": 3.3812949640287773e-05,
+ "loss": 3.6379,
+ "step": 2300
+ },
+ {
+ "epoch": 11.57849196538937,
+ "grad_norm": 7.770689487457275,
+ "learning_rate": 3.3363309352517986e-05,
+ "loss": 3.7063,
+ "step": 2350
+ },
+ {
+ "epoch": 11.825710754017305,
+ "grad_norm": 6.17808198928833,
+ "learning_rate": 3.2913669064748206e-05,
+ "loss": 3.7008,
+ "step": 2400
+ },
+ {
+ "epoch": 12.0,
+ "eval_loss": 5.091330528259277,
+ "eval_runtime": 14.3396,
+ "eval_samples_per_second": 50.141,
+ "eval_steps_per_second": 6.276,
+ "step": 2436
+ },
+ {
+ "epoch": 12.069221260815821,
+ "grad_norm": 6.636974334716797,
+ "learning_rate": 3.246402877697842e-05,
+ "loss": 3.6062,
+ "step": 2450
+ },
+ {
+ "epoch": 12.316440049443758,
+ "grad_norm": 6.129552364349365,
+ "learning_rate": 3.201438848920863e-05,
+ "loss": 3.4693,
+ "step": 2500
+ },
+ {
+ "epoch": 12.563658838071694,
+ "grad_norm": 7.458967208862305,
+ "learning_rate": 3.1564748201438845e-05,
+ "loss": 3.507,
+ "step": 2550
+ },
+ {
+ "epoch": 12.810877626699629,
+ "grad_norm": 6.7472243309021,
+ "learning_rate": 3.1115107913669065e-05,
+ "loss": 3.5042,
+ "step": 2600
+ },
+ {
+ "epoch": 13.0,
+ "eval_loss": 5.1072187423706055,
+ "eval_runtime": 14.225,
+ "eval_samples_per_second": 50.545,
+ "eval_steps_per_second": 6.327,
+ "step": 2639
+ },
+ {
+ "epoch": 13.054388133498145,
+ "grad_norm": 6.99562406539917,
+ "learning_rate": 3.066546762589928e-05,
+ "loss": 3.4307,
+ "step": 2650
+ },
+ {
+ "epoch": 13.301606922126082,
+ "grad_norm": 7.893692493438721,
+ "learning_rate": 3.0215827338129498e-05,
+ "loss": 3.2852,
+ "step": 2700
+ },
+ {
+ "epoch": 13.548825710754016,
+ "grad_norm": 7.662129878997803,
+ "learning_rate": 2.976618705035971e-05,
+ "loss": 3.3116,
+ "step": 2750
+ },
+ {
+ "epoch": 13.796044499381953,
+ "grad_norm": 7.612554550170898,
+ "learning_rate": 2.931654676258993e-05,
+ "loss": 3.3243,
+ "step": 2800
+ },
+ {
+ "epoch": 14.0,
+ "eval_loss": 5.115802764892578,
+ "eval_runtime": 14.2589,
+ "eval_samples_per_second": 50.425,
+ "eval_steps_per_second": 6.312,
+ "step": 2842
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 6060,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 30,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "EarlyStoppingCallback": {
+ "args": {
+ "early_stopping_patience": 3,
+ "early_stopping_threshold": 0.0
+ },
+ "attributes": {
+ "early_stopping_patience_counter": 2
+ }
+ },
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.3660516081664e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-2842/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28ec087a0e6786ef2e0bcf02310b767379b6de459d7cb254c0c6ae4e881ba0e1
+ size 5304
checkpoint-3045/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_name_or_path": "ai-forever/rugpt3small_based_on_gpt2",
+ "activation_function": "gelu_new",
+ "architectures": [
+ "GPT2LMHeadModel"
+ ],
+ "attn_pdrop": 0.1,
+ "bos_token_id": 1,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 2,
+ "gradient_checkpointing": false,
+ "id2label": {
+ "0": "LABEL_0"
+ },
+ "initializer_range": 0.02,
+ "label2id": {
+ "LABEL_0": 0
+ },
+ "layer_norm_epsilon": 1e-05,
+ "model_type": "gpt2",
+ "n_ctx": 2048,
+ "n_embd": 768,
+ "n_head": 12,
+ "n_inner": null,
+ "n_layer": 12,
+ "n_positions": 2048,
+ "pad_token_id": 0,
+ "reorder_and_upcast_attn": false,
+ "resid_pdrop": 0.1,
+ "scale_attn_by_inverse_layer_idx": false,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.48.3",
+ "use_cache": true,
+ "vocab_size": 50264
+ }
checkpoint-3045/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "transformers_version": "4.48.3"
+ }
checkpoint-3045/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a77765d2854aa28043621c32dfe457dbd70e5ccd3c462b7da20e24e1321bedfe
+ size 500941440
checkpoint-3045/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90e0b26d5997481a5036e1cbf6a55f2d7133a8568f6cc47c3b00ef57de5118ec
+ size 1389114
checkpoint-3045/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a06712f07314b639891611e3372c71ae4f05e3f52fb233051f2b1da8dbc95c5c
+ size 14244
checkpoint-3045/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b973dad44b1d1fff3b9930c7bd03223dc77c72636ac760061bf865c110dc04db
+ size 1064
checkpoint-3045/trainer_state.json ADDED
@@ -0,0 +1,582 @@
+ {
+ "best_metric": 5.091330528259277,
+ "best_model_checkpoint": "aristotle_new_layer_plain/checkpoint-2436",
+ "epoch": 15.0,
+ "eval_steps": 500,
+ "global_step": 3045,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.24721878862793573,
+ "grad_norm": 4.709590435028076,
+ "learning_rate": 5e-06,
+ "loss": 10.2189,
+ "step": 50
+ },
+ {
+ "epoch": 0.49443757725587145,
+ "grad_norm": 3.6981208324432373,
+ "learning_rate": 1e-05,
+ "loss": 9.1727,
+ "step": 100
+ },
+ {
+ "epoch": 0.7416563658838071,
+ "grad_norm": 3.9341259002685547,
+ "learning_rate": 1.5e-05,
+ "loss": 8.6877,
+ "step": 150
+ },
+ {
+ "epoch": 0.9888751545117429,
+ "grad_norm": 3.341215133666992,
+ "learning_rate": 2e-05,
+ "loss": 8.1544,
+ "step": 200
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 7.7336225509643555,
+ "eval_runtime": 14.2669,
+ "eval_samples_per_second": 50.396,
+ "eval_steps_per_second": 6.308,
+ "step": 203
+ },
+ {
+ "epoch": 1.2323856613102595,
+ "grad_norm": 4.8560404777526855,
+ "learning_rate": 2.5e-05,
+ "loss": 7.5653,
+ "step": 250
+ },
+ {
+ "epoch": 1.4796044499381953,
+ "grad_norm": 3.1774024963378906,
+ "learning_rate": 3e-05,
+ "loss": 7.1093,
+ "step": 300
+ },
+ {
+ "epoch": 1.726823238566131,
+ "grad_norm": 4.33836030960083,
+ "learning_rate": 3.5e-05,
+ "loss": 6.7529,
+ "step": 350
+ },
+ {
+ "epoch": 1.9740420271940669,
+ "grad_norm": 2.5972180366516113,
+ "learning_rate": 4e-05,
+ "loss": 6.536,
+ "step": 400
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 6.426527500152588,
+ "eval_runtime": 14.2681,
+ "eval_samples_per_second": 50.392,
+ "eval_steps_per_second": 6.308,
+ "step": 406
+ },
+ {
+ "epoch": 2.2175525339925835,
+ "grad_norm": 3.43729567527771,
+ "learning_rate": 4.5e-05,
+ "loss": 6.3559,
+ "step": 450
+ },
+ {
+ "epoch": 2.464771322620519,
+ "grad_norm": 3.318251848220825,
+ "learning_rate": 5e-05,
+ "loss": 6.3251,
+ "step": 500
+ },
+ {
+ "epoch": 2.711990111248455,
+ "grad_norm": 3.502115488052368,
+ "learning_rate": 4.9550359712230215e-05,
+ "loss": 6.1368,
+ "step": 550
+ },
+ {
+ "epoch": 2.9592088998763906,
+ "grad_norm": 3.497938394546509,
+ "learning_rate": 4.9100719424460435e-05,
+ "loss": 6.0775,
+ "step": 600
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 6.036961555480957,
+ "eval_runtime": 14.2475,
+ "eval_samples_per_second": 50.465,
+ "eval_steps_per_second": 6.317,
+ "step": 609
+ },
+ {
+ "epoch": 3.202719406674907,
+ "grad_norm": 3.4355390071868896,
+ "learning_rate": 4.865107913669065e-05,
+ "loss": 5.8684,
+ "step": 650
+ },
+ {
+ "epoch": 3.449938195302843,
+ "grad_norm": 3.9220526218414307,
+ "learning_rate": 4.820143884892087e-05,
+ "loss": 5.8101,
+ "step": 700
+ },
+ {
+ "epoch": 3.6971569839307787,
+ "grad_norm": 3.782421827316284,
+ "learning_rate": 4.775179856115108e-05,
+ "loss": 5.7784,
+ "step": 750
+ },
+ {
+ "epoch": 3.9443757725587143,
+ "grad_norm": 3.5181567668914795,
+ "learning_rate": 4.7302158273381294e-05,
+ "loss": 5.7181,
+ "step": 800
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 5.772126197814941,
+ "eval_runtime": 14.2949,
+ "eval_samples_per_second": 50.298,
+ "eval_steps_per_second": 6.296,
+ "step": 812
+ },
+ {
+ "epoch": 4.187886279357231,
+ "grad_norm": 3.6087594032287598,
+ "learning_rate": 4.685251798561151e-05,
+ "loss": 5.5154,
+ "step": 850
+ },
+ {
+ "epoch": 4.435105067985167,
+ "grad_norm": 3.8448667526245117,
+ "learning_rate": 4.640287769784173e-05,
+ "loss": 5.4664,
+ "step": 900
+ },
+ {
+ "epoch": 4.6823238566131025,
+ "grad_norm": 3.594693660736084,
+ "learning_rate": 4.595323741007194e-05,
+ "loss": 5.4121,
+ "step": 950
+ },
+ {
+ "epoch": 4.929542645241038,
+ "grad_norm": 3.6225693225860596,
+ "learning_rate": 4.550359712230216e-05,
+ "loss": 5.3158,
+ "step": 1000
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 5.521895885467529,
+ "eval_runtime": 14.2777,
+ "eval_samples_per_second": 50.358,
+ "eval_steps_per_second": 6.304,
+ "step": 1015
+ },
+ {
+ "epoch": 5.173053152039555,
+ "grad_norm": 4.245815277099609,
+ "learning_rate": 4.505395683453237e-05,
+ "loss": 5.1744,
+ "step": 1050
+ },
+ {
+ "epoch": 5.420271940667491,
+ "grad_norm": 4.306251525878906,
+ "learning_rate": 4.460431654676259e-05,
+ "loss": 5.1245,
+ "step": 1100
+ },
+ {
+ "epoch": 5.667490729295427,
+ "grad_norm": 3.7834959030151367,
+ "learning_rate": 4.4154676258992806e-05,
+ "loss": 5.0729,
+ "step": 1150
+ },
+ {
+ "epoch": 5.914709517923362,
+ "grad_norm": 4.298359394073486,
+ "learning_rate": 4.3705035971223026e-05,
+ "loss": 5.0558,
+ "step": 1200
+ },
+ {
+ "epoch": 6.0,
+ "eval_loss": 5.389599323272705,
+ "eval_runtime": 14.2956,
+ "eval_samples_per_second": 50.295,
+ "eval_steps_per_second": 6.296,
+ "step": 1218
+ },
+ {
+ "epoch": 6.158220024721879,
+ "grad_norm": 4.416134357452393,
+ "learning_rate": 4.325539568345324e-05,
+ "loss": 4.839,
+ "step": 1250
+ },
+ {
+ "epoch": 6.405438813349814,
+ "grad_norm": 4.565963268280029,
+ "learning_rate": 4.280575539568346e-05,
+ "loss": 4.8297,
+ "step": 1300
+ },
+ {
+ "epoch": 6.652657601977751,
+ "grad_norm": 4.854921817779541,
+ "learning_rate": 4.235611510791367e-05,
+ "loss": 4.8175,
+ "step": 1350
+ },
+ {
+ "epoch": 6.899876390605686,
+ "grad_norm": 4.982056617736816,
+ "learning_rate": 4.1906474820143885e-05,
+ "loss": 4.8081,
+ "step": 1400
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 5.254246711730957,
+ "eval_runtime": 14.2665,
+ "eval_samples_per_second": 50.398,
+ "eval_steps_per_second": 6.308,
+ "step": 1421
+ },
+ {
+ "epoch": 7.143386897404203,
+ "grad_norm": 4.195478439331055,
+ "learning_rate": 4.14568345323741e-05,
+ "loss": 4.6322,
+ "step": 1450
+ },
+ {
+ "epoch": 7.3906056860321385,
+ "grad_norm": 4.963181972503662,
+ "learning_rate": 4.100719424460432e-05,
+ "loss": 4.547,
+ "step": 1500
+ },
+ {
+ "epoch": 7.637824474660074,
+ "grad_norm": 5.290962219238281,
+ "learning_rate": 4.055755395683453e-05,
+ "loss": 4.5553,
+ "step": 1550
+ },
+ {
+ "epoch": 7.88504326328801,
+ "grad_norm": 5.0038838386535645,
+ "learning_rate": 4.010791366906475e-05,
+ "loss": 4.5651,
+ "step": 1600
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 5.183382987976074,
+ "eval_runtime": 14.2977,
+ "eval_samples_per_second": 50.288,
+ "eval_steps_per_second": 6.295,
+ "step": 1624
+ },
+ {
+ "epoch": 8.128553770086526,
+ "grad_norm": 5.3380446434021,
+ "learning_rate": 3.965827338129496e-05,
+ "loss": 4.3966,
+ "step": 1650
+ },
+ {
+ "epoch": 8.375772558714463,
+ "grad_norm": 5.339470863342285,
+ "learning_rate": 3.920863309352518e-05,
+ "loss": 4.3068,
+ "step": 1700
+ },
+ {
+ "epoch": 8.622991347342397,
+ "grad_norm": 4.9476189613342285,
+ "learning_rate": 3.8758992805755396e-05,
+ "loss": 4.3249,
+ "step": 1750
+ },
+ {
+ "epoch": 8.870210135970334,
+ "grad_norm": 5.430028915405273,
+ "learning_rate": 3.8309352517985616e-05,
+ "loss": 4.3407,
+ "step": 1800
+ },
+ {
+ "epoch": 9.0,
+ "eval_loss": 5.13620138168335,
+ "eval_runtime": 14.2616,
+ "eval_samples_per_second": 50.415,
+ "eval_steps_per_second": 6.311,
+ "step": 1827
+ },
+ {
+ "epoch": 9.11372064276885,
+ "grad_norm": 5.2561259269714355,
+ "learning_rate": 3.785971223021583e-05,
+ "loss": 4.1746,
+ "step": 1850
+ },
+ {
+ "epoch": 9.360939431396787,
+ "grad_norm": 5.811314105987549,
+ "learning_rate": 3.741007194244605e-05,
+ "loss": 4.1324,
+ "step": 1900
+ },
+ {
+ "epoch": 9.608158220024722,
+ "grad_norm": 5.552155017852783,
+ "learning_rate": 3.696043165467626e-05,
+ "loss": 4.1058,
+ "step": 1950
+ },
+ {
+ "epoch": 9.855377008652658,
+ "grad_norm": 6.073920726776123,
+ "learning_rate": 3.6510791366906475e-05,
+ "loss": 4.0436,
+ "step": 2000
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 5.104895114898682,
+ "eval_runtime": 14.2556,
+ "eval_samples_per_second": 50.436,
+ "eval_steps_per_second": 6.313,
+ "step": 2030
+ },
+ {
+ "epoch": 10.098887515451175,
+ "grad_norm": 5.994938373565674,
+ "learning_rate": 3.606115107913669e-05,
+ "loss": 3.99,
+ "step": 2050
+ },
+ {
+ "epoch": 10.34610630407911,
+ "grad_norm": 6.414961814880371,
+ "learning_rate": 3.561151079136691e-05,
+ "loss": 3.9013,
+ "step": 2100
+ },
+ {
+ "epoch": 10.593325092707046,
+ "grad_norm": 6.1248459815979,
+ "learning_rate": 3.516187050359712e-05,
+ "loss": 3.8884,
+ "step": 2150
+ },
+ {
+ "epoch": 10.840543881334982,
+ "grad_norm": 5.360867500305176,
+ "learning_rate": 3.471223021582734e-05,
+ "loss": 3.877,
+ "step": 2200
+ },
+ {
+ "epoch": 11.0,
+ "eval_loss": 5.103781700134277,
+ "eval_runtime": 14.2705,
+ "eval_samples_per_second": 50.384,
+ "eval_steps_per_second": 6.307,
+ "step": 2233
+ },
+ {
+ "epoch": 11.084054388133499,
+ "grad_norm": 5.840531349182129,
+ "learning_rate": 3.4262589928057554e-05,
+ "loss": 3.8216,
+ "step": 2250
+ },
+ {
+ "epoch": 11.331273176761433,
+ "grad_norm": 7.407821178436279,
+ "learning_rate": 3.3812949640287773e-05,
+ "loss": 3.6379,
+ "step": 2300
+ },
+ {
+ "epoch": 11.57849196538937,
+ "grad_norm": 7.770689487457275,
+ "learning_rate": 3.3363309352517986e-05,
+ "loss": 3.7063,
+ "step": 2350
+ },
+ {
+ "epoch": 11.825710754017305,
+ "grad_norm": 6.17808198928833,
+ "learning_rate": 3.2913669064748206e-05,
+ "loss": 3.7008,
+ "step": 2400
+ },
+ {
+ "epoch": 12.0,
+ "eval_loss": 5.091330528259277,
+ "eval_runtime": 14.3396,
+ "eval_samples_per_second": 50.141,
+ "eval_steps_per_second": 6.276,
+ "step": 2436
+ },
+ {
+ "epoch": 12.069221260815821,
+ "grad_norm": 6.636974334716797,
+ "learning_rate": 3.246402877697842e-05,
+ "loss": 3.6062,
+ "step": 2450
+ },
+ {
+ "epoch": 12.316440049443758,
+ "grad_norm": 6.129552364349365,
+ "learning_rate": 3.201438848920863e-05,
+ "loss": 3.4693,
+ "step": 2500
+ },
+ {
+ "epoch": 12.563658838071694,
+ "grad_norm": 7.458967208862305,
+ "learning_rate": 3.1564748201438845e-05,
+ "loss": 3.507,
+ "step": 2550
+ },
+ {
+ "epoch": 12.810877626699629,
+ "grad_norm": 6.7472243309021,
+ "learning_rate": 3.1115107913669065e-05,
+ "loss": 3.5042,
+ "step": 2600
+ },
+ {
+ "epoch": 13.0,
+ "eval_loss": 5.1072187423706055,
+ "eval_runtime": 14.225,
+ "eval_samples_per_second": 50.545,
+ "eval_steps_per_second": 6.327,
+ "step": 2639
+ },
+ {
+ "epoch": 13.054388133498145,
+ "grad_norm": 6.99562406539917,
+ "learning_rate": 3.066546762589928e-05,
+ "loss": 3.4307,
+ "step": 2650
+ },
+ {
+ "epoch": 13.301606922126082,
+ "grad_norm": 7.893692493438721,
+ "learning_rate": 3.0215827338129498e-05,
+ "loss": 3.2852,
+ "step": 2700
+ },
+ {
+ "epoch": 13.548825710754016,
+ "grad_norm": 7.662129878997803,
+ "learning_rate": 2.976618705035971e-05,
+ "loss": 3.3116,
+ "step": 2750
+ },
+ {
+ "epoch": 13.796044499381953,
+ "grad_norm": 7.612554550170898,
+ "learning_rate": 2.931654676258993e-05,
+ "loss": 3.3243,
+ "step": 2800
+ },
+ {
+ "epoch": 14.0,
+ "eval_loss": 5.115802764892578,
+ "eval_runtime": 14.2589,
+ "eval_samples_per_second": 50.425,
+ "eval_steps_per_second": 6.312,
+ "step": 2842
+ },
+ {
+ "epoch": 14.03955500618047,
+ "grad_norm": 6.775301456451416,
+ "learning_rate": 2.8866906474820144e-05,
+ "loss": 3.2555,
+ "step": 2850
+ },
+ {
+ "epoch": 14.286773794808406,
+ "grad_norm": 7.874576091766357,
+ "learning_rate": 2.841726618705036e-05,
+ "loss": 3.075,
+ "step": 2900
+ },
+ {
+ "epoch": 14.53399258343634,
+ "grad_norm": 7.965319633483887,
+ "learning_rate": 2.7967625899280573e-05,
+ "loss": 3.177,
+ "step": 2950
+ },
+ {
+ "epoch": 14.781211372064277,
+ "grad_norm": 7.535608291625977,
+ "learning_rate": 2.7517985611510793e-05,
+ "loss": 3.14,
+ "step": 3000
+ },
+ {
+ "epoch": 15.0,
+ "eval_loss": 5.132053375244141,
+ "eval_runtime": 14.2378,
+ "eval_samples_per_second": 50.499,
+ "eval_steps_per_second": 6.321,
+ "step": 3045
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 6060,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 30,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "EarlyStoppingCallback": {
+ "args": {
+ "early_stopping_patience": 3,
+ "early_stopping_threshold": 0.0
+ },
+ "attributes": {
+ "early_stopping_patience_counter": 3
+ }
+ },
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.535055294464e+16,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-3045/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28ec087a0e6786ef2e0bcf02310b767379b6de459d7cb254c0c6ae4e881ba0e1
+ size 5304
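
The folders above are standard Hugging Face Trainer checkpoints for a GPT2LMHeadModel fine-tuned from ai-forever/rugpt3small_based_on_gpt2; trainer_state.json records checkpoint-2436 (eval_loss ≈ 5.091) as the best model, with early stopping ending training after epoch 15. A minimal Python sketch for loading that checkpoint for generation, assuming the folder has been downloaded locally and that the tokenizer is taken from the base model (this commit itself contains no tokenizer files):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Tokenizer from the base model; the checkpoint folders ship no tokenizer files.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/rugpt3small_based_on_gpt2")
# Local path to the downloaded best checkpoint (config.json + model.safetensors).
model = AutoModelForCausalLM.from_pretrained("checkpoint-2436")

prompt = "Аристотель"  # arbitrary example prompt
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))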